1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2020 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 #include "intl.h"
75 #include "expmed.h"
76 #include "function-abi.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
86 {
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
89
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
100
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
103
104 union
105 {
106 /* For MOV and MVN. */
107 struct
108 {
109 /* The value of each element. */
110 rtx value;
111
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
117
118 /* For INDEX. */
119 struct
120 {
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
125
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
129 };
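/* For instance (purely as an illustration of the fields above), a vector
   whose 16-bit integer elements all equal 0x5500 could be described as
   simd_immediate_info (HImode, 0x55, MOV, LSL, 8): move the value 0x55
   into every element, shifted left by 8 bits.  */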
130
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
136 {
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
140 }
141
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
151 {
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
155 }
156
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
162 {
163 u.index.base = base_in;
164 u.index.step = step_in;
165 }
166
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174 u.pattern = pattern_in;
175 }
176
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel;
179
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg;
182
183 #ifdef HAVE_AS_TLS
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
186 #endif
187
188 static bool aarch64_composite_type_p (const_tree, machine_mode);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
190 const_tree,
191 machine_mode *, int *,
192 bool *);
193 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode);
197 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
199 const_tree type,
200 int misalignment,
201 bool is_packed);
202 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
203 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
204 aarch64_addr_query_type);
205 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
206
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version;
209
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune = cortexa53;
212
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags = 0;
215
216 /* Global flag for PC relative loads. */
217 bool aarch64_pcrelative_literal_loads;
218
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer;
221
222 #define BRANCH_PROTECT_STR_MAX 255
223 char *accepted_branch_protection_string = NULL;
224
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
227
228 /* Support for command line parsing of boolean flags in the tuning
229 structures. */
230 struct aarch64_flag_desc
231 {
232 const char* name;
233 unsigned int flag;
234 };
235
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
239 {
240 { "none", AARCH64_FUSE_NOTHING },
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL },
243 { NULL, AARCH64_FUSE_NOTHING }
244 };
245
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
249 {
250 { "none", AARCH64_EXTRA_TUNE_NONE },
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL },
253 { NULL, AARCH64_EXTRA_TUNE_NONE }
254 };
255
256 /* Tuning parameters. */
257
258 static const struct cpu_addrcost_table generic_addrcost_table =
259 {
260 {
261 1, /* hi */
262 0, /* si */
263 0, /* di */
264 1, /* ti */
265 },
266 0, /* pre_modify */
267 0, /* post_modify */
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
271 0 /* imm_offset */
272 };
273
274 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 {
276 {
277 0, /* hi */
278 0, /* si */
279 0, /* di */
280 2, /* ti */
281 },
282 0, /* pre_modify */
283 0, /* post_modify */
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
287 0, /* imm_offset */
288 };
289
290 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 {
292 {
293 1, /* hi */
294 0, /* si */
295 0, /* di */
296 1, /* ti */
297 },
298 1, /* pre_modify */
299 1, /* post_modify */
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
303 0, /* imm_offset */
304 };
305
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 {
308 {
309 1, /* hi */
310 1, /* si */
311 1, /* di */
312 2, /* ti */
313 },
314 0, /* pre_modify */
315 0, /* post_modify */
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
319 0, /* imm_offset */
320 };
321
322 static const struct cpu_addrcost_table tsv110_addrcost_table =
323 {
324 {
325 1, /* hi */
326 0, /* si */
327 0, /* di */
328 1, /* ti */
329 },
330 0, /* pre_modify */
331 0, /* post_modify */
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
335 0, /* imm_offset */
336 };
337
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
339 {
340 {
341 1, /* hi */
342 1, /* si */
343 1, /* di */
344 2, /* ti */
345 },
346 1, /* pre_modify */
347 1, /* post_modify */
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
351 2, /* imm_offset */
352 };
353
354 static const struct cpu_regmove_cost generic_regmove_cost =
355 {
356 1, /* GP2GP */
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
359 5, /* GP2FP */
360 5, /* FP2GP */
361 2 /* FP2FP */
362 };
363
364 static const struct cpu_regmove_cost cortexa57_regmove_cost =
365 {
366 1, /* GP2GP */
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
369 5, /* GP2FP */
370 5, /* FP2GP */
371 2 /* FP2FP */
372 };
373
374 static const struct cpu_regmove_cost cortexa53_regmove_cost =
375 {
376 1, /* GP2GP */
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
379 5, /* GP2FP */
380 5, /* FP2GP */
381 2 /* FP2FP */
382 };
383
384 static const struct cpu_regmove_cost exynosm1_regmove_cost =
385 {
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual, 4 and 9). */
389 9, /* GP2FP */
390 9, /* FP2GP */
391 1 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost thunderx_regmove_cost =
395 {
396 2, /* GP2GP */
397 2, /* GP2FP */
398 6, /* FP2GP */
399 4 /* FP2FP */
400 };
401
402 static const struct cpu_regmove_cost xgene1_regmove_cost =
403 {
404 1, /* GP2GP */
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 2 /* FP2FP */
410 };
411
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
413 {
414 2, /* GP2GP */
415 /* Avoid the use of int<->fp moves for spilling. */
416 6, /* GP2FP */
417 6, /* FP2GP */
418 4 /* FP2FP */
419 };
420
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
422 {
423 1, /* GP2GP */
424 /* Avoid the use of int<->fp moves for spilling. */
425 8, /* GP2FP */
426 8, /* FP2GP */
427 4 /* FP2FP */
428 };
429
430 static const struct cpu_regmove_cost tsv110_regmove_cost =
431 {
432 1, /* GP2GP */
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
435 2, /* GP2FP */
436 3, /* FP2GP */
437 2 /* FP2FP */
438 };
439
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost =
442 {
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 2, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
458 };
459
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost =
462 {
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
478 };
479
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost =
482 {
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
498 };
499
500 static const struct cpu_vector_cost tsv110_vector_cost =
501 {
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
517 };
518
519 /* Generic costs for vector insn classes. */
520 static const struct cpu_vector_cost cortexa57_vector_cost =
521 {
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
537 };
538
539 static const struct cpu_vector_cost exynosm1_vector_cost =
540 {
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
556 };
557
558 /* Generic costs for vector insn classes. */
559 static const struct cpu_vector_cost xgene1_vector_cost =
560 {
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
576 };
577
578 /* Costs for vector insn classes for Vulcan. */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost =
580 {
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 10, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
596 };
597
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost =
600 {
601 1, /* Predictable. */
602 3 /* Unpredictable. */
603 };
604
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes =
607 {
608 AARCH64_APPROX_NONE, /* division */
609 AARCH64_APPROX_NONE, /* sqrt */
610 AARCH64_APPROX_NONE /* recip_sqrt */
611 };
612
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes =
615 {
616 AARCH64_APPROX_NONE, /* division */
617 AARCH64_APPROX_ALL, /* sqrt */
618 AARCH64_APPROX_ALL /* recip_sqrt */
619 };
620
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes =
623 {
624 AARCH64_APPROX_NONE, /* division */
625 AARCH64_APPROX_NONE, /* sqrt */
626 AARCH64_APPROX_ALL /* recip_sqrt */
627 };
628
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune =
631 {
632 0, /* num_slots */
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
639 };
640
641 static const cpu_prefetch_tune exynosm1_prefetch_tune =
642 {
643 0, /* num_slots */
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
650 };
651
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
653 {
654 4, /* num_slots */
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
661 };
662
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
664 {
665 8, /* num_slots */
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
672 };
673
674 static const cpu_prefetch_tune thunderx_prefetch_tune =
675 {
676 8, /* num_slots */
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
683 };
684
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
686 {
687 8, /* num_slots */
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
694 };
695
696 static const cpu_prefetch_tune tsv110_prefetch_tune =
697 {
698 0, /* num_slots */
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
705 };
706
707 static const cpu_prefetch_tune xgene1_prefetch_tune =
708 {
709 8, /* num_slots */
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
716 };
717
718 static const struct tune_params generic_tunings =
719 {
720 &cortexa57_extra_costs,
721 &generic_addrcost_table,
722 &generic_regmove_cost,
723 &generic_vector_cost,
724 &generic_branch_cost,
725 &generic_approx_modes,
726 SVE_NOT_IMPLEMENTED, /* sve_width */
727 4, /* memmov_cost */
728 2, /* issue_rate */
729 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
741 &generic_prefetch_tune
742 };
743
744 static const struct tune_params cortexa35_tunings =
745 {
746 &cortexa53_extra_costs,
747 &generic_addrcost_table,
748 &cortexa53_regmove_cost,
749 &generic_vector_cost,
750 &generic_branch_cost,
751 &generic_approx_modes,
752 SVE_NOT_IMPLEMENTED, /* sve_width */
753 4, /* memmov_cost */
754 1, /* issue_rate */
755 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
768 &generic_prefetch_tune
769 };
770
771 static const struct tune_params cortexa53_tunings =
772 {
773 &cortexa53_extra_costs,
774 &generic_addrcost_table,
775 &cortexa53_regmove_cost,
776 &generic_vector_cost,
777 &generic_branch_cost,
778 &generic_approx_modes,
779 SVE_NOT_IMPLEMENTED, /* sve_width */
780 4, /* memmov_cost */
781 2, /* issue_rate */
782 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
796 };
797
798 static const struct tune_params cortexa57_tunings =
799 {
800 &cortexa57_extra_costs,
801 &generic_addrcost_table,
802 &cortexa57_regmove_cost,
803 &cortexa57_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 SVE_NOT_IMPLEMENTED, /* sve_width */
807 4, /* memmov_cost */
808 3, /* issue_rate */
809 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
822 &generic_prefetch_tune
823 };
824
825 static const struct tune_params cortexa72_tunings =
826 {
827 &cortexa57_extra_costs,
828 &generic_addrcost_table,
829 &cortexa57_regmove_cost,
830 &cortexa57_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 SVE_NOT_IMPLEMENTED, /* sve_width */
834 4, /* memmov_cost */
835 3, /* issue_rate */
836 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
849 &generic_prefetch_tune
850 };
851
852 static const struct tune_params cortexa73_tunings =
853 {
854 &cortexa57_extra_costs,
855 &generic_addrcost_table,
856 &cortexa57_regmove_cost,
857 &cortexa57_vector_cost,
858 &generic_branch_cost,
859 &generic_approx_modes,
860 SVE_NOT_IMPLEMENTED, /* sve_width */
861 4, /* memmov_cost. */
862 2, /* issue_rate. */
863 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
876 &generic_prefetch_tune
877 };
878
879
880
881 static const struct tune_params exynosm1_tunings =
882 {
883 &exynosm1_extra_costs,
884 &exynosm1_addrcost_table,
885 &exynosm1_regmove_cost,
886 &exynosm1_vector_cost,
887 &generic_branch_cost,
888 &exynosm1_approx_modes,
889 SVE_NOT_IMPLEMENTED, /* sve_width */
890 4, /* memmov_cost */
891 3, /* issue_rate */
892 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
904 &exynosm1_prefetch_tune
905 };
906
907 static const struct tune_params thunderxt88_tunings =
908 {
909 &thunderx_extra_costs,
910 &generic_addrcost_table,
911 &thunderx_regmove_cost,
912 &thunderx_vector_cost,
913 &generic_branch_cost,
914 &generic_approx_modes,
915 SVE_NOT_IMPLEMENTED, /* sve_width */
916 6, /* memmov_cost */
917 2, /* issue_rate */
918 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
930 &thunderxt88_prefetch_tune
931 };
932
933 static const struct tune_params thunderx_tunings =
934 {
935 &thunderx_extra_costs,
936 &generic_addrcost_table,
937 &thunderx_regmove_cost,
938 &thunderx_vector_cost,
939 &generic_branch_cost,
940 &generic_approx_modes,
941 SVE_NOT_IMPLEMENTED, /* sve_width */
942 6, /* memmov_cost */
943 2, /* issue_rate */
944 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
957 &thunderx_prefetch_tune
958 };
959
960 static const struct tune_params tsv110_tunings =
961 {
962 &tsv110_extra_costs,
963 &tsv110_addrcost_table,
964 &tsv110_regmove_cost,
965 &tsv110_vector_cost,
966 &generic_branch_cost,
967 &generic_approx_modes,
968 SVE_NOT_IMPLEMENTED, /* sve_width */
969 4, /* memmov_cost */
970 4, /* issue_rate */
971 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
972 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
984 &tsv110_prefetch_tune
985 };
986
987 static const struct tune_params xgene1_tunings =
988 {
989 &xgene1_extra_costs,
990 &xgene1_addrcost_table,
991 &xgene1_regmove_cost,
992 &xgene1_vector_cost,
993 &generic_branch_cost,
994 &xgene1_approx_modes,
995 SVE_NOT_IMPLEMENTED, /* sve_width */
996 6, /* memmov_cost */
997 4, /* issue_rate */
998 AARCH64_FUSE_NOTHING, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1010 &xgene1_prefetch_tune
1011 };
1012
1013 static const struct tune_params emag_tunings =
1014 {
1015 &xgene1_extra_costs,
1016 &xgene1_addrcost_table,
1017 &xgene1_regmove_cost,
1018 &xgene1_vector_cost,
1019 &generic_branch_cost,
1020 &xgene1_approx_modes,
1021 SVE_NOT_IMPLEMENTED,
1022 6, /* memmov_cost */
1023 4, /* issue_rate */
1024 AARCH64_FUSE_NOTHING, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1036 &xgene1_prefetch_tune
1037 };
1038
1039 static const struct tune_params qdf24xx_tunings =
1040 {
1041 &qdf24xx_extra_costs,
1042 &qdf24xx_addrcost_table,
1043 &qdf24xx_regmove_cost,
1044 &qdf24xx_vector_cost,
1045 &generic_branch_cost,
1046 &generic_approx_modes,
1047 SVE_NOT_IMPLEMENTED, /* sve_width */
1048 4, /* memmov_cost */
1049 4, /* issue_rate */
1050 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1064 };
1065
1066 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1067 for now. */
1068 static const struct tune_params saphira_tunings =
1069 {
1070 &generic_extra_costs,
1071 &generic_addrcost_table,
1072 &generic_regmove_cost,
1073 &generic_vector_cost,
1074 &generic_branch_cost,
1075 &generic_approx_modes,
1076 SVE_NOT_IMPLEMENTED, /* sve_width */
1077 4, /* memmov_cost */
1078 4, /* issue_rate */
1079 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1092 &generic_prefetch_tune
1093 };
1094
1095 static const struct tune_params thunderx2t99_tunings =
1096 {
1097 &thunderx2t99_extra_costs,
1098 &thunderx2t99_addrcost_table,
1099 &thunderx2t99_regmove_cost,
1100 &thunderx2t99_vector_cost,
1101 &generic_branch_cost,
1102 &generic_approx_modes,
1103 SVE_NOT_IMPLEMENTED, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1120 };
1121
1122 static const struct tune_params neoversen1_tunings =
1123 {
1124 &cortexa57_extra_costs,
1125 &generic_addrcost_table,
1126 &generic_regmove_cost,
1127 &cortexa57_vector_cost,
1128 &generic_branch_cost,
1129 &generic_approx_modes,
1130 SVE_NOT_IMPLEMENTED, /* sve_width */
1131 4, /* memmov_cost */
1132 3, /* issue_rate */
1133 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "4", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1145 &generic_prefetch_tune
1146 };
1147
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
1150 {
1151 const char* name;
1152 void (*parse_override)(const char*, struct tune_params*);
1153 };
1154
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1158
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions[] =
1161 {
1162 { "fuse", aarch64_parse_fuse_string },
1163 { "tune", aarch64_parse_tune_string },
1164 { "sve_width", aarch64_parse_sve_width_string },
1165 { NULL, NULL }
1166 };
1167
1168 /* A processor implementing AArch64. */
1169 struct processor
1170 {
1171 const char *const name;
1172 enum aarch64_processor ident;
1173 enum aarch64_processor sched_core;
1174 enum aarch64_arch arch;
1175 unsigned architecture_version;
1176 const uint64_t flags;
1177 const struct tune_params *const tune;
1178 };
1179
1180 /* Architectures implementing AArch64. */
1181 static const struct processor all_architectures[] =
1182 {
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1187 };
1188
1189 /* Processor cores implementing AArch64. */
1190 static const struct processor all_cores[] =
1191 {
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1198 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1199 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1200 };
1201
1202
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor *selected_arch;
1206 static const struct processor *selected_cpu;
1207 static const struct processor *selected_tune;
1208
1209 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1210
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params = generic_tunings;
1213
1214 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1215
1216 static tree
1217 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1218 int, bool *no_add_attrs)
1219 {
1220 /* Since we set fn_type_req to true, the caller should have checked
1221 this for us. */
1222 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1223 switch ((arm_pcs) fntype_abi (*node).id ())
1224 {
1225 case ARM_PCS_AAPCS64:
1226 case ARM_PCS_SIMD:
1227 return NULL_TREE;
1228
1229 case ARM_PCS_SVE:
1230 error ("the %qE attribute cannot be applied to an SVE function type",
1231 name);
1232 *no_add_attrs = true;
1233 return NULL_TREE;
1234
1235 case ARM_PCS_TLSDESC:
1236 case ARM_PCS_UNKNOWN:
1237 break;
1238 }
1239 gcc_unreachable ();
1240 }
1241
1242 /* Table of machine attributes. */
1243 static const struct attribute_spec aarch64_attribute_table[] =
1244 {
1245 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1246 affects_type_identity, handler, exclude } */
1247 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1248 handle_aarch64_vector_pcs_attribute, NULL },
1249 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
1250 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1251 };
1252
1253 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1254
1255 /* An ISA extension in the co-processor and main instruction set space. */
1256 struct aarch64_option_extension
1257 {
1258 const char *const name;
1259 const unsigned long flags_on;
1260 const unsigned long flags_off;
1261 };
1262
1263 typedef enum aarch64_cond_code
1264 {
1265 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1266 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1267 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1268 }
1269 aarch64_cc;
1270
1271 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
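/* The enum above lists each condition next to its inverse (EQ/NE, CS/CC,
   MI/PL, VS/VC, HI/LS, GE/LT, GT/LE, AL/NV), so flipping bit 0 of the code
   yields the inverse condition; e.g. AARCH64_INVERSE_CONDITION_CODE
   (AARCH64_GE) is AARCH64_LT.  */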
1272
1273 struct aarch64_branch_protect_type
1274 {
1275 /* The type's name that the user passes to the branch-protection option
1276 string. */
1277 const char* name;
1278 /* Function to handle the protection type and set global variables.
1279 First argument is the string token corresponding with this type and the
1280 second argument is the next token in the option string.
1281 Return values:
1282 * AARCH64_PARSE_OK: Handling was successful.
1283 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1284 caller should print an error.
1285 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1286 prints its own error. */
1287 enum aarch64_parse_opt_result (*handler)(char*, char*);
1288 /* A list of types that can follow this type in the option string. */
1289 const aarch64_branch_protect_type* subtypes;
1290 unsigned int num_subtypes;
1291 };
1292
1293 static enum aarch64_parse_opt_result
1294 aarch64_handle_no_branch_protection (char* str, char* rest)
1295 {
1296 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1297 aarch64_enable_bti = 0;
1298 if (rest)
1299 {
1300 error ("unexpected %<%s%> after %<%s%>", rest, str);
1301 return AARCH64_PARSE_INVALID_FEATURE;
1302 }
1303 return AARCH64_PARSE_OK;
1304 }
1305
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_standard_branch_protection (char* str, char* rest)
1308 {
1309 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1310 aarch64_ra_sign_key = AARCH64_KEY_A;
1311 aarch64_enable_bti = 1;
1312 if (rest)
1313 {
1314 error ("unexpected %<%s%> after %<%s%>", rest, str);
1315 return AARCH64_PARSE_INVALID_FEATURE;
1316 }
1317 return AARCH64_PARSE_OK;
1318 }
1319
1320 static enum aarch64_parse_opt_result
1321 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1322 char* rest ATTRIBUTE_UNUSED)
1323 {
1324 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1325 aarch64_ra_sign_key = AARCH64_KEY_A;
1326 return AARCH64_PARSE_OK;
1327 }
1328
1329 static enum aarch64_parse_opt_result
1330 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1331 char* rest ATTRIBUTE_UNUSED)
1332 {
1333 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1334 return AARCH64_PARSE_OK;
1335 }
1336
1337 static enum aarch64_parse_opt_result
1338 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1339 char* rest ATTRIBUTE_UNUSED)
1340 {
1341 aarch64_ra_sign_key = AARCH64_KEY_B;
1342 return AARCH64_PARSE_OK;
1343 }
1344
1345 static enum aarch64_parse_opt_result
1346 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1347 char* rest ATTRIBUTE_UNUSED)
1348 {
1349 aarch64_enable_bti = 1;
1350 return AARCH64_PARSE_OK;
1351 }
1352
1353 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1354 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1355 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1356 { NULL, NULL, NULL, 0 }
1357 };
1358
1359 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1360 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1361 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1362 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1363 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1364 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1365 { NULL, NULL, NULL, 0 }
1366 };
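/* As an illustration of how these tables are used: a string such as
   "pac-ret+leaf+b-key" would be matched against the top-level table above,
   invoking aarch64_handle_pac_ret_protection for "pac-ret" and then the
   "leaf" and "b-key" subtype handlers, leaving aarch64_ra_sign_scope set to
   AARCH64_FUNCTION_ALL and aarch64_ra_sign_key set to AARCH64_KEY_B.  */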
1367
1368 /* The condition codes of the processor, and the inverse function. */
1369 static const char * const aarch64_condition_codes[] =
1370 {
1371 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1372 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1373 };
1374
1375 /* The preferred condition codes for SVE conditions. */
1376 static const char *const aarch64_sve_condition_codes[] =
1377 {
1378 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1379 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1380 };
1381
1382 /* Return the assembly token for svpattern value VALUE. */
1383
1384 static const char *
1385 svpattern_token (enum aarch64_svpattern pattern)
1386 {
1387 switch (pattern)
1388 {
1389 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1390 AARCH64_FOR_SVPATTERN (CASE)
1391 #undef CASE
1392 case AARCH64_NUM_SVPATTERNS:
1393 break;
1394 }
1395 gcc_unreachable ();
1396 }
1397
1398 /* Return the descriptor of the SIMD ABI. */
1399
1400 static const predefined_function_abi &
1401 aarch64_simd_abi (void)
1402 {
1403 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1404 if (!simd_abi.initialized_p ())
1405 {
1406 HARD_REG_SET full_reg_clobbers
1407 = default_function_abi.full_reg_clobbers ();
1408 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1409 if (FP_SIMD_SAVED_REGNUM_P (regno))
1410 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1411 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1412 }
1413 return simd_abi;
1414 }
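/* In other words, functions with the aarch64_vector_pcs attribute preserve
   the full 128-bit q8-q23 registers, whereas the base PCS only guarantees
   the low 64 bits of v8-v15.  */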
1415
1416 /* Return the descriptor of the SVE PCS. */
1417
1418 static const predefined_function_abi &
1419 aarch64_sve_abi (void)
1420 {
1421 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1422 if (!sve_abi.initialized_p ())
1423 {
1424 HARD_REG_SET full_reg_clobbers
1425 = default_function_abi.full_reg_clobbers ();
1426 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1427 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1428 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
1429 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1430 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1431 }
1432 return sve_abi;
1433 }
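/* In other words, in addition to the base-PCS rules, SVE PCS functions
   preserve the full z8-z23 vector registers and the predicate registers
   p4-p15.  */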
1434
1435 /* Generate code to enable conditional branches in functions over 1 MiB. */
1436 const char *
1437 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1438 const char * branch_format)
1439 {
1440 rtx_code_label * tmp_label = gen_label_rtx ();
1441 char label_buf[256];
1442 char buffer[128];
1443 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1444 CODE_LABEL_NUMBER (tmp_label));
1445 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1446 rtx dest_label = operands[pos_label];
1447 operands[pos_label] = tmp_label;
1448
1449 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1450 output_asm_insn (buffer, operands);
1451
1452 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1453 operands[pos_label] = dest_label;
1454 output_asm_insn (buffer, operands);
1455 return "";
1456 }
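/* The caller supplies a short-range branch (normally with the inverted
   condition) in BRANCH_FORMAT, so the emitted sequence has the shape:

	<branch>	.Ltmp
	b	<far destination>
   .Ltmp:

   i.e. the conditional branch only has to skip over the unconditional
   branch, which can reach the distant label.  */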
1457
1458 void
1459 aarch64_err_no_fpadvsimd (machine_mode mode)
1460 {
1461 if (TARGET_GENERAL_REGS_ONLY)
1462 if (FLOAT_MODE_P (mode))
1463 error ("%qs is incompatible with the use of floating-point types",
1464 "-mgeneral-regs-only");
1465 else
1466 error ("%qs is incompatible with the use of vector types",
1467 "-mgeneral-regs-only");
1468 else
1469 if (FLOAT_MODE_P (mode))
1470 error ("%qs feature modifier is incompatible with the use of"
1471 " floating-point types", "+nofp");
1472 else
1473 error ("%qs feature modifier is incompatible with the use of"
1474 " vector types", "+nofp");
1475 }
1476
1477 /* Report when we try to do something that requires SVE when SVE is disabled.
1478 This is an error of last resort and isn't very high-quality. It usually
1479 involves attempts to measure the vector length in some way. */
1480 static void
1481 aarch64_report_sve_required (void)
1482 {
1483 static bool reported_p = false;
1484
1485 /* Avoid reporting a slew of messages for a single oversight. */
1486 if (reported_p)
1487 return;
1488
1489 error ("this operation requires the SVE ISA extension");
1490 inform (input_location, "you can enable SVE using the command-line"
1491 " option %<-march%>, or by using the %<target%>"
1492 " attribute or pragma");
1493 reported_p = true;
1494 }
1495
1496 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1497 registers. */
1498 inline bool
1499 pr_or_ffr_regnum_p (unsigned int regno)
1500 {
1501 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1502 }
1503
1504 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1505 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1506 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1507 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1508 and GENERAL_REGS is lower than the memory cost (in this case the best class
1509 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1510 cost results in bad allocations with many redundant int<->FP moves which
1511 are expensive on various cores.
1512 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1513 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1514 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1515 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1516 The result of this is that it is no longer inefficient to have a higher
1517 memory move cost than the register move cost.
1518 */
1519
1520 static reg_class_t
1521 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1522 reg_class_t best_class)
1523 {
1524 machine_mode mode;
1525
1526 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1527 || !reg_class_subset_p (FP_REGS, allocno_class))
1528 return allocno_class;
1529
1530 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1531 || !reg_class_subset_p (FP_REGS, best_class))
1532 return best_class;
1533
1534 mode = PSEUDO_REGNO_MODE (regno);
1535 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1536 }
1537
1538 static unsigned int
1539 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1540 {
1541 if (GET_MODE_UNIT_SIZE (mode) == 4)
1542 return aarch64_tune_params.min_div_recip_mul_sf;
1543 return aarch64_tune_params.min_div_recip_mul_df;
1544 }
1545
1546 /* Return the reassociation width of treeop OPC with mode MODE. */
1547 static int
1548 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1549 {
1550 if (VECTOR_MODE_P (mode))
1551 return aarch64_tune_params.vec_reassoc_width;
1552 if (INTEGRAL_MODE_P (mode))
1553 return aarch64_tune_params.int_reassoc_width;
1554 /* Avoid reassociating floating point addition so we emit more FMAs. */
1555 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1556 return aarch64_tune_params.fp_reassoc_width;
1557 return 1;
1558 }
1559
1560 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1561 unsigned
1562 aarch64_dbx_register_number (unsigned regno)
1563 {
1564 if (GP_REGNUM_P (regno))
1565 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1566 else if (regno == SP_REGNUM)
1567 return AARCH64_DWARF_SP;
1568 else if (FP_REGNUM_P (regno))
1569 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1570 else if (PR_REGNUM_P (regno))
1571 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1572 else if (regno == VG_REGNUM)
1573 return AARCH64_DWARF_VG;
1574
1575 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1576 equivalent DWARF register. */
1577 return DWARF_FRAME_REGISTERS;
1578 }
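/* For example, hard register x7 maps to AARCH64_DWARF_R0 + 7, v13 maps to
   AARCH64_DWARF_V0 + 13, and p4 maps to AARCH64_DWARF_P0 + 4; anything not
   handled above is reported as having no DWARF equivalent.  */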
1579
1580 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1581 integer, otherwise return X unmodified. */
1582 static rtx
1583 aarch64_bit_representation (rtx x)
1584 {
1585 if (CONST_DOUBLE_P (x))
1586 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1587 return x;
1588 }
1589
1590 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1591 static bool
1592 aarch64_advsimd_struct_mode_p (machine_mode mode)
1593 {
1594 return (TARGET_SIMD
1595 && (mode == OImode || mode == CImode || mode == XImode));
1596 }
1597
1598 /* Return true if MODE is an SVE predicate mode. */
1599 static bool
1600 aarch64_sve_pred_mode_p (machine_mode mode)
1601 {
1602 return (TARGET_SVE
1603 && (mode == VNx16BImode
1604 || mode == VNx8BImode
1605 || mode == VNx4BImode
1606 || mode == VNx2BImode));
1607 }
1608
1609 /* Three mutually-exclusive flags describing a vector or predicate type. */
1610 const unsigned int VEC_ADVSIMD = 1;
1611 const unsigned int VEC_SVE_DATA = 2;
1612 const unsigned int VEC_SVE_PRED = 4;
1613 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1614 a structure of 2, 3 or 4 vectors. */
1615 const unsigned int VEC_STRUCT = 8;
1616 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1617 vector has fewer significant bytes than a full SVE vector. */
1618 const unsigned int VEC_PARTIAL = 16;
1619 /* Useful combinations of the above. */
1620 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1621 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1622
1623 /* Return a set of flags describing the vector properties of mode MODE.
1624 Ignore modes that are not supported by the current target. */
1625 static unsigned int
1626 aarch64_classify_vector_mode (machine_mode mode)
1627 {
1628 if (aarch64_advsimd_struct_mode_p (mode))
1629 return VEC_ADVSIMD | VEC_STRUCT;
1630
1631 if (aarch64_sve_pred_mode_p (mode))
1632 return VEC_SVE_PRED;
1633
1634 /* Make the decision based on the mode's enum value rather than its
1635 properties, so that we keep the correct classification regardless
1636 of -msve-vector-bits. */
1637 switch (mode)
1638 {
1639 /* Partial SVE QI vectors. */
1640 case E_VNx2QImode:
1641 case E_VNx4QImode:
1642 case E_VNx8QImode:
1643 /* Partial SVE HI vectors. */
1644 case E_VNx2HImode:
1645 case E_VNx4HImode:
1646 /* Partial SVE SI vector. */
1647 case E_VNx2SImode:
1648 /* Partial SVE HF vectors. */
1649 case E_VNx2HFmode:
1650 case E_VNx4HFmode:
1651 /* Partial SVE SF vector. */
1652 case E_VNx2SFmode:
1653 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1654
1655 case E_VNx16QImode:
1656 case E_VNx8HImode:
1657 case E_VNx4SImode:
1658 case E_VNx2DImode:
1659 case E_VNx8BFmode:
1660 case E_VNx8HFmode:
1661 case E_VNx4SFmode:
1662 case E_VNx2DFmode:
1663 return TARGET_SVE ? VEC_SVE_DATA : 0;
1664
1665 /* x2 SVE vectors. */
1666 case E_VNx32QImode:
1667 case E_VNx16HImode:
1668 case E_VNx8SImode:
1669 case E_VNx4DImode:
1670 case E_VNx16BFmode:
1671 case E_VNx16HFmode:
1672 case E_VNx8SFmode:
1673 case E_VNx4DFmode:
1674 /* x3 SVE vectors. */
1675 case E_VNx48QImode:
1676 case E_VNx24HImode:
1677 case E_VNx12SImode:
1678 case E_VNx6DImode:
1679 case E_VNx24BFmode:
1680 case E_VNx24HFmode:
1681 case E_VNx12SFmode:
1682 case E_VNx6DFmode:
1683 /* x4 SVE vectors. */
1684 case E_VNx64QImode:
1685 case E_VNx32HImode:
1686 case E_VNx16SImode:
1687 case E_VNx8DImode:
1688 case E_VNx32BFmode:
1689 case E_VNx32HFmode:
1690 case E_VNx16SFmode:
1691 case E_VNx8DFmode:
1692 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1693
1694 /* 64-bit Advanced SIMD vectors. */
1695 case E_V8QImode:
1696 case E_V4HImode:
1697 case E_V2SImode:
1698 /* ...E_V1DImode doesn't exist. */
1699 case E_V4HFmode:
1700 case E_V4BFmode:
1701 case E_V2SFmode:
1702 case E_V1DFmode:
1703 /* 128-bit Advanced SIMD vectors. */
1704 case E_V16QImode:
1705 case E_V8HImode:
1706 case E_V4SImode:
1707 case E_V2DImode:
1708 case E_V8HFmode:
1709 case E_V8BFmode:
1710 case E_V4SFmode:
1711 case E_V2DFmode:
1712 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1713
1714 default:
1715 return 0;
1716 }
1717 }
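/* Some example classifications, assuming the corresponding target features
   are enabled: V16QImode -> VEC_ADVSIMD, OImode -> VEC_ADVSIMD | VEC_STRUCT,
   VNx4SImode -> VEC_SVE_DATA, VNx2SImode -> VEC_SVE_DATA | VEC_PARTIAL,
   VNx32QImode -> VEC_SVE_DATA | VEC_STRUCT and VNx16BImode -> VEC_SVE_PRED.  */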
1718
1719 /* Return true if MODE is any of the data vector modes, including
1720 structure modes. */
1721 static bool
1722 aarch64_vector_data_mode_p (machine_mode mode)
1723 {
1724 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1725 }
1726
1727 /* Return true if MODE is any form of SVE mode, including predicates,
1728 vectors and structures. */
1729 bool
1730 aarch64_sve_mode_p (machine_mode mode)
1731 {
1732 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1733 }
1734
1735 /* Return true if MODE is an SVE data vector mode; either a single vector
1736 or a structure of vectors. */
1737 static bool
1738 aarch64_sve_data_mode_p (machine_mode mode)
1739 {
1740 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1741 }
1742
1743 /* Return the number of defined bytes in one constituent vector of
1744 SVE mode MODE, which has vector flags VEC_FLAGS. */
1745 static poly_int64
1746 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1747 {
1748 if (vec_flags & VEC_PARTIAL)
1749 /* A single partial vector. */
1750 return GET_MODE_SIZE (mode);
1751
1752 if (vec_flags & VEC_SVE_DATA)
1753 /* A single vector or a tuple. */
1754 return BYTES_PER_SVE_VECTOR;
1755
1756 /* A single predicate. */
1757 gcc_assert (vec_flags & VEC_SVE_PRED);
1758 return BYTES_PER_SVE_PRED;
1759 }
1760
1761 /* Implement target hook TARGET_ARRAY_MODE. */
1762 static opt_machine_mode
1763 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1764 {
1765 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1766 && IN_RANGE (nelems, 2, 4))
1767 return mode_for_vector (GET_MODE_INNER (mode),
1768 GET_MODE_NUNITS (mode) * nelems);
1769
1770 return opt_machine_mode ();
1771 }
1772
1773 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1774 static bool
1775 aarch64_array_mode_supported_p (machine_mode mode,
1776 unsigned HOST_WIDE_INT nelems)
1777 {
1778 if (TARGET_SIMD
1779 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1780 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1781 && (nelems >= 2 && nelems <= 4))
1782 return true;
1783
1784 return false;
1785 }
1786
1787 /* MODE is some form of SVE vector mode. For data modes, return the number
1788 of vector register bits that each element of MODE occupies, such as 64
1789 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1790 in a 64-bit container). For predicate modes, return the number of
1791 data bits controlled by each significant predicate bit. */
1792
1793 static unsigned int
1794 aarch64_sve_container_bits (machine_mode mode)
1795 {
1796 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1797 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
1798 ? BITS_PER_SVE_VECTOR
1799 : GET_MODE_BITSIZE (mode));
1800 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
1801 }
1802
1803 /* Return the SVE predicate mode to use for elements that have
1804 ELEM_NBYTES bytes, if such a mode exists. */
1805
1806 opt_machine_mode
1807 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1808 {
1809 if (TARGET_SVE)
1810 {
1811 if (elem_nbytes == 1)
1812 return VNx16BImode;
1813 if (elem_nbytes == 2)
1814 return VNx8BImode;
1815 if (elem_nbytes == 4)
1816 return VNx4BImode;
1817 if (elem_nbytes == 8)
1818 return VNx2BImode;
1819 }
1820 return opt_machine_mode ();
1821 }
1822
1823 /* Return the SVE predicate mode that should be used to control
1824 SVE mode MODE. */
1825
1826 machine_mode
1827 aarch64_sve_pred_mode (machine_mode mode)
1828 {
1829 unsigned int bits = aarch64_sve_container_bits (mode);
1830 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
1831 }
1832
1833 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1834
1835 static opt_machine_mode
1836 aarch64_get_mask_mode (machine_mode mode)
1837 {
1838 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1839 if (vec_flags & VEC_SVE_DATA)
1840 return aarch64_sve_pred_mode (mode);
1841
1842 return default_get_mask_mode (mode);
1843 }
1844
1845 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1846
1847 opt_machine_mode
1848 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1849 {
1850 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1851 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1852 machine_mode mode;
1853 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1854 if (inner_mode == GET_MODE_INNER (mode)
1855 && known_eq (nunits, GET_MODE_NUNITS (mode))
1856 && aarch64_sve_data_mode_p (mode))
1857 return mode;
1858 return opt_machine_mode ();
1859 }
1860
1861 /* Return the integer element mode associated with SVE mode MODE. */
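/* For example, both VNx4SImode and the predicate mode VNx4BImode map
   to SImode.  */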
1862
1863 static scalar_int_mode
1864 aarch64_sve_element_int_mode (machine_mode mode)
1865 {
1866 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
1867 ? BITS_PER_SVE_VECTOR
1868 : GET_MODE_BITSIZE (mode));
1869 unsigned int elt_bits = vector_element_size (vector_bits,
1870 GET_MODE_NUNITS (mode));
1871 return int_mode_for_size (elt_bits, 0).require ();
1872 }
1873
1874 /* Return an integer element mode that contains exactly
1875 aarch64_sve_container_bits (MODE) bits. This is wider than
1876 aarch64_sve_element_int_mode if MODE is a partial vector,
1877 otherwise it's the same. */
1878
1879 static scalar_int_mode
1880 aarch64_sve_container_int_mode (machine_mode mode)
1881 {
1882 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
1883 }
1884
1885 /* Return the integer vector mode associated with SVE mode MODE.
1886 Unlike related_int_vector_mode, this can handle the case in which
1887 MODE is a predicate (and thus has a different total size). */
1888
1889 machine_mode
1890 aarch64_sve_int_mode (machine_mode mode)
1891 {
1892 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1893 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1894 }
1895
1896 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1897
1898 static opt_machine_mode
1899 aarch64_vectorize_related_mode (machine_mode vector_mode,
1900 scalar_mode element_mode,
1901 poly_uint64 nunits)
1902 {
1903 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
1904
1905 /* If we're operating on SVE vectors, try to return an SVE mode. */
1906 poly_uint64 sve_nunits;
1907 if ((vec_flags & VEC_SVE_DATA)
1908 && multiple_p (BYTES_PER_SVE_VECTOR,
1909 GET_MODE_SIZE (element_mode), &sve_nunits))
1910 {
1911 machine_mode sve_mode;
1912 if (maybe_ne (nunits, 0U))
1913 {
1914 /* Try to find a full or partial SVE mode with exactly
1915 NUNITS units. */
1916 if (multiple_p (sve_nunits, nunits)
1917 && aarch64_sve_data_mode (element_mode,
1918 nunits).exists (&sve_mode))
1919 return sve_mode;
1920 }
1921 else
1922 {
1923 /* Take the preferred number of units from the number of bytes
1924 that fit in VECTOR_MODE. We always start by "autodetecting"
1925 a full vector mode with preferred_simd_mode, so vectors
1926 chosen here will also be full vector modes. Then
1927 autovectorize_vector_modes tries smaller starting modes
1928 and thus smaller preferred numbers of units. */
1929 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
1930 if (aarch64_sve_data_mode (element_mode,
1931 sve_nunits).exists (&sve_mode))
1932 return sve_mode;
1933 }
1934 }
1935
1936 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1937 if ((vec_flags & VEC_ADVSIMD)
1938 && known_eq (nunits, 0U)
1939 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
1940 && maybe_ge (GET_MODE_BITSIZE (element_mode)
1941 * GET_MODE_NUNITS (vector_mode), 128U))
1942 {
1943 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
1944 if (VECTOR_MODE_P (res))
1945 return res;
1946 }
1947
1948 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
1949 }
1950
1951 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1952 prefer to use the first arithmetic operand as the else value if
1953 the else value doesn't matter, since that exactly matches the SVE
1954 destructive merging form. For ternary operations we could either
1955 pick the first operand and use FMAD-like instructions or the last
1956 operand and use FMLA-like instructions; the latter seems more
1957 natural. */
1958
1959 static tree
1960 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1961 {
1962 return nops == 3 ? ops[2] : ops[0];
1963 }
1964
1965 /* Implement TARGET_HARD_REGNO_NREGS. */
1966
1967 static unsigned int
1968 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1969 {
1970 /* ??? Logically we should only need to provide a value when
1971 HARD_REGNO_MODE_OK says that the combination is valid,
1972 but at the moment we need to handle all modes. Just ignore
1973 any runtime parts for registers that can't store them. */
1974 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1975 switch (aarch64_regno_regclass (regno))
1976 {
1977 case FP_REGS:
1978 case FP_LO_REGS:
1979 case FP_LO8_REGS:
1980 {
1981 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1982 if (vec_flags & VEC_SVE_DATA)
1983 return exact_div (GET_MODE_SIZE (mode),
1984 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
1985 return CEIL (lowest_size, UNITS_PER_VREG);
1986 }
1987 case PR_REGS:
1988 case PR_LO_REGS:
1989 case PR_HI_REGS:
1990 case FFR_REGS:
1991 case PR_AND_FFR_REGS:
1992 return 1;
1993 default:
1994 return CEIL (lowest_size, UNITS_PER_WORD);
1995 }
1996 gcc_unreachable ();
1997 }
1998
1999 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2000
2001 static bool
2002 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2003 {
2004 if (GET_MODE_CLASS (mode) == MODE_CC)
2005 return regno == CC_REGNUM;
2006
2007 if (regno == VG_REGNUM)
2008 /* This must have the same size as _Unwind_Word. */
2009 return mode == DImode;
2010
2011 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2012 if (vec_flags & VEC_SVE_PRED)
2013 return pr_or_ffr_regnum_p (regno);
2014
2015 if (pr_or_ffr_regnum_p (regno))
2016 return false;
2017
2018 if (regno == SP_REGNUM)
2019 /* The purpose of comparing with ptr_mode is to support the
2020 global register variable associated with the stack pointer
2021 register via the syntax of asm ("wsp") in ILP32. */
2022 return mode == Pmode || mode == ptr_mode;
2023
2024 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2025 return mode == Pmode;
2026
2027 if (GP_REGNUM_P (regno))
2028 {
2029 if (vec_flags & VEC_ANY_SVE)
2030 return false;
2031 if (known_le (GET_MODE_SIZE (mode), 8))
2032 return true;
2033 if (known_le (GET_MODE_SIZE (mode), 16))
2034 return (regno & 1) == 0;
2035 }
2036 else if (FP_REGNUM_P (regno))
2037 {
2038 if (vec_flags & VEC_STRUCT)
2039 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2040 else
2041 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2042 }
2043
2044 return false;
2045 }
2046
2052 /* Return true if a function with type FNTYPE returns its value in
2053 SVE vector or predicate registers. */
2054
2055 static bool
2056 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2057 {
2058 tree return_type = TREE_TYPE (fntype);
2059 return (return_type != error_mark_node
2060 && aarch64_sve::builtin_type_p (return_type));
2061 }
2062
2063 /* Return true if a function with type FNTYPE takes arguments in
2064 SVE vector or predicate registers. */
2065
2066 static bool
2067 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2068 {
2069 CUMULATIVE_ARGS args_so_far_v;
2070 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2071 NULL_TREE, 0, true);
2072 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2073
2074 for (tree chain = TYPE_ARG_TYPES (fntype);
2075 chain && chain != void_list_node;
2076 chain = TREE_CHAIN (chain))
2077 {
2078 tree arg_type = TREE_VALUE (chain);
2079 if (arg_type == error_mark_node)
2080 return false;
2081
2082 function_arg_info arg (arg_type, /*named=*/true);
2083 apply_pass_by_reference_rules (&args_so_far_v, arg);
2084 if (aarch64_sve::builtin_type_p (arg.type))
2085 return true;
2086
2087 targetm.calls.function_arg_advance (args_so_far, arg);
2088 }
2089 return false;
2090 }
2091
2092 /* Implement TARGET_FNTYPE_ABI. */
2093
2094 static const predefined_function_abi &
2095 aarch64_fntype_abi (const_tree fntype)
2096 {
2097 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2098 return aarch64_simd_abi ();
2099
2100 if (aarch64_returns_value_in_sve_regs_p (fntype)
2101 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2102 return aarch64_sve_abi ();
2103
2104 return default_function_abi;
2105 }
2106
2107 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2108
2109 static bool
2110 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2111 {
2112 return (aarch64_sve::builtin_type_p (type1)
2113 == aarch64_sve::builtin_type_p (type2));
2114 }
2115
2116 /* Return true if we should emit CFI for register REGNO. */
2117
2118 static bool
2119 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2120 {
2121 return (GP_REGNUM_P (regno)
2122 || !default_function_abi.clobbers_full_reg_p (regno));
2123 }
2124
2125 /* Return the mode we should use to save and restore register REGNO. */
2126
2127 static machine_mode
2128 aarch64_reg_save_mode (unsigned int regno)
2129 {
2130 if (GP_REGNUM_P (regno))
2131 return DImode;
2132
2133 if (FP_REGNUM_P (regno))
2134 switch (crtl->abi->id ())
2135 {
2136 case ARM_PCS_AAPCS64:
2137 /* Only the low 64 bits are saved by the base PCS. */
2138 return DFmode;
2139
2140 case ARM_PCS_SIMD:
2141 /* The vector PCS saves the low 128 bits (which is the full
2142 register on non-SVE targets). */
2143 return TFmode;
2144
2145 case ARM_PCS_SVE:
2146 /* Use vectors of DImode for registers that need frame
2147 information, so that the first 64 bits of the save slot
2148 are always the equivalent of what storing D<n> would give. */
2149 if (aarch64_emit_cfi_for_reg_p (regno))
2150 return VNx2DImode;
2151
2152 /* Use vectors of bytes otherwise, so that the layout is
2153 endian-agnostic, and so that we can use LDR and STR for
2154 big-endian targets. */
2155 return VNx16QImode;
2156
2157 case ARM_PCS_TLSDESC:
2158 case ARM_PCS_UNKNOWN:
2159 break;
2160 }
2161
2162 if (PR_REGNUM_P (regno))
2163 /* Save the full predicate register. */
2164 return VNx16BImode;
2165
2166 gcc_unreachable ();
2167 }
2168
2169 /* Implement TARGET_INSN_CALLEE_ABI. */
2170
2171 const predefined_function_abi &
2172 aarch64_insn_callee_abi (const rtx_insn *insn)
2173 {
2174 rtx pat = PATTERN (insn);
2175 gcc_assert (GET_CODE (pat) == PARALLEL);
2176 rtx unspec = XVECEXP (pat, 0, 1);
2177 gcc_assert (GET_CODE (unspec) == UNSPEC
2178 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2179 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2180 }
2181
2182 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2183 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2184 clobbers the top 64 bits when restoring the bottom 64 bits. */
2185
2186 static bool
2187 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2188 unsigned int regno,
2189 machine_mode mode)
2190 {
2191 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2192 {
2193 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2194 unsigned int nregs = hard_regno_nregs (regno, mode);
2195 if (nregs > 1)
2196 per_register_size = exact_div (per_register_size, nregs);
2197 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2198 return maybe_gt (per_register_size, 16);
2199 return maybe_gt (per_register_size, 8);
2200 }
2201 return false;
2202 }
2203
2204 /* Implement REGMODE_NATURAL_SIZE. */
2205 poly_uint64
2206 aarch64_regmode_natural_size (machine_mode mode)
2207 {
2208 /* The natural size for SVE data modes is one SVE data vector,
2209 and similarly for predicates. We can't independently modify
2210 anything smaller than that. */
2211 /* ??? For now, only do this for variable-width SVE registers.
2212 Doing it for constant-sized registers breaks lower-subreg.c. */
2213 /* ??? And once that's fixed, we should probably have similar
2214 code for Advanced SIMD. */
2215 if (!aarch64_sve_vg.is_constant ())
2216 {
2217 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2218 if (vec_flags & VEC_SVE_PRED)
2219 return BYTES_PER_SVE_PRED;
2220 if (vec_flags & VEC_SVE_DATA)
2221 return BYTES_PER_SVE_VECTOR;
2222 }
2223 return UNITS_PER_WORD;
2224 }
2225
2226 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2227 machine_mode
2228 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2229 machine_mode mode)
2230 {
2231 /* The predicate mode determines which bits are significant and
2232 which are "don't care". Decreasing the number of lanes would
2233 lose data while increasing the number of lanes would make bits
2234 unnecessarily significant. */
2235 if (PR_REGNUM_P (regno))
2236 return mode;
2237 if (known_ge (GET_MODE_SIZE (mode), 4))
2238 return mode;
2239 else
2240 return SImode;
2241 }
2242
2243 /* Return true if I's bits are consecutive ones from the MSB. */
2244 bool
2245 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2246 {
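/* If I is a block of ones starting at the MSB (with all lower bits clear),
   then -I is a power of two and exact_log2 returns its nonnegative log;
   for any other value, including zero, exact_log2 returns HOST_WIDE_INT_M1.  */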
2247 return exact_log2 (-i) != HOST_WIDE_INT_M1;
2248 }
2249
2250 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2251 that strcpy from constants will be faster. */
2252
2253 static HOST_WIDE_INT
2254 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2255 {
2256 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2257 return MAX (align, BITS_PER_WORD);
2258 return align;
2259 }
2260
2261 /* Return true if calls to DECL should be treated as
2262 long-calls (ie called via a register). */
2263 static bool
2264 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2265 {
2266 return false;
2267 }
2268
2269 /* Return true if calls to symbol-ref SYM should be treated as
2270 long-calls (ie called via a register). */
2271 bool
2272 aarch64_is_long_call_p (rtx sym)
2273 {
2274 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2275 }
2276
2277 /* Return true if calls to symbol-ref SYM should not go through
2278 plt stubs. */
2279
2280 bool
2281 aarch64_is_noplt_call_p (rtx sym)
2282 {
2283 const_tree decl = SYMBOL_REF_DECL (sym);
2284
2285 if (flag_pic
2286 && decl
2287 && (!flag_plt
2288 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2289 && !targetm.binds_local_p (decl))
2290 return true;
2291
2292 return false;
2293 }
2294
2295 /* Return true if the offsets to a zero/sign-extract operation
2296 represent an expression that matches an extend operation. The
2297 operands represent the parameters from
2298
2299 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
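/* For example, a MULT_IMM of 4 with an EXTRACT_IMM of 18 extracts bits
   [17:0] of a value shifted left by 2, which is equivalent to extending
   a 16-bit value and shifting the result left by 2.  */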
2300 bool
2301 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2302 rtx extract_imm)
2303 {
2304 HOST_WIDE_INT mult_val, extract_val;
2305
2306 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2307 return false;
2308
2309 mult_val = INTVAL (mult_imm);
2310 extract_val = INTVAL (extract_imm);
2311
2312 if (extract_val > 8
2313 && extract_val < GET_MODE_BITSIZE (mode)
2314 && exact_log2 (extract_val & ~7) > 0
2315 && (extract_val & 7) <= 4
2316 && mult_val == (1 << (extract_val & 7)))
2317 return true;
2318
2319 return false;
2320 }
2321
2322 /* Emit an insn that's a simple single-set. Both operands must be
2323 known to be valid. */
2324 inline static rtx_insn *
2325 emit_set_insn (rtx x, rtx y)
2326 {
2327 return emit_insn (gen_rtx_SET (x, y));
2328 }
2329
2330 /* X and Y are two things to compare using CODE. Emit the compare insn and
2331 return the rtx for register 0 in the proper mode. */
2332 rtx
2333 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2334 {
2335 machine_mode cmp_mode = GET_MODE (x);
2336 machine_mode cc_mode;
2337 rtx cc_reg;
2338
2339 if (cmp_mode == TImode)
2340 {
2341 gcc_assert (code == NE);
2342
2343 cc_mode = CCmode;
2344 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2345
2346 rtx x_lo = operand_subword (x, 0, 0, TImode);
2347 rtx y_lo = operand_subword (y, 0, 0, TImode);
2348 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2349
2350 rtx x_hi = operand_subword (x, 1, 0, TImode);
2351 rtx y_hi = operand_subword (y, 1, 0, TImode);
2352 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2353 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2354 GEN_INT (AARCH64_EQ)));
2355 }
2356 else
2357 {
2358 cc_mode = SELECT_CC_MODE (code, x, y);
2359 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2360 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2361 }
2362 return cc_reg;
2363 }
2364
2365 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2366
2367 static rtx
2368 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2369 machine_mode y_mode)
2370 {
2371 if (y_mode == E_QImode || y_mode == E_HImode)
2372 {
2373 if (CONST_INT_P (y))
2374 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2375 else
2376 {
2377 rtx t, cc_reg;
2378 machine_mode cc_mode;
2379
2380 t = gen_rtx_ZERO_EXTEND (SImode, y);
2381 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2382 cc_mode = CC_SWPmode;
2383 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2384 emit_set_insn (cc_reg, t);
2385 return cc_reg;
2386 }
2387 }
2388
2389 if (!aarch64_plus_operand (y, y_mode))
2390 y = force_reg (y_mode, y);
2391
2392 return aarch64_gen_compare_reg (code, x, y);
2393 }
2394
2395 /* Build the SYMBOL_REF for __tls_get_addr. */
2396
2397 static GTY(()) rtx tls_get_addr_libfunc;
2398
2399 rtx
2400 aarch64_tls_get_addr (void)
2401 {
2402 if (!tls_get_addr_libfunc)
2403 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2404 return tls_get_addr_libfunc;
2405 }
2406
2407 /* Return the TLS model to use for ADDR. */
2408
2409 static enum tls_model
2410 tls_symbolic_operand_type (rtx addr)
2411 {
2412 enum tls_model tls_kind = TLS_MODEL_NONE;
2413 if (GET_CODE (addr) == CONST)
2414 {
2415 poly_int64 addend;
2416 rtx sym = strip_offset (addr, &addend);
2417 if (GET_CODE (sym) == SYMBOL_REF)
2418 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2419 }
2420 else if (GET_CODE (addr) == SYMBOL_REF)
2421 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2422
2423 return tls_kind;
2424 }
2425
2426 /* We allow LO_SUMs in our legitimate addresses so that combine
2427 can take care of combining addresses where necessary, but for
2428 generation purposes we generate the address
2429 as:
2430 RTL Absolute
2431 tmp = hi (symbol_ref); adrp x1, foo
2432 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2433 nop
2434
2435 PIC TLS
2436 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2437 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2438 bl __tls_get_addr
2439 nop
2440
2441 Load TLS symbol, depending on TLS mechanism and TLS access model.
2442
2443 Global Dynamic - Traditional TLS:
2444 adrp tmp, :tlsgd:imm
2445 add dest, tmp, #:tlsgd_lo12:imm
2446 bl __tls_get_addr
2447
2448 Global Dynamic - TLS Descriptors:
2449 adrp dest, :tlsdesc:imm
2450 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2451 add dest, dest, #:tlsdesc_lo12:imm
2452 blr tmp
2453 mrs tp, tpidr_el0
2454 add dest, dest, tp
2455
2456 Initial Exec:
2457 mrs tp, tpidr_el0
2458 adrp tmp, :gottprel:imm
2459 ldr dest, [tmp, #:gottprel_lo12:imm]
2460 add dest, dest, tp
2461
2462 Local Exec:
2463 mrs tp, tpidr_el0
2464 add t0, tp, #:tprel_hi12:imm, lsl #12
2465 add t0, t0, #:tprel_lo12_nc:imm
2466 */
2467
2468 static void
2469 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2470 enum aarch64_symbol_type type)
2471 {
2472 switch (type)
2473 {
2474 case SYMBOL_SMALL_ABSOLUTE:
2475 {
2476 /* In ILP32, the mode of dest can be either SImode or DImode. */
2477 rtx tmp_reg = dest;
2478 machine_mode mode = GET_MODE (dest);
2479
2480 gcc_assert (mode == Pmode || mode == ptr_mode);
2481
2482 if (can_create_pseudo_p ())
2483 tmp_reg = gen_reg_rtx (mode);
2484
2485 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2486 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2487 return;
2488 }
2489
2490 case SYMBOL_TINY_ABSOLUTE:
2491 emit_insn (gen_rtx_SET (dest, imm));
2492 return;
2493
2494 case SYMBOL_SMALL_GOT_28K:
2495 {
2496 machine_mode mode = GET_MODE (dest);
2497 rtx gp_rtx = pic_offset_table_rtx;
2498 rtx insn;
2499 rtx mem;
2500
2501 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2502 here before RTL expansion. Tree IVOPTS generates RTL patterns to
2503 estimate rtx costs, in which case pic_offset_table_rtx is not
2504 initialized. In that case there is no need to generate the first
2505 adrp instruction, as the final cost for global variable access is
2506 one instruction. */
2507 if (gp_rtx != NULL)
2508 {
2509 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
2510 we use the page base as the GOT base, the first page may be
2511 wasted; in the worst case only 28K of space is left for the GOT).
2512
2513 The generated instruction sequence for accessing a global
2514 variable is:
2515
2516 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2517
2518 Only one instruction is needed, but we must initialize
2519 pic_offset_table_rtx properly. We generate an initialization insn
2520 for every global access and rely on CSE to remove the redundant ones.
2521
2522 The final instruction sequence will look like the following
2523 for multiple global variable accesses.
2524
2525 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2526
2527 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2528 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2529 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2530 ... */
2531
2532 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2533 crtl->uses_pic_offset_table = 1;
2534 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2535
2536 if (mode != GET_MODE (gp_rtx))
2537 gp_rtx = gen_lowpart (mode, gp_rtx);
2538
2539 }
2540
2541 if (mode == ptr_mode)
2542 {
2543 if (mode == DImode)
2544 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2545 else
2546 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2547
2548 mem = XVECEXP (SET_SRC (insn), 0, 0);
2549 }
2550 else
2551 {
2552 gcc_assert (mode == Pmode);
2553
2554 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2555 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2556 }
2557
2558 /* The operand is expected to be a MEM. Whenever the related insn
2559 pattern changes, the code above that calculates MEM should be
2560 updated. */
2561 gcc_assert (GET_CODE (mem) == MEM);
2562 MEM_READONLY_P (mem) = 1;
2563 MEM_NOTRAP_P (mem) = 1;
2564 emit_insn (insn);
2565 return;
2566 }
2567
2568 case SYMBOL_SMALL_GOT_4G:
2569 {
2570 /* In ILP32, the mode of dest can be either SImode or DImode,
2571 while the got entry is always of SImode size. The mode of
2572 dest depends on how dest is used: if dest is assigned to a
2573 pointer (e.g. in the memory), it has SImode; it may have
2574 DImode if dest is dereferenced to access the memory.
2575 This is why we have to handle three different ldr_got_small
2576 patterns here (two patterns for ILP32). */
2577
2578 rtx insn;
2579 rtx mem;
2580 rtx tmp_reg = dest;
2581 machine_mode mode = GET_MODE (dest);
2582
2583 if (can_create_pseudo_p ())
2584 tmp_reg = gen_reg_rtx (mode);
2585
2586 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2587 if (mode == ptr_mode)
2588 {
2589 if (mode == DImode)
2590 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2591 else
2592 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2593
2594 mem = XVECEXP (SET_SRC (insn), 0, 0);
2595 }
2596 else
2597 {
2598 gcc_assert (mode == Pmode);
2599
2600 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2601 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2602 }
2603
2604 gcc_assert (GET_CODE (mem) == MEM);
2605 MEM_READONLY_P (mem) = 1;
2606 MEM_NOTRAP_P (mem) = 1;
2607 emit_insn (insn);
2608 return;
2609 }
2610
2611 case SYMBOL_SMALL_TLSGD:
2612 {
2613 rtx_insn *insns;
2614 /* The return type of __tls_get_addr is the C pointer type
2615 so use ptr_mode. */
2616 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
2617 rtx tmp_reg = dest;
2618
2619 if (GET_MODE (dest) != ptr_mode)
2620 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
2621
2622 start_sequence ();
2623 if (ptr_mode == SImode)
2624 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2625 else
2626 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2627 insns = get_insns ();
2628 end_sequence ();
2629
2630 RTL_CONST_CALL_P (insns) = 1;
2631 emit_libcall_block (insns, tmp_reg, result, imm);
2632 /* Convert back to the mode of the dest adding a zero_extend
2633 from SImode (ptr_mode) to DImode (Pmode). */
2634 if (dest != tmp_reg)
2635 convert_move (dest, tmp_reg, true);
2636 return;
2637 }
2638
2639 case SYMBOL_SMALL_TLSDESC:
2640 {
2641 machine_mode mode = GET_MODE (dest);
2642 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2643 rtx tp;
2644
2645 gcc_assert (mode == Pmode || mode == ptr_mode);
2646
2647 /* In ILP32, the got entry is always of SImode size. Unlike
2648 small GOT, the dest is fixed at reg 0. */
2649 if (TARGET_ILP32)
2650 emit_insn (gen_tlsdesc_small_si (imm));
2651 else
2652 emit_insn (gen_tlsdesc_small_di (imm));
2653 tp = aarch64_load_tp (NULL);
2654
2655 if (mode != Pmode)
2656 tp = gen_lowpart (mode, tp);
2657
2658 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2659 if (REG_P (dest))
2660 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2661 return;
2662 }
2663
2664 case SYMBOL_SMALL_TLSIE:
2665 {
2666 /* In ILP32, the mode of dest can be either SImode or DImode,
2667 while the got entry is always of SImode size. The mode of
2668 dest depends on how dest is used: if dest is assigned to a
2669 pointer (e.g. in the memory), it has SImode; it may have
2670 DImode if dest is dereferenced to access the memory.
2671 This is why we have to handle three different tlsie_small
2672 patterns here (two patterns for ILP32). */
2673 machine_mode mode = GET_MODE (dest);
2674 rtx tmp_reg = gen_reg_rtx (mode);
2675 rtx tp = aarch64_load_tp (NULL);
2676
2677 if (mode == ptr_mode)
2678 {
2679 if (mode == DImode)
2680 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2681 else
2682 {
2683 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2684 tp = gen_lowpart (mode, tp);
2685 }
2686 }
2687 else
2688 {
2689 gcc_assert (mode == Pmode);
2690 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2691 }
2692
2693 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2694 if (REG_P (dest))
2695 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2696 return;
2697 }
2698
2699 case SYMBOL_TLSLE12:
2700 case SYMBOL_TLSLE24:
2701 case SYMBOL_TLSLE32:
2702 case SYMBOL_TLSLE48:
2703 {
2704 machine_mode mode = GET_MODE (dest);
2705 rtx tp = aarch64_load_tp (NULL);
2706
2707 if (mode != Pmode)
2708 tp = gen_lowpart (mode, tp);
2709
2710 switch (type)
2711 {
2712 case SYMBOL_TLSLE12:
2713 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2714 (dest, tp, imm));
2715 break;
2716 case SYMBOL_TLSLE24:
2717 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2718 (dest, tp, imm));
2719 break;
2720 case SYMBOL_TLSLE32:
2721 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2722 (dest, imm));
2723 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2724 (dest, dest, tp));
2725 break;
2726 case SYMBOL_TLSLE48:
2727 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2728 (dest, imm));
2729 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2730 (dest, dest, tp));
2731 break;
2732 default:
2733 gcc_unreachable ();
2734 }
2735
2736 if (REG_P (dest))
2737 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2738 return;
2739 }
2740
2741 case SYMBOL_TINY_GOT:
2742 emit_insn (gen_ldr_got_tiny (dest, imm));
2743 return;
2744
2745 case SYMBOL_TINY_TLSIE:
2746 {
2747 machine_mode mode = GET_MODE (dest);
2748 rtx tp = aarch64_load_tp (NULL);
2749
2750 if (mode == ptr_mode)
2751 {
2752 if (mode == DImode)
2753 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2754 else
2755 {
2756 tp = gen_lowpart (mode, tp);
2757 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2758 }
2759 }
2760 else
2761 {
2762 gcc_assert (mode == Pmode);
2763 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2764 }
2765
2766 if (REG_P (dest))
2767 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2768 return;
2769 }
2770
2771 default:
2772 gcc_unreachable ();
2773 }
2774 }
2775
2776 /* Emit a move from SRC to DEST. Assume that the move expanders can
2777 handle all moves if !can_create_pseudo_p (). The distinction is
2778 important because, unlike emit_move_insn, the move expanders know
2779 how to force Pmode objects into the constant pool even when the
2780 constant pool address is not itself legitimate. */
2781 static rtx
2782 aarch64_emit_move (rtx dest, rtx src)
2783 {
2784 return (can_create_pseudo_p ()
2785 ? emit_move_insn (dest, src)
2786 : emit_move_insn_1 (dest, src));
2787 }
2788
2789 /* Apply UNOPTAB to OP and store the result in DEST. */
2790
2791 static void
2792 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2793 {
2794 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2795 if (dest != tmp)
2796 emit_move_insn (dest, tmp);
2797 }
2798
2799 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2800
2801 static void
2802 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2803 {
2804 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2805 OPTAB_DIRECT);
2806 if (dest != tmp)
2807 emit_move_insn (dest, tmp);
2808 }
2809
2810 /* Split a 128-bit move operation into two 64-bit move operations,
2811 taking care to handle partial overlap of register to register
2812 copies. Special cases are needed when moving between GP regs and
2813 FP regs. SRC can be a register, constant or memory; DST a register
2814 or memory. If either operand is memory it must not have any side
2815 effects. */
2816 void
2817 aarch64_split_128bit_move (rtx dst, rtx src)
2818 {
2819 rtx dst_lo, dst_hi;
2820 rtx src_lo, src_hi;
2821
2822 machine_mode mode = GET_MODE (dst);
2823
2824 gcc_assert (mode == TImode || mode == TFmode);
2825 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2826 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2827
2828 if (REG_P (dst) && REG_P (src))
2829 {
2830 int src_regno = REGNO (src);
2831 int dst_regno = REGNO (dst);
2832
2833 /* Handle FP <-> GP regs. */
2834 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2835 {
2836 src_lo = gen_lowpart (word_mode, src);
2837 src_hi = gen_highpart (word_mode, src);
2838
2839 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2840 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2841 return;
2842 }
2843 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2844 {
2845 dst_lo = gen_lowpart (word_mode, dst);
2846 dst_hi = gen_highpart (word_mode, dst);
2847
2848 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2849 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2850 return;
2851 }
2852 }
2853
2854 dst_lo = gen_lowpart (word_mode, dst);
2855 dst_hi = gen_highpart (word_mode, dst);
2856 src_lo = gen_lowpart (word_mode, src);
2857 src_hi = gen_highpart_mode (word_mode, mode, src);
2858
2859 /* At most one pairing may overlap. */
2860 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2861 {
2862 aarch64_emit_move (dst_hi, src_hi);
2863 aarch64_emit_move (dst_lo, src_lo);
2864 }
2865 else
2866 {
2867 aarch64_emit_move (dst_lo, src_lo);
2868 aarch64_emit_move (dst_hi, src_hi);
2869 }
2870 }
2871
2872 bool
2873 aarch64_split_128bit_move_p (rtx dst, rtx src)
2874 {
2875 return (! REG_P (src)
2876 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2877 }
2878
2879 /* Split a complex SIMD combine. */
2880
2881 void
2882 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2883 {
2884 machine_mode src_mode = GET_MODE (src1);
2885 machine_mode dst_mode = GET_MODE (dst);
2886
2887 gcc_assert (VECTOR_MODE_P (dst_mode));
2888 gcc_assert (register_operand (dst, dst_mode)
2889 && register_operand (src1, src_mode)
2890 && register_operand (src2, src_mode));
2891
2892 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2893 return;
2894 }
2895
2896 /* Split a complex SIMD move. */
2897
2898 void
2899 aarch64_split_simd_move (rtx dst, rtx src)
2900 {
2901 machine_mode src_mode = GET_MODE (src);
2902 machine_mode dst_mode = GET_MODE (dst);
2903
2904 gcc_assert (VECTOR_MODE_P (dst_mode));
2905
2906 if (REG_P (dst) && REG_P (src))
2907 {
2908 gcc_assert (VECTOR_MODE_P (src_mode));
2909 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2910 }
2911 }
2912
2913 bool
2914 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2915 machine_mode ymode, rtx y)
2916 {
2917 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2918 gcc_assert (r != NULL);
2919 return rtx_equal_p (x, r);
2920 }
2921
2922 /* Return TARGET if it is nonnull and a register of mode MODE.
2923 Otherwise, return a fresh register of mode MODE if we can,
2924 or TARGET reinterpreted as MODE if we can't. */
2925
2926 static rtx
2927 aarch64_target_reg (rtx target, machine_mode mode)
2928 {
2929 if (target && REG_P (target) && GET_MODE (target) == mode)
2930 return target;
2931 if (!can_create_pseudo_p ())
2932 {
2933 gcc_assert (target);
2934 return gen_lowpart (mode, target);
2935 }
2936 return gen_reg_rtx (mode);
2937 }
2938
2939 /* Return a register that contains the constant in BUILDER, given that
2940 the constant is a legitimate move operand. Use TARGET as the register
2941 if it is nonnull and convenient. */
2942
2943 static rtx
2944 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2945 {
2946 rtx src = builder.build ();
2947 target = aarch64_target_reg (target, GET_MODE (src));
2948 emit_insn (gen_rtx_SET (target, src));
2949 return target;
2950 }
2951
2952 static rtx
2953 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2954 {
2955 if (can_create_pseudo_p ())
2956 return force_reg (mode, value);
2957 else
2958 {
2959 gcc_assert (x);
2960 aarch64_emit_move (x, value);
2961 return x;
2962 }
2963 }
2964
2965 /* Return true if predicate value X is a constant in which every element
2966 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2967 value, i.e. as a predicate in which all bits are significant. */
2968
2969 static bool
2970 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2971 {
2972 if (GET_CODE (x) != CONST_VECTOR)
2973 return false;
2974
2975 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2976 GET_MODE_NUNITS (GET_MODE (x)));
2977 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2978 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2979 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2980
2981 unsigned int nelts = const_vector_encoded_nelts (x);
2982 for (unsigned int i = 0; i < nelts; ++i)
2983 {
2984 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2985 if (!CONST_INT_P (elt))
2986 return false;
2987
2988 builder.quick_push (elt);
2989 for (unsigned int j = 1; j < factor; ++j)
2990 builder.quick_push (const0_rtx);
2991 }
2992 builder.finalize ();
2993 return true;
2994 }
2995
2996 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2997 widest predicate element size it can have (that is, the largest size
2998 for which each element would still be 0 or 1). */
2999
3000 unsigned int
3001 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3002 {
3003 /* Start with the most optimistic assumption: that we only need
3004 one bit per pattern. This is what we will use if only the first
3005 bit in each pattern is ever set. */
3006 unsigned int mask = GET_MODE_SIZE (DImode);
3007 mask |= builder.npatterns ();
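/* MASK accumulates 8 (the widest possible element size in bytes), the
   number of patterns and, in the loop below, the index of every set bit;
   the result (mask & -mask) is then the largest power of two that divides
   all of these values.  */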
3008
3009 /* Look for set bits. */
3010 unsigned int nelts = builder.encoded_nelts ();
3011 for (unsigned int i = 1; i < nelts; ++i)
3012 if (INTVAL (builder.elt (i)) != 0)
3013 {
3014 if (i & 1)
3015 return 1;
3016 mask |= i;
3017 }
3018 return mask & -mask;
3019 }
3020
3021 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3022 return that predicate mode, otherwise return opt_machine_mode (). */
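/* For example, the { 1, 0, 0, 0, ... } constant built by
   aarch64_ptrue_all (4) maps to VNx4BImode.  */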
3023
3024 opt_machine_mode
3025 aarch64_ptrue_all_mode (rtx x)
3026 {
3027 gcc_assert (GET_MODE (x) == VNx16BImode);
3028 if (GET_CODE (x) != CONST_VECTOR
3029 || !CONST_VECTOR_DUPLICATE_P (x)
3030 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3031 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3032 return opt_machine_mode ();
3033
3034 unsigned int nelts = const_vector_encoded_nelts (x);
3035 for (unsigned int i = 1; i < nelts; ++i)
3036 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3037 return opt_machine_mode ();
3038
3039 return aarch64_sve_pred_mode (nelts);
3040 }
3041
3042 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3043 that the constant would have with predicate element size ELT_SIZE
3044 (ignoring the upper bits in each element) and return:
3045
3046 * -1 if all bits are set
3047 * N if the predicate has N leading set bits followed by all clear bits
3048 * 0 if the predicate does not have any of these forms. */
3049
3050 int
3051 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3052 unsigned int elt_size)
3053 {
3054 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3055 followed by set bits. */
3056 if (builder.nelts_per_pattern () == 3)
3057 return 0;
3058
3059 /* Skip over leading set bits. */
3060 unsigned int nelts = builder.encoded_nelts ();
3061 unsigned int i = 0;
3062 for (; i < nelts; i += elt_size)
3063 if (INTVAL (builder.elt (i)) == 0)
3064 break;
3065 unsigned int vl = i / elt_size;
3066
3067 /* Check for the all-true case. */
3068 if (i == nelts)
3069 return -1;
3070
3071 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3072 repeating pattern of set bits followed by clear bits. */
3073 if (builder.nelts_per_pattern () != 2)
3074 return 0;
3075
3076 /* We have a "foreground" value and a duplicated "background" value.
3077 If the background might repeat and the last set bit belongs to it,
3078 we might have set bits followed by clear bits followed by set bits. */
3079 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3080 return 0;
3081
3082 /* Make sure that the rest are all clear. */
3083 for (; i < nelts; i += elt_size)
3084 if (INTVAL (builder.elt (i)) != 0)
3085 return 0;
3086
3087 return vl;
3088 }
3089
3090 /* See if there is an svpattern that encodes an SVE predicate of mode
3091 PRED_MODE in which the first VL bits are set and the rest are clear.
3092 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3093 A VL of -1 indicates an all-true vector. */
3094
3095 aarch64_svpattern
3096 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3097 {
3098 if (vl < 0)
3099 return AARCH64_SV_ALL;
3100
3101 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3102 return AARCH64_NUM_SVPATTERNS;
3103
3104 if (vl >= 1 && vl <= 8)
3105 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3106
3107 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3108 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3109
3110 int max_vl;
3111 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3112 {
3113 if (vl == (max_vl / 3) * 3)
3114 return AARCH64_SV_MUL3;
3115 /* These would only trigger for non-power-of-2 lengths. */
3116 if (vl == (max_vl & -4))
3117 return AARCH64_SV_MUL4;
3118 if (vl == (1 << floor_log2 (max_vl)))
3119 return AARCH64_SV_POW2;
3120 if (vl == max_vl)
3121 return AARCH64_SV_ALL;
3122 }
3123 return AARCH64_NUM_SVPATTERNS;
3124 }
3125
3126 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3127 bits has the lowest bit set and the upper bits clear. This is the
3128 VNx16BImode equivalent of a PTRUE for controlling elements of
3129 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3130 all bits are significant, even the upper zeros. */
3131
3132 rtx
3133 aarch64_ptrue_all (unsigned int elt_size)
3134 {
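/* For example, an ELT_SIZE of 2 builds the repeating constant
   { 1, 0, 1, 0, ... }, i.e. the VNx16BI form of a PTRUE for .H elements.  */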
3135 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3136 builder.quick_push (const1_rtx);
3137 for (unsigned int i = 1; i < elt_size; ++i)
3138 builder.quick_push (const0_rtx);
3139 return builder.build ();
3140 }
3141
3142 /* Return an all-true predicate register of mode MODE. */
3143
3144 rtx
3145 aarch64_ptrue_reg (machine_mode mode)
3146 {
3147 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3148 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3149 return gen_lowpart (mode, reg);
3150 }
3151
3152 /* Return an all-false predicate register of mode MODE. */
3153
3154 rtx
3155 aarch64_pfalse_reg (machine_mode mode)
3156 {
3157 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3158 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3159 return gen_lowpart (mode, reg);
3160 }
3161
3162 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3163 true, or alternatively if we know that the operation predicated by
3164 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
3165 aarch64_sve_gp_strictness operand that describes the operation
3166 predicated by PRED1[0]. */
3167
3168 bool
3169 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
3170 {
3171 machine_mode mode = GET_MODE (pred2);
3172 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3173 && mode == GET_MODE (pred1[0])
3174 && aarch64_sve_gp_strictness (pred1[1], SImode));
3175 return (pred1[0] == CONSTM1_RTX (mode)
3176 || INTVAL (pred1[1]) == SVE_RELAXED_GP
3177 || rtx_equal_p (pred1[0], pred2));
3178 }
3179
3180 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3181 for it. PRED2[0] is the predicate for the instruction whose result
3182 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3183 for it. Return true if we can prove that the two predicates are
3184 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3185 with PRED1[0] without changing behavior. */
3186
3187 bool
3188 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3189 {
3190 machine_mode mode = GET_MODE (pred1[0]);
3191 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3192 && mode == GET_MODE (pred2[0])
3193 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3194 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3195
3196 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3197 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3198 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3199 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3200 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3201 }
3202
3203 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3204 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3205 Use TARGET as the target register if nonnull and convenient. */
3206
3207 static rtx
3208 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3209 machine_mode data_mode, rtx op1, rtx op2)
3210 {
3211 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3212 expand_operand ops[5];
3213 create_output_operand (&ops[0], target, pred_mode);
3214 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3215 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3216 create_input_operand (&ops[3], op1, data_mode);
3217 create_input_operand (&ops[4], op2, data_mode);
3218 expand_insn (icode, 5, ops);
3219 return ops[0].value;
3220 }
3221
3222 /* Use a comparison to convert integer vector SRC into MODE, which is
3223 the corresponding SVE predicate mode. Use TARGET for the result
3224 if it's nonnull and convenient. */
3225
3226 rtx
3227 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3228 {
3229 machine_mode src_mode = GET_MODE (src);
3230 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3231 src, CONST0_RTX (src_mode));
3232 }
3233
3234 /* Return the assembly token for svprfop value PRFOP. */
3235
3236 static const char *
3237 svprfop_token (enum aarch64_svprfop prfop)
3238 {
3239 switch (prfop)
3240 {
3241 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3242 AARCH64_FOR_SVPRFOP (CASE)
3243 #undef CASE
3244 case AARCH64_NUM_SVPRFOPS:
3245 break;
3246 }
3247 gcc_unreachable ();
3248 }
3249
3250 /* Return the assembly string for an SVE prefetch operation with
3251 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3252 and that SUFFIX is the format for the remaining operands. */
3253
3254 char *
3255 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3256 const char *suffix)
3257 {
3258 static char buffer[128];
3259 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3260 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3261 mnemonic, svprfop_token (prfop), suffix);
3262 gcc_assert (written < sizeof (buffer));
3263 return buffer;
3264 }
3265
3266 /* Check whether we can calculate the number of elements in PATTERN
3267 at compile time, given that there are NELTS_PER_VQ elements per
3268 128-bit block. Return the value if so, otherwise return -1. */
3269
3270 HOST_WIDE_INT
3271 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3272 {
3273 unsigned int vl, const_vg;
3274 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3275 vl = 1 + (pattern - AARCH64_SV_VL1);
3276 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3277 vl = 16 << (pattern - AARCH64_SV_VL16);
3278 else if (aarch64_sve_vg.is_constant (&const_vg))
3279 {
3280 /* There are two vector granules per quadword. */
3281 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
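/* For example, 256-bit vectors give const_vg == 4, so with
   nelts_per_vq == 4 (.S elements) there are 8 elements in total.  */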
3282 switch (pattern)
3283 {
3284 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3285 case AARCH64_SV_MUL4: return nelts & -4;
3286 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3287 case AARCH64_SV_ALL: return nelts;
3288 default: gcc_unreachable ();
3289 }
3290 }
3291 else
3292 return -1;
3293
3294 /* There are two vector granules per quadword. */
3295 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3296 if (known_le (vl, nelts_all))
3297 return vl;
3298
3299 /* Requesting more elements than are available results in a PFALSE. */
3300 if (known_gt (vl, nelts_all))
3301 return 0;
3302
3303 return -1;
3304 }
3305
3306 /* Return true if we can move VALUE into a register using a single
3307 CNT[BHWD] instruction. */
3308
3309 static bool
3310 aarch64_sve_cnt_immediate_p (poly_int64 value)
3311 {
3312 HOST_WIDE_INT factor = value.coeffs[0];
3313 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
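/* For example, a coefficient of 6 can be loaded with "cntd x0, all, mul #3",
   whereas 34 (2 * 17) is rejected because the multiplier would exceed 16.  */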
3314 return (value.coeffs[1] == factor
3315 && IN_RANGE (factor, 2, 16 * 16)
3316 && (factor & 1) == 0
3317 && factor <= 16 * (factor & -factor));
3318 }
3319
3320 /* Likewise for rtx X. */
3321
3322 bool
3323 aarch64_sve_cnt_immediate_p (rtx x)
3324 {
3325 poly_int64 value;
3326 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3327 }
3328
3329 /* Return the asm string for an instruction with a CNT-like vector size
3330 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3331 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3332 first part of the operands template (the part that comes before the
3333 vector size itself). PATTERN is the pattern to use. FACTOR is the
3334 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3335 in each quadword. If it is zero, we can use any element size. */
3336
3337 static char *
3338 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3339 aarch64_svpattern pattern,
3340 unsigned int factor,
3341 unsigned int nelts_per_vq)
3342 {
3343 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3344
3345 if (nelts_per_vq == 0)
3346 /* There is some overlap in the ranges of the four CNT instructions.
3347 Here we always use the smallest possible element size, so that the
3348 multiplier is 1 wherever possible. */
3349 nelts_per_vq = factor & -factor;
3350 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3351 gcc_assert (IN_RANGE (shift, 1, 4));
3352 char suffix = "dwhb"[shift - 1];
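/* NELTS_PER_VQ values of 2, 4, 8 and 16 select the D, W, H and B forms
   respectively.  */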
3353
3354 factor >>= shift;
3355 unsigned int written;
3356 if (pattern == AARCH64_SV_ALL && factor == 1)
3357 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3358 prefix, suffix, operands);
3359 else if (factor == 1)
3360 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3361 prefix, suffix, operands, svpattern_token (pattern));
3362 else
3363 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3364 prefix, suffix, operands, svpattern_token (pattern),
3365 factor);
3366 gcc_assert (written < sizeof (buffer));
3367 return buffer;
3368 }
3369
3370 /* Return the asm string for an instruction with a CNT-like vector size
3371 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3372 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3373 first part of the operands template (the part that comes before the
3374 vector size itself). X is the value of the vector size operand,
3375 as a polynomial integer rtx; we need to convert this into an "all"
3376 pattern with a multiplier. */
3377
3378 char *
3379 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3380 rtx x)
3381 {
3382 poly_int64 value = rtx_to_poly_int64 (x);
3383 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3384 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3385 value.coeffs[1], 0);
3386 }
3387
3388 /* Return the asm string for an instruction with a CNT-like vector size
3389 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3390 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3391 first part of the operands template (the part that comes before the
3392 vector size itself). CNT_PAT[0..2] are the operands of the
3393 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3394
3395 char *
3396 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3397 const char *operands, rtx *cnt_pat)
3398 {
3399 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3400 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3401 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3402 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3403 factor, nelts_per_vq);
3404 }
3405
3406 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3407
3408 bool
3409 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3410 {
3411 poly_int64 value;
3412 return (poly_int_rtx_p (x, &value)
3413 && (aarch64_sve_cnt_immediate_p (value)
3414 || aarch64_sve_cnt_immediate_p (-value)));
3415 }
3416
3417 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3418 operand 0. */
3419
3420 char *
3421 aarch64_output_sve_scalar_inc_dec (rtx offset)
3422 {
3423 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3424 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3425 if (offset_value.coeffs[1] > 0)
3426 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3427 offset_value.coeffs[1], 0);
3428 else
3429 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3430 -offset_value.coeffs[1], 0);
3431 }
3432
3433 /* Return true if we can add VALUE to a register using a single ADDVL
3434 or ADDPL instruction. */
3435
3436 static bool
3437 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3438 {
3439 HOST_WIDE_INT factor = value.coeffs[0];
3440 if (factor == 0 || value.coeffs[1] != factor)
3441 return false;
3442 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3443 and a value of 16 is one vector width. */
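/* For example, a factor of 6 corresponds to ADDPL #3, while a factor
   of 48 corresponds to ADDVL #3.  */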
3444 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3445 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3446 }
3447
3448 /* Likewise for rtx X. */
3449
3450 bool
3451 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3452 {
3453 poly_int64 value;
3454 return (poly_int_rtx_p (x, &value)
3455 && aarch64_sve_addvl_addpl_immediate_p (value));
3456 }
3457
3458 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3459 to operand 1 and storing the result in operand 0. */
3460
3461 char *
3462 aarch64_output_sve_addvl_addpl (rtx offset)
3463 {
3464 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3465 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3466 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3467
3468 int factor = offset_value.coeffs[1];
3469 if ((factor & 15) == 0)
3470 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3471 else
3472 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3473 return buffer;
3474 }
3475
3476 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3477 instruction. If it is, store the number of elements in each vector
3478 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3479 factor in *FACTOR_OUT (if nonnull). */
3480
3481 bool
3482 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3483 unsigned int *nelts_per_vq_out)
3484 {
3485 rtx elt;
3486 poly_int64 value;
3487
3488 if (!const_vec_duplicate_p (x, &elt)
3489 || !poly_int_rtx_p (elt, &value))
3490 return false;
3491
3492 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3493 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3494 /* There's no vector INCB. */
3495 return false;
3496
3497 HOST_WIDE_INT factor = value.coeffs[0];
3498 if (value.coeffs[1] != factor)
3499 return false;
3500
3501 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
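/* For example, with nelts_per_vq == 4 (.S elements) the valid factors
   are the multiples of 4 from 4 to 64, negated or not.  */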
3502 if ((factor % nelts_per_vq) != 0
3503 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3504 return false;
3505
3506 if (factor_out)
3507 *factor_out = factor;
3508 if (nelts_per_vq_out)
3509 *nelts_per_vq_out = nelts_per_vq;
3510 return true;
3511 }
3512
3513 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3514 instruction. */
3515
3516 bool
3517 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3518 {
3519 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3520 }
3521
3522 /* Return the asm template for an SVE vector INC or DEC instruction.
3523 OPERANDS gives the operands before the vector count and X is the
3524 value of the vector count operand itself. */
3525
3526 char *
3527 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3528 {
3529 int factor;
3530 unsigned int nelts_per_vq;
3531 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3532 gcc_unreachable ();
3533 if (factor < 0)
3534 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3535 -factor, nelts_per_vq);
3536 else
3537 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3538 factor, nelts_per_vq);
3539 }
3540
3541 static int
3542 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3543 scalar_int_mode mode)
3544 {
3545 int i;
3546 unsigned HOST_WIDE_INT val, val2, mask;
3547 int one_match, zero_match;
3548 int num_insns;
3549
3550 val = INTVAL (imm);
3551
3552 if (aarch64_move_imm (val, mode))
3553 {
3554 if (generate)
3555 emit_insn (gen_rtx_SET (dest, imm));
3556 return 1;
3557 }
3558
3559 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3560 (with XXXX non-zero). In that case check to see if the move can be done in
3561 a smaller mode. */
3562 val2 = val & 0xffffffff;
3563 if (mode == DImode
3564 && aarch64_move_imm (val2, SImode)
3565 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3566 {
3567 if (generate)
3568 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3569
3570 /* Check if we have to emit a second instruction by checking to see
3571 if any of the upper 32 bits of the original DI mode value is set. */
3572 if (val == val2)
3573 return 1;
3574
3575 i = (val >> 48) ? 48 : 32;
3576
3577 if (generate)
3578 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3579 GEN_INT ((val >> i) & 0xffff)));
3580
3581 return 2;
3582 }
3583
3584 if ((val >> 32) == 0 || mode == SImode)
3585 {
3586 if (generate)
3587 {
3588 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3589 if (mode == SImode)
3590 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3591 GEN_INT ((val >> 16) & 0xffff)));
3592 else
3593 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3594 GEN_INT ((val >> 16) & 0xffff)));
3595 }
3596 return 2;
3597 }
3598
3599 /* Remaining cases are all for DImode. */
3600
3601 mask = 0xffff;
3602 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3603 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3604 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3605 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3606
3607 if (zero_match != 2 && one_match != 2)
3608 {
3609 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3610 For a 64-bit bitmask try whether changing 16 bits to all ones or
3611 zeroes creates a valid bitmask. To check any repeated bitmask,
3612 try using 16 bits from the other 32-bit half of val. */
3613
3614 for (i = 0; i < 64; i += 16, mask <<= 16)
3615 {
3616 val2 = val & ~mask;
3617 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3618 break;
3619 val2 = val | mask;
3620 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3621 break;
3622 val2 = val2 & ~mask;
3623 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3624 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3625 break;
3626 }
3627 if (i != 64)
3628 {
3629 if (generate)
3630 {
3631 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3632 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3633 GEN_INT ((val >> i) & 0xffff)));
3634 }
3635 return 2;
3636 }
3637 }
3638
3639 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3640 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3641 otherwise skip zero bits. */
3642
3643 num_insns = 1;
3644 mask = 0xffff;
3645 val2 = one_match > zero_match ? ~val : val;
3646 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3647
3648 if (generate)
3649 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3650 ? (val | ~(mask << i))
3651 : (val & (mask << i)))));
3652 for (i += 16; i < 64; i += 16)
3653 {
3654 if ((val2 & (mask << i)) == 0)
3655 continue;
3656 if (generate)
3657 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3658 GEN_INT ((val >> i) & 0xffff)));
3659 num_insns++;
3660 }
3661
3662 return num_insns;
3663 }
3664
3665 /* Return whether imm is a 128-bit immediate which is simple enough to
3666 expand inline. */
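/* That is, the two 64-bit halves must together need no more than four
   MOV/MOVK/MOVN/bitmask instructions according to
   aarch64_internal_mov_immediate.  */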
3667 bool
3668 aarch64_mov128_immediate (rtx imm)
3669 {
3670 if (GET_CODE (imm) == CONST_INT)
3671 return true;
3672
3673 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3674
3675 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3676 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3677
3678 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3679 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3680 }
3681
3682
3683 /* Return the number of temporary registers that aarch64_add_offset_1
3684 would need to add OFFSET to a register. */
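/* For instance, an offset of 0x123000 can be added with ADD/SUB
   immediates alone and needs no temporary, while 0x1234567 exceeds the
   24-bit limit and needs one temporary to hold a MOV/MOVK immediate.  */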
3685
3686 static unsigned int
3687 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3688 {
3689 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3690 }
3691
3692 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3693 a non-polynomial OFFSET. MODE is the mode of the addition.
3694 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3695 be set and CFA adjustments added to the generated instructions.
3696
3697 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3698 temporary if register allocation is already complete. This temporary
3699 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3700 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3701 the immediate again.
3702
3703 Since this function may be used to adjust the stack pointer, we must
3704 ensure that it cannot cause transient stack deallocation (for example
3705 by first incrementing SP and then decrementing when adjusting by a
3706 large immediate). */
3707
3708 static void
3709 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3710 rtx src, HOST_WIDE_INT offset, rtx temp1,
3711 bool frame_related_p, bool emit_move_imm)
3712 {
3713 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3714 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3715
3716 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
3717 rtx_insn *insn;
3718
3719 if (!moffset)
3720 {
3721 if (!rtx_equal_p (dest, src))
3722 {
3723 insn = emit_insn (gen_rtx_SET (dest, src));
3724 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3725 }
3726 return;
3727 }
3728
3729 /* Single instruction adjustment. */
3730 if (aarch64_uimm12_shift (moffset))
3731 {
3732 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3733 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3734 return;
3735 }
3736
3737 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3738 and either:
3739
3740 a) the offset cannot be loaded by a 16-bit move or
3741 b) there is no spare register into which we can move it. */
3742 if (moffset < 0x1000000
3743 && ((!temp1 && !can_create_pseudo_p ())
3744 || !aarch64_move_imm (moffset, mode)))
3745 {
3746 HOST_WIDE_INT low_off = moffset & 0xfff;
3747
3748 low_off = offset < 0 ? -low_off : low_off;
3749 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3750 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3751 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3752 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3753 return;
3754 }
3755
3756 /* Emit a move immediate if required and an addition/subtraction. */
3757 if (emit_move_imm)
3758 {
3759 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3760 temp1 = aarch64_force_temporary (mode, temp1,
3761 gen_int_mode (moffset, mode));
3762 }
3763 insn = emit_insn (offset < 0
3764 ? gen_sub3_insn (dest, src, temp1)
3765 : gen_add3_insn (dest, src, temp1));
3766 if (frame_related_p)
3767 {
3768 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3769 rtx adj = plus_constant (mode, src, offset);
3770 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3771 }
3772 }
3773
3774 /* Return the number of temporary registers that aarch64_add_offset
3775 would need to move OFFSET into a register or add OFFSET to a register;
3776 ADD_P is true if we want the latter rather than the former. */
3777
3778 static unsigned int
3779 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3780 {
3781 /* This follows the same structure as aarch64_add_offset. */
3782 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3783 return 0;
3784
3785 unsigned int count = 0;
3786 HOST_WIDE_INT factor = offset.coeffs[1];
3787 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3788 poly_int64 poly_offset (factor, factor);
3789 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3790 /* Need one register for the ADDVL/ADDPL result. */
3791 count += 1;
3792 else if (factor != 0)
3793 {
3794 factor = abs (factor);
3795 if (factor > 16 * (factor & -factor))
3796 /* Need one register for the CNT result and one for the multiplication
3797 factor. If necessary, the second temporary can be reused for the
3798 constant part of the offset. */
3799 return 2;
3800 /* Need one register for the CNT result (which might then
3801 be shifted). */
3802 count += 1;
3803 }
3804 return count + aarch64_add_offset_1_temporaries (constant);
3805 }
3806
3807 /* If X can be represented as a poly_int64, return the number
3808 of temporaries that are required to add it to a register.
3809 Return -1 otherwise. */
3810
3811 int
3812 aarch64_add_offset_temporaries (rtx x)
3813 {
3814 poly_int64 offset;
3815 if (!poly_int_rtx_p (x, &offset))
3816 return -1;
3817 return aarch64_offset_temporaries (true, offset);
3818 }
3819
3820 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3821 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3822 be set and CFA adjustments added to the generated instructions.
3823
3824 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3825 temporary if register allocation is already complete. This temporary
3826 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3827 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3828 false to avoid emitting the immediate again.
3829
3830 TEMP2, if nonnull, is a second temporary register that doesn't
3831 overlap either DEST or SRC.
3832
3833 Since this function may be used to adjust the stack pointer, we must
3834 ensure that it cannot cause transient stack deallocation (for example
3835 by first incrementing SP and then decrementing when adjusting by a
3836 large immediate). */
3837
3838 static void
3839 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3840 poly_int64 offset, rtx temp1, rtx temp2,
3841 bool frame_related_p, bool emit_move_imm = true)
3842 {
3843 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3844 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3845 gcc_assert (temp1 == NULL_RTX
3846 || !frame_related_p
3847 || !reg_overlap_mentioned_p (temp1, dest));
3848 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3849
3850 /* Try using ADDVL or ADDPL to add the whole value. */
3851 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3852 {
3853 rtx offset_rtx = gen_int_mode (offset, mode);
3854 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3855 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3856 return;
3857 }
3858
3859 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3860 SVE vector register, over and above the minimum size of 128 bits.
3861 This is equivalent to half the value returned by CNTD with a
3862 vector shape of ALL. */
3863 HOST_WIDE_INT factor = offset.coeffs[1];
3864 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
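/* I.e. the runtime offset equals CONSTANT + FACTOR * <number of 128-bit
   blocks in a vector>, which the code below materializes as
   CONSTANT + CNTD * FACTOR / 2.  */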
3865
3866 /* Try using ADDVL or ADDPL to add the VG-based part. */
3867 poly_int64 poly_offset (factor, factor);
3868 if (src != const0_rtx
3869 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3870 {
3871 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3872 if (frame_related_p)
3873 {
3874 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3875 RTX_FRAME_RELATED_P (insn) = true;
3876 src = dest;
3877 }
3878 else
3879 {
3880 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3881 src = aarch64_force_temporary (mode, temp1, addr);
3882 temp1 = temp2;
3883 temp2 = NULL_RTX;
3884 }
3885 }
3886 /* Otherwise use a CNT-based sequence. */
3887 else if (factor != 0)
3888 {
3889 /* Use a subtraction if we have a negative factor. */
3890 rtx_code code = PLUS;
3891 if (factor < 0)
3892 {
3893 factor = -factor;
3894 code = MINUS;
3895 }
3896
3897 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3898 into the multiplication. */
3899 rtx val;
3900 int shift = 0;
3901 if (factor & 1)
3902 /* Use a right shift by 1. */
3903 shift = -1;
3904 else
3905 factor /= 2;
3906 HOST_WIDE_INT low_bit = factor & -factor;
3907 if (factor <= 16 * low_bit)
3908 {
3909 if (factor > 16 * 8)
3910 {
3911 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3912 the value with the minimum multiplier and shift it into
3913 position. */
3914 int extra_shift = exact_log2 (low_bit);
3915 shift += extra_shift;
3916 factor >>= extra_shift;
3917 }
3918 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3919 }
3920 else
3921 {
3922 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3923 directly, since that should increase the chances of being
3924 able to use a shift and add sequence. If LOW_BIT itself
3925 is out of range, just use CNTD. */
3926 if (low_bit <= 16 * 8)
3927 factor /= low_bit;
3928 else
3929 low_bit = 1;
3930
3931 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3932 val = aarch64_force_temporary (mode, temp1, val);
3933
3934 if (can_create_pseudo_p ())
3935 {
3936 rtx coeff1 = gen_int_mode (factor, mode);
3937 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3938 }
3939 else
3940 {
3941 /* Go back to using a negative multiplication factor if we have
3942 no register from which to subtract. */
3943 if (code == MINUS && src == const0_rtx)
3944 {
3945 factor = -factor;
3946 code = PLUS;
3947 }
3948 rtx coeff1 = gen_int_mode (factor, mode);
3949 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3950 val = gen_rtx_MULT (mode, val, coeff1);
3951 }
3952 }
3953
3954 if (shift > 0)
3955 {
3956 /* Multiply by 1 << SHIFT. */
3957 val = aarch64_force_temporary (mode, temp1, val);
3958 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3959 }
3960 else if (shift == -1)
3961 {
3962 /* Divide by 2. */
3963 val = aarch64_force_temporary (mode, temp1, val);
3964 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3965 }
3966
3967 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3968 if (src != const0_rtx)
3969 {
3970 val = aarch64_force_temporary (mode, temp1, val);
3971 val = gen_rtx_fmt_ee (code, mode, src, val);
3972 }
3973 else if (code == MINUS)
3974 {
3975 val = aarch64_force_temporary (mode, temp1, val);
3976 val = gen_rtx_NEG (mode, val);
3977 }
3978
3979 if (constant == 0 || frame_related_p)
3980 {
3981 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3982 if (frame_related_p)
3983 {
3984 RTX_FRAME_RELATED_P (insn) = true;
3985 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3986 gen_rtx_SET (dest, plus_constant (Pmode, src,
3987 poly_offset)));
3988 }
3989 src = dest;
3990 if (constant == 0)
3991 return;
3992 }
3993 else
3994 {
3995 src = aarch64_force_temporary (mode, temp1, val);
3996 temp1 = temp2;
3997 temp2 = NULL_RTX;
3998 }
3999
4000 emit_move_imm = true;
4001 }
4002
4003 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4004 frame_related_p, emit_move_imm);
4005 }
4006
4007 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4008 than a poly_int64. */
4009
4010 void
4011 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4012 rtx offset_rtx, rtx temp1, rtx temp2)
4013 {
4014 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4015 temp1, temp2, false);
4016 }
4017
4018 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4019 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4020 if TEMP1 already contains abs (DELTA). */
4021
4022 static inline void
4023 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
4024 {
4025 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4026 temp1, temp2, true, emit_move_imm);
4027 }
4028
4029 /* Subtract DELTA from the stack pointer, marking the instructions
4030 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4031 if nonnull. */
4032
4033 static inline void
4034 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4035 bool emit_move_imm = true)
4036 {
4037 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4038 temp1, temp2, frame_related_p, emit_move_imm);
4039 }
4040
4041 /* Set DEST to (vec_series BASE STEP). */
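/* For example, (vec_series 0 1) in VNx4SImode can be emitted as a single
   INDEX instruction (of the form "index\tz0.s, #0, #1"), since both
   operands are within the immediate range [-16, 15]; out-of-range bases
   or steps are first forced into scalar registers and the register forms
   of INDEX are used instead.  */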
4042
4043 static void
4044 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
4045 {
4046 machine_mode mode = GET_MODE (dest);
4047 scalar_mode inner = GET_MODE_INNER (mode);
4048
4049 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4050 if (!aarch64_sve_index_immediate_p (base))
4051 base = force_reg (inner, base);
4052 if (!aarch64_sve_index_immediate_p (step))
4053 step = force_reg (inner, step);
4054
4055 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4056 }
4057
4058 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4059 register of mode MODE. Use TARGET for the result if it's nonnull
4060 and convenient.
4061
4062 The two vector modes must have the same element mode. The behavior
4063 is to duplicate architectural lane N of SRC into architectural lanes
4064 N + I * STEP of the result. On big-endian targets, architectural
4065 lane 0 of an Advanced SIMD vector is the last element of the vector
4066 in memory layout, so for big-endian targets this operation has the
4067 effect of reversing SRC before duplicating it. Callers need to
4068 account for this. */
4069
4070 rtx
4071 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4072 {
4073 machine_mode src_mode = GET_MODE (src);
4074 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4075 insn_code icode = (BYTES_BIG_ENDIAN
4076 ? code_for_aarch64_vec_duplicate_vq_be (mode)
4077 : code_for_aarch64_vec_duplicate_vq_le (mode));
4078
4079 unsigned int i = 0;
4080 expand_operand ops[3];
4081 create_output_operand (&ops[i++], target, mode);
4082 create_output_operand (&ops[i++], src, src_mode);
4083 if (BYTES_BIG_ENDIAN)
4084 {
4085 /* Create a PARALLEL describing the reversal of SRC. */
4086 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4087 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4088 nelts_per_vq - 1, -1);
4089 create_fixed_operand (&ops[i++], sel);
4090 }
4091 expand_insn (icode, i, ops);
4092 return ops[0].value;
4093 }
4094
4095 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4096 the memory image into DEST. Return true on success. */
4097
4098 static bool
4099 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4100 {
4101 src = force_const_mem (GET_MODE (src), src);
4102 if (!src)
4103 return false;
4104
4105 /* Make sure that the address is legitimate. */
4106 if (!aarch64_sve_ld1rq_operand_p (src))
4107 {
4108 rtx addr = force_reg (Pmode, XEXP (src, 0));
4109 src = replace_equiv_address (src, addr);
4110 }
4111
4112 machine_mode mode = GET_MODE (dest);
4113 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4114 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4115 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4116 return true;
4117 }
4118
4119 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4120 SVE data mode and isn't a legitimate constant. Use TARGET for the
4121 result if convenient.
4122
4123 The returned register can have whatever mode seems most natural
4124 given the contents of SRC. */
4125
4126 static rtx
4127 aarch64_expand_sve_const_vector (rtx target, rtx src)
4128 {
4129 machine_mode mode = GET_MODE (src);
4130 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4131 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4132 scalar_mode elt_mode = GET_MODE_INNER (mode);
4133 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4134 unsigned int container_bits = aarch64_sve_container_bits (mode);
4135 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4136
4137 if (nelts_per_pattern == 1
4138 && encoded_bits <= 128
4139 && container_bits != elt_bits)
4140 {
4141 /* We have a partial vector mode and a constant whose full-vector
4142 equivalent would occupy a repeating 128-bit sequence. Build that
4143 full-vector equivalent instead, so that we have the option of
4144 using LD1RQ and Advanced SIMD operations. */
4145 unsigned int repeat = container_bits / elt_bits;
4146 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4147 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4148 for (unsigned int i = 0; i < npatterns; ++i)
4149 for (unsigned int j = 0; j < repeat; ++j)
4150 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4151 target = aarch64_target_reg (target, full_mode);
4152 return aarch64_expand_sve_const_vector (target, builder.build ());
4153 }
4154
4155 if (nelts_per_pattern == 1 && encoded_bits == 128)
4156 {
4157 /* The constant is a duplicated quadword but can't be narrowed
4158 beyond a quadword. Get the memory image of the first quadword
4159 as a 128-bit vector and try using LD1RQ to load it from memory.
4160
4161 The effect for both endiannesses is to load memory lane N into
4162 architectural lanes N + I * STEP of the result. On big-endian
4163 targets, the layout of the 128-bit vector in an Advanced SIMD
4164 register would be different from its layout in an SVE register,
4165 but this 128-bit vector is a memory value only. */
4166 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4167 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4168 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4169 return target;
4170 }
4171
4172 if (nelts_per_pattern == 1 && encoded_bits < 128)
4173 {
4174 /* The vector is a repeating sequence of 64 bits or fewer.
4175 See if we can load them using an Advanced SIMD move and then
4176 duplicate it to fill a vector. This is better than using a GPR
4177 move because it keeps everything in the same register file. */
4178 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4179 rtx_vector_builder builder (vq_mode, npatterns, 1);
4180 for (unsigned int i = 0; i < npatterns; ++i)
4181 {
4182 /* We want memory lane N to go into architectural lane N,
4183 so reverse for big-endian targets. The DUP .Q pattern
4184 has a compensating reverse built-in. */
4185 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4186 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4187 }
4188 rtx vq_src = builder.build ();
4189 if (aarch64_simd_valid_immediate (vq_src, NULL))
4190 {
4191 vq_src = force_reg (vq_mode, vq_src);
4192 return aarch64_expand_sve_dupq (target, mode, vq_src);
4193 }
4194
4195 /* Get an integer representation of the repeating part of Advanced
4196 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4197 which for big-endian targets is lane-swapped wrt a normal
4198 Advanced SIMD vector. This means that for both endiannesses,
4199 memory lane N of SVE vector SRC corresponds to architectural
4200 lane N of a register holding VQ_SRC. This in turn means that
4201 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4202 as a single 128-bit value) and thus that memory lane 0 of SRC is
4203 in the lsb of the integer. Duplicating the integer therefore
4204 ensures that memory lane N of SRC goes into architectural lane
4205 N + I * INDEX of the SVE register. */
4206 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4207 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4208 if (elt_value)
4209 {
4210 /* Pretend that we had a vector of INT_MODE to start with. */
4211 elt_mode = int_mode;
4212 mode = aarch64_full_sve_mode (int_mode).require ();
4213
4214 /* If the integer can be moved into a general register by a
4215 single instruction, do that and duplicate the result. */
4216 if (CONST_INT_P (elt_value)
4217 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4218 {
4219 elt_value = force_reg (elt_mode, elt_value);
4220 return expand_vector_broadcast (mode, elt_value);
4221 }
4222 }
4223 else if (npatterns == 1)
4224 /* We're duplicating a single value, but can't do better than
4225 force it to memory and load from there. This handles things
4226 like symbolic constants. */
4227 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4228
4229 if (elt_value)
4230 {
4231 /* Load the element from memory if we can, otherwise move it into
4232 a register and use a DUP. */
4233 rtx op = force_const_mem (elt_mode, elt_value);
4234 if (!op)
4235 op = force_reg (elt_mode, elt_value);
4236 return expand_vector_broadcast (mode, op);
4237 }
4238 }
4239
4240 /* Try using INDEX. */
4241 rtx base, step;
4242 if (const_vec_series_p (src, &base, &step))
4243 {
4244 aarch64_expand_vec_series (target, base, step);
4245 return target;
4246 }
4247
4248 /* From here on, it's better to force the whole constant to memory
4249 if we can. */
4250 if (GET_MODE_NUNITS (mode).is_constant ())
4251 return NULL_RTX;
4252
4253 /* Expand each pattern individually. */
4254 gcc_assert (npatterns > 1);
4255 rtx_vector_builder builder;
4256 auto_vec<rtx, 16> vectors (npatterns);
4257 for (unsigned int i = 0; i < npatterns; ++i)
4258 {
4259 builder.new_vector (mode, 1, nelts_per_pattern);
4260 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4261 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4262 vectors.quick_push (force_reg (mode, builder.build ()));
4263 }
4264
4265 /* Use permutes to interleave the separate vectors. */
4266 while (npatterns > 1)
4267 {
4268 npatterns /= 2;
4269 for (unsigned int i = 0; i < npatterns; ++i)
4270 {
4271 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4272 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4273 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4274 vectors[i] = tmp;
4275 }
4276 }
4277 gcc_assert (vectors[0] == target);
4278 return target;
4279 }
4280
4281 /* Use WHILE to set a predicate register of mode MODE in which the first
4282 VL bits are set and the rest are clear. Use TARGET for the register
4283 if it's nonnull and convenient. */
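/* For instance, a request for VL == 3 in VNx4BImode emits a WHILELO of
   the form "whilelo\tp0.s, xzr, x1" with x1 holding 3, leaving the first
   three .S predicate lanes set and the rest clear.  */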
4284
4285 static rtx
4286 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4287 unsigned int vl)
4288 {
4289 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4290 target = aarch64_target_reg (target, mode);
4291 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
4292 target, const0_rtx, limit));
4293 return target;
4294 }
4295
4296 static rtx
4297 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4298
4299 /* BUILDER is a constant predicate in which the index of every set bit
4300 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4301 by inverting every element at a multiple of ELT_SIZE and EORing the
4302 result with an ELT_SIZE PTRUE.
4303
4304 Return a register that contains the constant on success, otherwise
4305 return null. Use TARGET as the register if it is nonnull and
4306 convenient. */
4307
4308 static rtx
4309 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4310 unsigned int elt_size)
4311 {
4312 /* Invert every element at a multiple of ELT_SIZE, keeping the
4313 other bits zero. */
4314 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4315 builder.nelts_per_pattern ());
4316 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4317 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4318 inv_builder.quick_push (const1_rtx);
4319 else
4320 inv_builder.quick_push (const0_rtx);
4321 inv_builder.finalize ();
4322
4323 /* See if we can load the constant cheaply. */
4324 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4325 if (!inv)
4326 return NULL_RTX;
4327
4328 /* EOR the result with an ELT_SIZE PTRUE. */
4329 rtx mask = aarch64_ptrue_all (elt_size);
4330 mask = force_reg (VNx16BImode, mask);
4331 target = aarch64_target_reg (target, VNx16BImode);
4332 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4333 return target;
4334 }
4335
4336 /* BUILDER is a constant predicate in which the index of every set bit
4337 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4338 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4339 register on success, otherwise return null. Use TARGET as the register
4340 if nonnull and convenient. */
4341
4342 static rtx
4343 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4344 unsigned int elt_size,
4345 unsigned int permute_size)
4346 {
4347 /* We're going to split the constant into two new constants A and B,
4348 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4349 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4350
4351 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4352 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4353
4354 where _ indicates elements that will be discarded by the permute.
4355
4356 First calculate the ELT_SIZEs for A and B. */
4357 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4358 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4359 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4360 if (INTVAL (builder.elt (i)) != 0)
4361 {
4362 if (i & permute_size)
4363 b_elt_size |= i - permute_size;
4364 else
4365 a_elt_size |= i;
4366 }
4367 a_elt_size &= -a_elt_size;
4368 b_elt_size &= -b_elt_size;
4369
4370 /* Now construct the vectors themselves. */
4371 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
4372 builder.nelts_per_pattern ());
4373 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
4374 builder.nelts_per_pattern ());
4375 unsigned int nelts = builder.encoded_nelts ();
4376 for (unsigned int i = 0; i < nelts; ++i)
4377 if (i & (elt_size - 1))
4378 {
4379 a_builder.quick_push (const0_rtx);
4380 b_builder.quick_push (const0_rtx);
4381 }
4382 else if ((i & permute_size) == 0)
4383 {
4384 /* The A and B elements are significant. */
4385 a_builder.quick_push (builder.elt (i));
4386 b_builder.quick_push (builder.elt (i + permute_size));
4387 }
4388 else
4389 {
4390 /* The A and B elements are going to be discarded, so pick whatever
4391 is likely to give a nice constant. We are targeting element
4392 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4393 with the aim of each being a sequence of ones followed by
4394 a sequence of zeros. So:
4395
4396 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4397 duplicate the last X_ELT_SIZE element, to extend the
4398 current sequence of ones or zeros.
4399
4400 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4401 zero, so that the constant really does have X_ELT_SIZE and
4402 not a smaller size. */
4403 if (a_elt_size > permute_size)
4404 a_builder.quick_push (const0_rtx);
4405 else
4406 a_builder.quick_push (a_builder.elt (i - a_elt_size));
4407 if (b_elt_size > permute_size)
4408 b_builder.quick_push (const0_rtx);
4409 else
4410 b_builder.quick_push (b_builder.elt (i - b_elt_size));
4411 }
4412 a_builder.finalize ();
4413 b_builder.finalize ();
4414
4415 /* Try loading A into a register. */
4416 rtx_insn *last = get_last_insn ();
4417 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4418 if (!a)
4419 return NULL_RTX;
4420
4421 /* Try loading B into a register. */
4422 rtx b = a;
4423 if (a_builder != b_builder)
4424 {
4425 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4426 if (!b)
4427 {
4428 delete_insns_since (last);
4429 return NULL_RTX;
4430 }
4431 }
4432
4433 /* Emit the TRN1 itself. */
4434 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4435 target = aarch64_target_reg (target, mode);
4436 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4437 gen_lowpart (mode, a),
4438 gen_lowpart (mode, b)));
4439 return target;
4440 }
4441
4442 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4443 constant in BUILDER into an SVE predicate register. Return the register
4444 on success, otherwise return null. Use TARGET for the register if
4445 nonnull and convenient.
4446
4447 ALLOW_RECURSE_P is true if we can use methods that would call this
4448 function recursively. */
4449
4450 static rtx
4451 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4452 bool allow_recurse_p)
4453 {
4454 if (builder.encoded_nelts () == 1)
4455 /* A PFALSE or a PTRUE .B ALL. */
4456 return aarch64_emit_set_immediate (target, builder);
4457
4458 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4459 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4460 {
4461 /* If we can load the constant using PTRUE, use it as-is. */
4462 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4463 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4464 return aarch64_emit_set_immediate (target, builder);
4465
4466 /* Otherwise use WHILE to set the first VL bits. */
4467 return aarch64_sve_move_pred_via_while (target, mode, vl);
4468 }
4469
4470 if (!allow_recurse_p)
4471 return NULL_RTX;
4472
4473 /* Try inverting the vector in element size ELT_SIZE and then EORing
4474 the result with an ELT_SIZE PTRUE. */
4475 if (INTVAL (builder.elt (0)) == 0)
4476 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4477 elt_size))
4478 return res;
4479
4480 /* Try using TRN1 to permute two simpler constants. */
4481 for (unsigned int i = elt_size; i <= 8; i *= 2)
4482 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4483 elt_size, i))
4484 return res;
4485
4486 return NULL_RTX;
4487 }
4488
4489 /* Return an SVE predicate register that contains the VNx16BImode
4490 constant in BUILDER, without going through the move expanders.
4491
4492 The returned register can have whatever mode seems most natural
4493 given the contents of BUILDER. Use TARGET for the result if
4494 convenient. */
4495
4496 static rtx
4497 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4498 {
4499 /* Try loading the constant using pure predicate operations. */
4500 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4501 return res;
4502
4503 /* Try forcing the constant to memory. */
4504 if (builder.full_nelts ().is_constant ())
4505 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4506 {
4507 target = aarch64_target_reg (target, VNx16BImode);
4508 emit_move_insn (target, mem);
4509 return target;
4510 }
4511
4512 /* The last resort is to load the constant as an integer and then
4513 compare it against zero. Use -1 for set bits in order to increase
4514 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4515 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4516 builder.nelts_per_pattern ());
4517 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4518 int_builder.quick_push (INTVAL (builder.elt (i))
4519 ? constm1_rtx : const0_rtx);
4520 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4521 int_builder.build ());
4522 }
4523
4524 /* Set DEST to immediate IMM. */
4525
4526 void
4527 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4528 {
4529 machine_mode mode = GET_MODE (dest);
4530
4531 /* Check on what type of symbol it is. */
4532 scalar_int_mode int_mode;
4533 if ((GET_CODE (imm) == SYMBOL_REF
4534 || GET_CODE (imm) == LABEL_REF
4535 || GET_CODE (imm) == CONST
4536 || GET_CODE (imm) == CONST_POLY_INT)
4537 && is_a <scalar_int_mode> (mode, &int_mode))
4538 {
4539 rtx mem;
4540 poly_int64 offset;
4541 HOST_WIDE_INT const_offset;
4542 enum aarch64_symbol_type sty;
4543
4544 /* If we have (const (plus symbol offset)), separate out the offset
4545 before we start classifying the symbol. */
4546 rtx base = strip_offset (imm, &offset);
4547
4548 /* We must always add an offset involving VL separately, rather than
4549 folding it into the relocation. */
4550 if (!offset.is_constant (&const_offset))
4551 {
4552 if (!TARGET_SVE)
4553 {
4554 aarch64_report_sve_required ();
4555 return;
4556 }
4557 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4558 emit_insn (gen_rtx_SET (dest, imm));
4559 else
4560 {
4561 /* Do arithmetic on 32-bit values if the result is smaller
4562 than that. */
4563 if (partial_subreg_p (int_mode, SImode))
4564 {
4565 /* It is invalid to do symbol calculations in modes
4566 narrower than SImode. */
4567 gcc_assert (base == const0_rtx);
4568 dest = gen_lowpart (SImode, dest);
4569 int_mode = SImode;
4570 }
4571 if (base != const0_rtx)
4572 {
4573 base = aarch64_force_temporary (int_mode, dest, base);
4574 aarch64_add_offset (int_mode, dest, base, offset,
4575 NULL_RTX, NULL_RTX, false);
4576 }
4577 else
4578 aarch64_add_offset (int_mode, dest, base, offset,
4579 dest, NULL_RTX, false);
4580 }
4581 return;
4582 }
4583
4584 sty = aarch64_classify_symbol (base, const_offset);
4585 switch (sty)
4586 {
4587 case SYMBOL_FORCE_TO_MEM:
4588 if (const_offset != 0
4589 && targetm.cannot_force_const_mem (int_mode, imm))
4590 {
4591 gcc_assert (can_create_pseudo_p ());
4592 base = aarch64_force_temporary (int_mode, dest, base);
4593 aarch64_add_offset (int_mode, dest, base, const_offset,
4594 NULL_RTX, NULL_RTX, false);
4595 return;
4596 }
4597
4598 mem = force_const_mem (ptr_mode, imm);
4599 gcc_assert (mem);
4600
4601 /* If we aren't generating PC relative literals, then
4602 we need to expand the literal pool access carefully.
4603 This is something that needs to be done in a number
4604 of places, so could well live as a separate function. */
4605 if (!aarch64_pcrelative_literal_loads)
4606 {
4607 gcc_assert (can_create_pseudo_p ());
4608 base = gen_reg_rtx (ptr_mode);
4609 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4610 if (ptr_mode != Pmode)
4611 base = convert_memory_address (Pmode, base);
4612 mem = gen_rtx_MEM (ptr_mode, base);
4613 }
4614
4615 if (int_mode != ptr_mode)
4616 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4617
4618 emit_insn (gen_rtx_SET (dest, mem));
4619
4620 return;
4621
4622 case SYMBOL_SMALL_TLSGD:
4623 case SYMBOL_SMALL_TLSDESC:
4624 case SYMBOL_SMALL_TLSIE:
4625 case SYMBOL_SMALL_GOT_28K:
4626 case SYMBOL_SMALL_GOT_4G:
4627 case SYMBOL_TINY_GOT:
4628 case SYMBOL_TINY_TLSIE:
4629 if (const_offset != 0)
4630 {
4631 gcc_assert (can_create_pseudo_p ());
4632 base = aarch64_force_temporary (int_mode, dest, base);
4633 aarch64_add_offset (int_mode, dest, base, const_offset,
4634 NULL_RTX, NULL_RTX, false);
4635 return;
4636 }
4637 /* FALLTHRU */
4638
4639 case SYMBOL_SMALL_ABSOLUTE:
4640 case SYMBOL_TINY_ABSOLUTE:
4641 case SYMBOL_TLSLE12:
4642 case SYMBOL_TLSLE24:
4643 case SYMBOL_TLSLE32:
4644 case SYMBOL_TLSLE48:
4645 aarch64_load_symref_appropriately (dest, imm, sty);
4646 return;
4647
4648 default:
4649 gcc_unreachable ();
4650 }
4651 }
4652
4653 if (!CONST_INT_P (imm))
4654 {
4655 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4656 {
4657 /* Only the low bit of each .H, .S and .D element is defined,
4658 so we can set the upper bits to whatever we like. If the
4659 predicate is all-true in MODE, prefer to set all the undefined
4660 bits as well, so that we can share a single .B predicate for
4661 all modes. */
4662 if (imm == CONSTM1_RTX (mode))
4663 imm = CONSTM1_RTX (VNx16BImode);
4664
4665 /* All methods for constructing predicate modes wider than VNx16BI
4666 will set the upper bits of each element to zero. Expose this
4667 by moving such constants as a VNx16BI, so that all bits are
4668 significant and so that constants for different modes can be
4669 shared. The wider constant will still be available as a
4670 REG_EQUAL note. */
4671 rtx_vector_builder builder;
4672 if (aarch64_get_sve_pred_bits (builder, imm))
4673 {
4674 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4675 if (dest != res)
4676 emit_move_insn (dest, gen_lowpart (mode, res));
4677 return;
4678 }
4679 }
4680
4681 if (GET_CODE (imm) == HIGH
4682 || aarch64_simd_valid_immediate (imm, NULL))
4683 {
4684 emit_insn (gen_rtx_SET (dest, imm));
4685 return;
4686 }
4687
4688 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4689 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4690 {
4691 if (dest != res)
4692 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4693 return;
4694 }
4695
4696 rtx mem = force_const_mem (mode, imm);
4697 gcc_assert (mem);
4698 emit_move_insn (dest, mem);
4699 return;
4700 }
4701
4702 aarch64_internal_mov_immediate (dest, imm, true,
4703 as_a <scalar_int_mode> (mode));
4704 }
4705
4706 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4707 that is known to contain PTRUE. */
4708
4709 void
4710 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4711 {
4712 expand_operand ops[3];
4713 machine_mode mode = GET_MODE (dest);
4714 create_output_operand (&ops[0], dest, mode);
4715 create_input_operand (&ops[1], pred, GET_MODE (pred));
4716 create_input_operand (&ops[2], src, mode);
4717 temporary_volatile_ok v (true);
4718 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4719 }
4720
4721 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4722 operand is in memory. In this case we need to use the predicated LD1
4723 and ST1 instead of LDR and STR, both for correctness on big-endian
4724 targets and because LD1 and ST1 support a wider range of addressing modes.
4725 PRED_MODE is the mode of the predicate.
4726
4727 See the comment at the head of aarch64-sve.md for details about the
4728 big-endian handling. */
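/* For example, a memory-to-memory copy of a VNx4SImode value becomes an
   LD1W into a fresh vector register followed by an ST1W from it, both
   predicated on an all-true PTRUE.  */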
4729
4730 void
4731 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4732 {
4733 machine_mode mode = GET_MODE (dest);
4734 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4735 if (!register_operand (src, mode)
4736 && !register_operand (dest, mode))
4737 {
4738 rtx tmp = gen_reg_rtx (mode);
4739 if (MEM_P (src))
4740 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4741 else
4742 emit_move_insn (tmp, src);
4743 src = tmp;
4744 }
4745 aarch64_emit_sve_pred_move (dest, ptrue, src);
4746 }
4747
4748 /* Called only on big-endian targets. See whether an SVE vector move
4749 from SRC to DEST is effectively a REV[BHW] instruction, because at
4750 least one operand is a subreg of an SVE vector that has wider or
4751 narrower elements. Return true and emit the instruction if so.
4752
4753 For example:
4754
4755 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4756
4757 represents a VIEW_CONVERT between the following vectors, viewed
4758 in memory order:
4759
4760 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4761 R1: { [0], [1], [2], [3], ... }
4762
4763 The high part of lane X in R2 should therefore correspond to lane X*2
4764 of R1, but the register representations are:
4765
4766 msb lsb
4767 R2: ...... [1].high [1].low [0].high [0].low
4768 R1: ...... [3] [2] [1] [0]
4769
4770 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4771 We therefore need a reverse operation to swap the high and low values
4772 around.
4773
4774 This is purely an optimization. Without it we would spill the
4775 subreg operand to the stack in one mode and reload it in the
4776 other mode, which has the same effect as the REV. */
4777
4778 bool
4779 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4780 {
4781 gcc_assert (BYTES_BIG_ENDIAN);
4782 if (GET_CODE (dest) == SUBREG)
4783 dest = SUBREG_REG (dest);
4784 if (GET_CODE (src) == SUBREG)
4785 src = SUBREG_REG (src);
4786
4787 /* The optimization handles two single SVE REGs with different element
4788 sizes. */
4789 if (!REG_P (dest)
4790 || !REG_P (src)
4791 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4792 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4793 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4794 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4795 return false;
4796
4797 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4798 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4799 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4800 UNSPEC_REV_SUBREG);
4801 emit_insn (gen_rtx_SET (dest, unspec));
4802 return true;
4803 }
4804
4805 /* Return a copy of X with mode MODE, without changing its other
4806 attributes. Unlike gen_lowpart, this doesn't care whether the
4807 mode change is valid. */
4808
4809 rtx
4810 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4811 {
4812 if (GET_MODE (x) == mode)
4813 return x;
4814
4815 x = shallow_copy_rtx (x);
4816 set_mode_and_regno (x, mode, REGNO (x));
4817 return x;
4818 }
4819
4820 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4821 stored in wider integer containers. */
4822
4823 static unsigned int
4824 aarch64_sve_rev_unspec (machine_mode mode)
4825 {
4826 switch (GET_MODE_UNIT_SIZE (mode))
4827 {
4828 case 1: return UNSPEC_REVB;
4829 case 2: return UNSPEC_REVH;
4830 case 4: return UNSPEC_REVW;
4831 }
4832 gcc_unreachable ();
4833 }
4834
4835 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4836 operands. */
4837
4838 void
4839 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4840 {
4841 /* Decide which REV operation we need. The mode with wider elements
4842 determines the mode of the operands and the mode with the narrower
4843 elements determines the reverse width. */
4844 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
4845 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
4846 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4847 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4848 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4849
4850 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4851 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
4852
4853 /* Get the operands in the appropriate modes and emit the instruction. */
4854 ptrue = gen_lowpart (pred_mode, ptrue);
4855 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4856 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4857 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4858 dest, ptrue, src));
4859 }
4860
4861 static bool
4862 aarch64_function_ok_for_sibcall (tree, tree exp)
4863 {
4864 if (crtl->abi->id () != expr_callee_abi (exp).id ())
4865 return false;
4866
4867 return true;
4868 }
4869
4870 /* Implement TARGET_PASS_BY_REFERENCE. */
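/* For example, a homogeneous aggregate of four doubles is a candidate
   for the FP/SIMD registers and so is passed by value, whereas a 24-byte
   non-homogeneous structure is larger than two GPRs and is passed by
   reference.  */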
4871
4872 static bool
4873 aarch64_pass_by_reference (cumulative_args_t pcum_v,
4874 const function_arg_info &arg)
4875 {
4876 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4877 HOST_WIDE_INT size;
4878 machine_mode dummymode;
4879 int nregs;
4880
4881 unsigned int num_zr, num_pr;
4882 if (arg.type && aarch64_sve::builtin_type_p (arg.type, &num_zr, &num_pr))
4883 {
4884 if (pcum && !pcum->silent_p && !TARGET_SVE)
4885 /* We can't gracefully recover at this point, so make this a
4886 fatal error. */
4887 fatal_error (input_location, "arguments of type %qT require"
4888 " the SVE ISA extension", arg.type);
4889
4890 /* Variadic SVE types are passed by reference. Normal non-variadic
4891 arguments are too if we've run out of registers. */
4892 return (!arg.named
4893 || pcum->aapcs_nvrn + num_zr > NUM_FP_ARG_REGS
4894 || pcum->aapcs_nprn + num_pr > NUM_PR_ARG_REGS);
4895 }
4896
4897 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4898 if (arg.mode == BLKmode && arg.type)
4899 size = int_size_in_bytes (arg.type);
4900 else
4901 /* No frontends can create types with variable-sized modes, so we
4902 shouldn't be asked to pass or return them. */
4903 size = GET_MODE_SIZE (arg.mode).to_constant ();
4904
4905 /* Aggregates are passed by reference based on their size. */
4906 if (arg.aggregate_type_p ())
4907 size = int_size_in_bytes (arg.type);
4908
4909 /* Variable sized arguments are always returned by reference. */
4910 if (size < 0)
4911 return true;
4912
4913 /* Can this be a candidate to be passed in fp/simd register(s)? */
4914 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4915 &dummymode, &nregs,
4916 NULL))
4917 return false;
4918
4919 /* Arguments which are variable sized or larger than 2 registers are
4920 passed by reference unless they are a homogeneous floating-point
4921 aggregate. */
4922 return size > 2 * UNITS_PER_WORD;
4923 }
4924
4925 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4926 static bool
4927 aarch64_return_in_msb (const_tree valtype)
4928 {
4929 machine_mode dummy_mode;
4930 int dummy_int;
4931
4932 /* Never happens in little-endian mode. */
4933 if (!BYTES_BIG_ENDIAN)
4934 return false;
4935
4936 /* Only composite types smaller than or equal to 16 bytes can
4937 be potentially returned in registers. */
4938 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4939 || int_size_in_bytes (valtype) <= 0
4940 || int_size_in_bytes (valtype) > 16)
4941 return false;
4942
4943 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4944 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4945 is always passed/returned in the least significant bits of fp/simd
4946 register(s). */
4947 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4948 &dummy_mode, &dummy_int, NULL))
4949 return false;
4950
4951 return true;
4952 }
4953
4954 /* Subroutine of aarch64_function_value. MODE is the mode of the argument
4955 after promotion, and after partial SVE types have been replaced by
4956 their integer equivalents. */
4957 static rtx
4958 aarch64_function_value_1 (const_tree type, machine_mode mode)
4959 {
4960 unsigned int num_zr, num_pr;
4961 if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
4962 {
4963 /* Don't raise an error here if we're called when SVE is disabled,
4964 since this is really just a query function. Other code must
4965 do that where appropriate. */
4966 mode = TYPE_MODE_RAW (type);
4967 gcc_assert (VECTOR_MODE_P (mode)
4968 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
4969
4970 if (num_zr > 0 && num_pr == 0)
4971 return gen_rtx_REG (mode, V0_REGNUM);
4972
4973 if (num_zr == 0 && num_pr == 1)
4974 return gen_rtx_REG (mode, P0_REGNUM);
4975
4976 gcc_unreachable ();
4977 }
4978
4979 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
4980 returned in memory, not by value. */
4981 gcc_assert (!aarch64_sve_mode_p (mode));
4982
4983 if (aarch64_return_in_msb (type))
4984 {
4985 HOST_WIDE_INT size = int_size_in_bytes (type);
4986
4987 if (size % UNITS_PER_WORD != 0)
4988 {
4989 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4990 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4991 }
4992 }
4993
4994 int count;
4995 machine_mode ag_mode;
4996 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4997 &ag_mode, &count, NULL))
4998 {
4999 if (!aarch64_composite_type_p (type, mode))
5000 {
5001 gcc_assert (count == 1 && mode == ag_mode);
5002 return gen_rtx_REG (mode, V0_REGNUM);
5003 }
5004 else
5005 {
5006 int i;
5007 rtx par;
5008
5009 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
5010 for (i = 0; i < count; i++)
5011 {
5012 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
5013 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5014 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5015 XVECEXP (par, 0, i) = tmp;
5016 }
5017 return par;
5018 }
5019 }
5020 else
5021 return gen_rtx_REG (mode, R0_REGNUM);
5022 }
5023
5024 /* Implement TARGET_FUNCTION_VALUE.
5025 Define how to find the value returned by a function. */
5026
5027 static rtx
5028 aarch64_function_value (const_tree type, const_tree func,
5029 bool outgoing ATTRIBUTE_UNUSED)
5030 {
5031 machine_mode mode;
5032 int unsignedp;
5033
5034 mode = TYPE_MODE (type);
5035 if (INTEGRAL_TYPE_P (type))
5036 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
5037
5038 /* Vector types can acquire a partial SVE mode using things like
5039 __attribute__((vector_size(N))), and this is potentially useful.
5040 However, the choice of mode doesn't affect the type's ABI identity,
5041 so we should treat the types as though they had the associated
5042 integer mode, just like they did before SVE was introduced.
5043
5044 We know that the vector must be 128 bits or smaller, otherwise we'd
5045 have returned it in memory instead. */
5046 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5047 if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
5048 {
5049 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5050 rtx reg = aarch64_function_value_1 (type, int_mode);
5051 /* Vector types are never returned in the MSB and are never split. */
5052 gcc_assert (REG_P (reg) && GET_MODE (reg) == int_mode);
5053 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5054 return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, pair));
5055 }
5056
5057 return aarch64_function_value_1 (type, mode);
5058 }
5059
5060 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5061 Return true if REGNO is the number of a hard register in which the values
5062 of a called function may come back. */
5063
5064 static bool
5065 aarch64_function_value_regno_p (const unsigned int regno)
5066 {
5067 /* A maximum of 16 bytes can be returned in the general registers. Examples
5068 of 16-byte return values are: 128-bit integers and 16-byte small
5069 structures (excluding homogeneous floating-point aggregates). */
5070 if (regno == R0_REGNUM || regno == R1_REGNUM)
5071 return true;
5072
5073 /* Up to four fp/simd registers can return a function value, e.g. a
5074 homogeneous floating-point aggregate having four members. */
5075 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
5076 return TARGET_FLOAT;
5077
5078 return false;
5079 }
5080
5081 /* Implement TARGET_RETURN_IN_MEMORY.
5082
5083 If the type T of the result of a function is such that
5084 void func (T arg)
5085 would require that arg be passed as a value in a register (or set of
5086 registers) according to the parameter passing rules, then the result
5087 is returned in the same registers as would be used for such an
5088 argument. */
5089
5090 static bool
5091 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5092 {
5093 HOST_WIDE_INT size;
5094 machine_mode ag_mode;
5095 int count;
5096
5097 if (!AGGREGATE_TYPE_P (type)
5098 && TREE_CODE (type) != COMPLEX_TYPE
5099 && TREE_CODE (type) != VECTOR_TYPE)
5100 /* Simple scalar types are always returned in registers. */
5101 return false;
5102
5103 unsigned int num_zr, num_pr;
5104 if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
5105 {
5106 /* All SVE types we support fit in registers. For example, it isn't
5107 yet possible to define an aggregate of 9+ SVE vectors or 5+ SVE
5108 predicates. */
5109 gcc_assert (num_zr <= NUM_FP_ARG_REGS && num_pr <= NUM_PR_ARG_REGS);
5110 return false;
5111 }
5112
5113 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
5114 type,
5115 &ag_mode,
5116 &count,
5117 NULL))
5118 return false;
5119
5120 /* Types larger than 2 registers are returned in memory. */
5121 size = int_size_in_bytes (type);
5122 return (size < 0 || size > 2 * UNITS_PER_WORD);
5123 }
5124
5125 static bool
5126 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
5127 const_tree type, int *nregs)
5128 {
5129 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5130 return aarch64_vfp_is_call_or_return_candidate (mode,
5131 type,
5132 &pcum->aapcs_vfp_rmode,
5133 nregs,
5134 NULL);
5135 }
5136
5137 /* Given MODE and TYPE of a function argument, return the alignment in
5138 bits. The idea is to suppress any stronger alignment requested by
5139 the user and opt for the natural alignment (specified in AAPCS64 \S
5140 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5141 calculated in versions of GCC prior to GCC-9. This is a helper
5142 function for local use only. */
5143
5144 static unsigned int
5145 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5146 bool *abi_break)
5147 {
5148 *abi_break = false;
5149 if (!type)
5150 return GET_MODE_ALIGNMENT (mode);
5151
5152 if (integer_zerop (TYPE_SIZE (type)))
5153 return 0;
5154
5155 gcc_assert (TYPE_MODE (type) == mode);
5156
5157 if (!AGGREGATE_TYPE_P (type))
5158 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
5159
5160 if (TREE_CODE (type) == ARRAY_TYPE)
5161 return TYPE_ALIGN (TREE_TYPE (type));
5162
5163 unsigned int alignment = 0;
5164 unsigned int bitfield_alignment = 0;
5165 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5166 if (TREE_CODE (field) == FIELD_DECL)
5167 {
5168 alignment = std::max (alignment, DECL_ALIGN (field));
5169 if (DECL_BIT_FIELD_TYPE (field))
5170 bitfield_alignment
5171 = std::max (bitfield_alignment,
5172 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5173 }
5174
5175 if (bitfield_alignment > alignment)
5176 {
5177 *abi_break = true;
5178 return bitfield_alignment;
5179 }
5180
5181 return alignment;
5182 }
5183
5184 /* Layout a function argument according to the AAPCS64 rules. The rule
5185 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5186 mode that was originally given to us by the target hook, whereas the
5187 mode in ARG might be the result of replacing partial SVE modes with
5188 the equivalent integer mode. */
5189
5190 static void
5191 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg,
5192 machine_mode orig_mode)
5193 {
5194 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5195 tree type = arg.type;
5196 machine_mode mode = arg.mode;
5197 int ncrn, nvrn, nregs;
5198 bool allocate_ncrn, allocate_nvrn;
5199 HOST_WIDE_INT size;
5200 bool abi_break;
5201
5202 /* We need to do this once per argument. */
5203 if (pcum->aapcs_arg_processed)
5204 return;
5205
5206 /* Vector types can acquire a partial SVE mode using things like
5207 __attribute__((vector_size(N))), and this is potentially useful.
5208 However, the choice of mode doesn't affect the type's ABI identity,
5209 so we should treat the types as though they had the associated
5210 integer mode, just like they did before SVE was introduced.
5211
5212 We know that the vector must be 128 bits or smaller, otherwise we'd
5213 have passed it by reference instead. */
5214 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5215 if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
5216 {
5217 function_arg_info tmp_arg = arg;
5218 tmp_arg.mode = int_mode_for_mode (mode).require ();
5219 aarch64_layout_arg (pcum_v, tmp_arg, orig_mode);
5220 if (rtx reg = pcum->aapcs_reg)
5221 {
5222 gcc_assert (REG_P (reg) && GET_MODE (reg) == tmp_arg.mode);
5223 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5224 pcum->aapcs_reg = gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5225 }
5226 return;
5227 }
5228
5229 pcum->aapcs_arg_processed = true;
5230
5231 unsigned int num_zr, num_pr;
5232 if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
5233 {
5234 /* The PCS says that it is invalid to pass an SVE value to an
5235 unprototyped function. There is no ABI-defined location we
5236 can return in this case, so we have no real choice but to raise
5237 an error immediately, even though this is only a query function. */
5238 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5239 {
5240 gcc_assert (!pcum->silent_p);
5241 error ("SVE type %qT cannot be passed to an unprototyped function",
5242 arg.type);
5243 /* Avoid repeating the message, and avoid tripping the assert
5244 below. */
5245 pcum->pcs_variant = ARM_PCS_SVE;
5246 }
5247
5248 /* We would have converted the argument into pass-by-reference
5249 form if it didn't fit in registers. */
5250 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + num_zr;
5251 pcum->aapcs_nextnprn = pcum->aapcs_nprn + num_pr;
5252 gcc_assert (arg.named
5253 && pcum->pcs_variant == ARM_PCS_SVE
5254 && aarch64_sve_mode_p (mode)
5255 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5256 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5257
5258 if (num_zr > 0 && num_pr == 0)
5259 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + pcum->aapcs_nvrn);
5260 else if (num_zr == 0 && num_pr == 1)
5261 pcum->aapcs_reg = gen_rtx_REG (mode, P0_REGNUM + pcum->aapcs_nprn);
5262 else
5263 gcc_unreachable ();
5264 return;
5265 }
5266
5267 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
5268 passed by reference, not by value. */
5269 gcc_assert (!aarch64_sve_mode_p (mode));
5270
5271 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
5272 if (type)
5273 size = int_size_in_bytes (type);
5274 else
5275 /* No frontends can create types with variable-sized modes, so we
5276 shouldn't be asked to pass or return them. */
5277 size = GET_MODE_SIZE (mode).to_constant ();
5278 size = ROUND_UP (size, UNITS_PER_WORD);
5279
5280 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
5281 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
5282 mode,
5283 type,
5284 &nregs);
5285
5286 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
5287 The following code thus handles passing by SIMD/FP registers first. */
5288
5289 nvrn = pcum->aapcs_nvrn;
5290
5291 /* C1 - C5 for floating point, homogeneous floating point aggregates (HFA)
5292 and homogeneous short-vector aggregates (HVA). */
5293 if (allocate_nvrn)
5294 {
5295 if (!pcum->silent_p && !TARGET_FLOAT)
5296 aarch64_err_no_fpadvsimd (mode);
5297
5298 if (nvrn + nregs <= NUM_FP_ARG_REGS)
5299 {
5300 pcum->aapcs_nextnvrn = nvrn + nregs;
5301 if (!aarch64_composite_type_p (type, mode))
5302 {
5303 gcc_assert (nregs == 1);
5304 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
5305 }
5306 else
5307 {
5308 rtx par;
5309 int i;
5310 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5311 for (i = 0; i < nregs; i++)
5312 {
5313 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
5314 V0_REGNUM + nvrn + i);
5315 rtx offset = gen_int_mode
5316 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
5317 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5318 XVECEXP (par, 0, i) = tmp;
5319 }
5320 pcum->aapcs_reg = par;
5321 }
5322 return;
5323 }
5324 else
5325 {
5326 /* C.3 NSRN is set to 8. */
5327 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
5328 goto on_stack;
5329 }
5330 }
5331
5332 ncrn = pcum->aapcs_ncrn;
5333 nregs = size / UNITS_PER_WORD;
5334
5335 /* C6 - C9, though the sign and zero extension semantics are
5336 handled elsewhere. This is the case where the argument fits
5337 entirely in general registers. */
5338 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
5339 {
5340 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
5341
5342 /* C.8 if the argument has an alignment of 16 then the NGRN is
5343 rounded up to the next even number. */
5344 if (nregs == 2
5345 && ncrn % 2
5346 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5347 comparison is there because for > 16 * BITS_PER_UNIT
5348 alignment nregs should be > 2 and therefore it should be
5349 passed by reference rather than value. */
5350 && (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
5351 == 16 * BITS_PER_UNIT))
5352 {
5353 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5354 inform (input_location, "parameter passing for argument of type "
5355 "%qT changed in GCC 9.1", type);
5356 ++ncrn;
5357 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
5358 }
5359
5360 /* NREGS can be 0 when e.g. an empty structure is to be passed.
5361 A reg is still generated for it, but the caller should be smart
5362 enough not to use it. */
5363 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
5364 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
5365 else
5366 {
5367 rtx par;
5368 int i;
5369
5370 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5371 for (i = 0; i < nregs; i++)
5372 {
5373 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
5374 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
5375 GEN_INT (i * UNITS_PER_WORD));
5376 XVECEXP (par, 0, i) = tmp;
5377 }
5378 pcum->aapcs_reg = par;
5379 }
5380
5381 pcum->aapcs_nextncrn = ncrn + nregs;
5382 return;
5383 }
5384
5385 /* C.11 */
5386 pcum->aapcs_nextncrn = NUM_ARG_REGS;
5387
5388 /* The argument is passed on the stack; record the needed number of words for
5389 this argument and align the total size if necessary. */
5390 on_stack:
5391 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
5392
5393 if (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
5394 == 16 * BITS_PER_UNIT)
5395 {
5396 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
5397 if (pcum->aapcs_stack_size != new_size)
5398 {
5399 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5400 inform (input_location, "parameter passing for argument of type "
5401 "%qT changed in GCC 9.1", type);
5402 pcum->aapcs_stack_size = new_size;
5403 }
5404 }
5405 return;
5406 }
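/* A worked example of the layout above (a sketch, not ABI text): for

     void f (int a, double b, __int128 c);

   "a" is allocated w0, "b" goes to v0 via the allocate_nvrn path, and
   "c" needs two GP registers with 16-byte alignment, so the C.8 rounding
   above bumps the NGRN from 1 to 2 and "c" is passed in x2/x3.  */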
5407
5408 /* Implement TARGET_FUNCTION_ARG. */
5409
5410 static rtx
5411 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5412 {
5413 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5414 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
5415 || pcum->pcs_variant == ARM_PCS_SIMD
5416 || pcum->pcs_variant == ARM_PCS_SVE);
5417
5418 if (arg.end_marker_p ())
5419 return gen_int_mode (pcum->pcs_variant, DImode);
5420
5421 aarch64_layout_arg (pcum_v, arg, arg.mode);
5422 return pcum->aapcs_reg;
5423 }
5424
5425 void
5426 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
5427 const_tree fntype,
5428 rtx libname ATTRIBUTE_UNUSED,
5429 const_tree fndecl ATTRIBUTE_UNUSED,
5430 unsigned n_named ATTRIBUTE_UNUSED,
5431 bool silent_p)
5432 {
5433 pcum->aapcs_ncrn = 0;
5434 pcum->aapcs_nvrn = 0;
5435 pcum->aapcs_nprn = 0;
5436 pcum->aapcs_nextncrn = 0;
5437 pcum->aapcs_nextnvrn = 0;
5438 pcum->aapcs_nextnprn = 0;
5439 if (fntype)
5440 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
5441 else
5442 pcum->pcs_variant = ARM_PCS_AAPCS64;
5443 pcum->aapcs_reg = NULL_RTX;
5444 pcum->aapcs_arg_processed = false;
5445 pcum->aapcs_stack_words = 0;
5446 pcum->aapcs_stack_size = 0;
5447 pcum->silent_p = silent_p;
5448
5449 if (!silent_p
5450 && !TARGET_FLOAT
5451 && fndecl && TREE_PUBLIC (fndecl)
5452 && fntype && fntype != error_mark_node)
5453 {
5454 const_tree type = TREE_TYPE (fntype);
5455 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
5456 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
5457 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5458 &mode, &nregs, NULL))
5459 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
5460 }
5461
5462 if (!silent_p
5463 && !TARGET_SVE
5464 && pcum->pcs_variant == ARM_PCS_SVE)
5465 {
5466 /* We can't gracefully recover at this point, so make this a
5467 fatal error. */
5468 if (fndecl)
5469 fatal_error (input_location, "%qE requires the SVE ISA extension",
5470 fndecl);
5471 else
5472 fatal_error (input_location, "calls to functions of type %qT require"
5473 " the SVE ISA extension", fntype);
5474 }
5475 }
5476
5477 static void
5478 aarch64_function_arg_advance (cumulative_args_t pcum_v,
5479 const function_arg_info &arg)
5480 {
5481 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5482 if (pcum->pcs_variant == ARM_PCS_AAPCS64
5483 || pcum->pcs_variant == ARM_PCS_SIMD
5484 || pcum->pcs_variant == ARM_PCS_SVE)
5485 {
5486 aarch64_layout_arg (pcum_v, arg, arg.mode);
5487 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
5488 != (pcum->aapcs_stack_words != 0));
5489 pcum->aapcs_arg_processed = false;
5490 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
5491 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
5492 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
5493 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
5494 pcum->aapcs_stack_words = 0;
5495 pcum->aapcs_reg = NULL_RTX;
5496 }
5497 }
5498
5499 bool
5500 aarch64_function_arg_regno_p (unsigned regno)
5501 {
5502 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
5503 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
5504 }
5505
5506 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
5507 PARM_BOUNDARY bits of alignment, but will be given anything up
5508 to STACK_BOUNDARY bits if the type requires it. This makes sure
5509 that both before and after the layout of each argument, the Next
5510 Stacked Argument Address (NSAA) will have a minimum alignment of
5511 8 bytes. */
5512
5513 static unsigned int
5514 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
5515 {
5516 bool abi_break;
5517 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
5518 &abi_break);
5519 if (abi_break && warn_psabi)
5520 inform (input_location, "parameter passing for argument of type "
5521 "%qT changed in GCC 9.1", type);
5522
5523 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
5524 }
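/* For example (assuming the usual PARM_BOUNDARY of 64 and STACK_BOUNDARY
   of 128 for this port): a char argument still gets a 64-bit slot
   boundary, an __int128 gets the full 128 bits, and an over-aligned
   scalar is clamped to 128 bits.  */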
5525
5526 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
5527
5528 static fixed_size_mode
5529 aarch64_get_reg_raw_mode (int regno)
5530 {
5531 if (TARGET_SVE && FP_REGNUM_P (regno))
5532 /* Don't use the SVE part of the register for __builtin_apply and
5533 __builtin_return. The SVE registers aren't used by the normal PCS,
5534 so using them there would be a waste of time. The PCS extensions
5535 for SVE types are fundamentally incompatible with the
5536 __builtin_return/__builtin_apply interface. */
5537 return as_a <fixed_size_mode> (V16QImode);
5538 return default_get_reg_raw_mode (regno);
5539 }
5540
5541 /* Implement TARGET_FUNCTION_ARG_PADDING.
5542
5543 Small aggregate types are placed at the lowest memory address.
5544
5545 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
5546
5547 static pad_direction
5548 aarch64_function_arg_padding (machine_mode mode, const_tree type)
5549 {
5550 /* On little-endian targets, the least significant byte of every stack
5551 argument is passed at the lowest byte address of the stack slot. */
5552 if (!BYTES_BIG_ENDIAN)
5553 return PAD_UPWARD;
5554
5555 /* Otherwise, integral, floating-point and pointer types are padded downward:
5556 the least significant byte of a stack argument is passed at the highest
5557 byte address of the stack slot. */
5558 if (type
5559 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
5560 || POINTER_TYPE_P (type))
5561 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
5562 return PAD_DOWNWARD;
5563
5564 /* Everything else is padded upward, i.e. data in the first byte of the stack slot. */
5565 return PAD_UPWARD;
5566 }
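/* As a sketch of the big-endian case: a 3-byte structure passed on the
   stack is padded upward (data at the low addresses of its slot), whereas
   a short is padded downward so that its least significant byte sits at
   the highest address of its 8-byte slot.  */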
5567
5568 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
5569
5570 It specifies padding for the last (possibly the only)
5571 element of a block move between registers and memory.
5572 Assuming the block is in memory, padding upward means that
5573 the last element is padded after its most significant byte,
5574 while with downward padding the last element is padded at
5575 its least significant byte side.
5576
5577 Small aggregates and small complex types are always padded
5578 upwards.
5579
5580 We don't need to worry about homogeneous floating-point or
5581 short-vector aggregates; their move is not affected by the
5582 padding direction determined here. Regardless of endianness,
5583 each element of such an aggregate is put in the least
5584 significant bits of a fp/simd register.
5585
5586 Return !BYTES_BIG_ENDIAN if the least significant byte of the
5587 register has useful data, and return the opposite if the most
5588 significant byte does. */
5589
5590 bool
5591 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
5592 bool first ATTRIBUTE_UNUSED)
5593 {
5594
5595 /* Small composite types are always padded upward. */
5596 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
5597 {
5598 HOST_WIDE_INT size;
5599 if (type)
5600 size = int_size_in_bytes (type);
5601 else
5602 /* No frontends can create types with variable-sized modes, so we
5603 shouldn't be asked to pass or return them. */
5604 size = GET_MODE_SIZE (mode).to_constant ();
5605 if (size < 2 * UNITS_PER_WORD)
5606 return true;
5607 }
5608
5609 /* Otherwise, use the default padding. */
5610 return !BYTES_BIG_ENDIAN;
5611 }
5612
5613 static scalar_int_mode
5614 aarch64_libgcc_cmp_return_mode (void)
5615 {
5616 return SImode;
5617 }
5618
5619 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5620
5621 /* We use the 12-bit shifted immediate arithmetic instructions so values
5622 must be a multiple of (1 << 12), i.e. 4096. */
5623 #define ARITH_FACTOR 4096
5624
5625 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5626 #error Cannot use simple address calculation for stack probing
5627 #endif
5628
5629 /* The pair of scratch registers used for stack probing. */
5630 #define PROBE_STACK_FIRST_REG R9_REGNUM
5631 #define PROBE_STACK_SECOND_REG R10_REGNUM
5632
5633 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5634 inclusive. These are offsets from the current stack pointer. */
5635
5636 static void
5637 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5638 {
5639 HOST_WIDE_INT size;
5640 if (!poly_size.is_constant (&size))
5641 {
5642 sorry ("stack probes for SVE frames");
5643 return;
5644 }
5645
5646 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5647
5648 /* See the same assertion on PROBE_INTERVAL above. */
5649 gcc_assert ((first % ARITH_FACTOR) == 0);
5650
5651 /* See if we have a constant small number of probes to generate. If so,
5652 that's the easy case. */
5653 if (size <= PROBE_INTERVAL)
5654 {
5655 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5656
5657 emit_set_insn (reg1,
5658 plus_constant (Pmode,
5659 stack_pointer_rtx, -(first + base)));
5660 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5661 }
5662
5663 /* The run-time loop is made up of 8 insns in the generic case while the
5664 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
5665 else if (size <= 4 * PROBE_INTERVAL)
5666 {
5667 HOST_WIDE_INT i, rem;
5668
5669 emit_set_insn (reg1,
5670 plus_constant (Pmode,
5671 stack_pointer_rtx,
5672 -(first + PROBE_INTERVAL)));
5673 emit_stack_probe (reg1);
5674
5675 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5676 it exceeds SIZE. If only two probes are needed, this will not
5677 generate any code. Then probe at FIRST + SIZE. */
5678 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5679 {
5680 emit_set_insn (reg1,
5681 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5682 emit_stack_probe (reg1);
5683 }
5684
5685 rem = size - (i - PROBE_INTERVAL);
5686 if (rem > 256)
5687 {
5688 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5689
5690 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5691 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5692 }
5693 else
5694 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5695 }
5696
5697 /* Otherwise, do the same as above, but in a loop. Note that we must be
5698 extra careful with variables wrapping around because we might be at
5699 the very top (or the very bottom) of the address space and we have
5700 to be able to handle this case properly; in particular, we use an
5701 equality test for the loop condition. */
5702 else
5703 {
5704 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5705
5706 /* Step 1: round SIZE to the previous multiple of the interval. */
5707
5708 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5709
5710
5711 /* Step 2: compute initial and final value of the loop counter. */
5712
5713 /* TEST_ADDR = SP + FIRST. */
5714 emit_set_insn (reg1,
5715 plus_constant (Pmode, stack_pointer_rtx, -first));
5716
5717 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5718 HOST_WIDE_INT adjustment = - (first + rounded_size);
5719 if (! aarch64_uimm12_shift (adjustment))
5720 {
5721 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5722 true, Pmode);
5723 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5724 }
5725 else
5726 emit_set_insn (reg2,
5727 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5728
5729 /* Step 3: the loop
5730
5731 do
5732 {
5733 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5734 probe at TEST_ADDR
5735 }
5736 while (TEST_ADDR != LAST_ADDR)
5737
5738 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5739 until it is equal to ROUNDED_SIZE. */
5740
5741 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5742
5743
5744 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5745 that SIZE is equal to ROUNDED_SIZE. */
5746
5747 if (size != rounded_size)
5748 {
5749 HOST_WIDE_INT rem = size - rounded_size;
5750
5751 if (rem > 256)
5752 {
5753 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5754
5755 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5756 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5757 }
5758 else
5759 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5760 }
5761 }
5762
5763 /* Make sure nothing is scheduled before we are done. */
5764 emit_insn (gen_blockage ());
5765 }
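/* As a rough sketch (assuming PROBE_INTERVAL == 4096 and FIRST == 0),
   the small-size case above for a 4096-byte range emits something like:

        sub     x9, sp, #4096
        str     xzr, [x9]

   i.e. a single probe at the lowest address of the range, with larger
   constant sizes adding one sub/str pair per extra interval.  */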
5766
5767 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5768 absolute addresses. */
5769
5770 const char *
5771 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5772 {
5773 static int labelno = 0;
5774 char loop_lab[32];
5775 rtx xops[2];
5776
5777 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5778
5779 /* Loop. */
5780 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5781
5782 HOST_WIDE_INT stack_clash_probe_interval
5783 = 1 << param_stack_clash_protection_guard_size;
5784
5785 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5786 xops[0] = reg1;
5787 HOST_WIDE_INT interval;
5788 if (flag_stack_clash_protection)
5789 interval = stack_clash_probe_interval;
5790 else
5791 interval = PROBE_INTERVAL;
5792
5793 gcc_assert (aarch64_uimm12_shift (interval));
5794 xops[1] = GEN_INT (interval);
5795
5796 output_asm_insn ("sub\t%0, %0, %1", xops);
5797
5798 /* If doing stack clash protection then we probe up by the ABI specified
5799 amount. We do this because we're dropping full pages at a time in the
5800 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5801 if (flag_stack_clash_protection)
5802 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5803 else
5804 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5805
5806 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5807 by this amount for each iteration. */
5808 output_asm_insn ("str\txzr, [%0, %1]", xops);
5809
5810 /* Test if TEST_ADDR == LAST_ADDR. */
5811 xops[1] = reg2;
5812 output_asm_insn ("cmp\t%0, %1", xops);
5813
5814 /* Branch. */
5815 fputs ("\tb.ne\t", asm_out_file);
5816 assemble_name_raw (asm_out_file, loop_lab);
5817 fputc ('\n', asm_out_file);
5818
5819 return "";
5820 }
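/* The loop printed above therefore looks roughly like this (with
   PROBE_INTERVAL == 4096 and without stack-clash protection):

   .LPSRL0:
        sub     x9, x9, #4096
        str     xzr, [x9]
        cmp     x9, x10
        b.ne    .LPSRL0

   With -fstack-clash-protection the interval and the probe offset come
   from the guard-size parameters instead.  */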
5821
5822 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5823 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5824 of GUARD_SIZE. When a probe is emitted it is done at most
5825 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5826 at most MIN_PROBE_THRESHOLD. By the end of this function
5827 BASE = BASE - ADJUSTMENT. */
5828
5829 const char *
5830 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5831 rtx min_probe_threshold, rtx guard_size)
5832 {
5833 /* This function is not allowed to use any instruction generation function
5834 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5835 so instead emit the code you want using output_asm_insn. */
5836 gcc_assert (flag_stack_clash_protection);
5837 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5838 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5839
5840 /* The minimum required allocation before the residual requires probing. */
5841 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5842
5843 /* Clamp the value down to the nearest value that can be used with a cmp. */
5844 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5845 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5846
5847 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5848 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5849
5850 static int labelno = 0;
5851 char loop_start_lab[32];
5852 char loop_end_lab[32];
5853 rtx xops[2];
5854
5855 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5856 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5857
5858 /* Emit loop start label. */
5859 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5860
5861 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5862 xops[0] = adjustment;
5863 xops[1] = probe_offset_value_rtx;
5864 output_asm_insn ("cmp\t%0, %1", xops);
5865
5866 /* Branch to end if not enough adjustment to probe. */
5867 fputs ("\tb.lt\t", asm_out_file);
5868 assemble_name_raw (asm_out_file, loop_end_lab);
5869 fputc ('\n', asm_out_file);
5870
5871 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5872 xops[0] = base;
5873 xops[1] = probe_offset_value_rtx;
5874 output_asm_insn ("sub\t%0, %0, %1", xops);
5875
5876 /* Probe at BASE. */
5877 xops[1] = const0_rtx;
5878 output_asm_insn ("str\txzr, [%0, %1]", xops);
5879
5880 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5881 xops[0] = adjustment;
5882 xops[1] = probe_offset_value_rtx;
5883 output_asm_insn ("sub\t%0, %0, %1", xops);
5884
5885 /* Branch to start if still more bytes to allocate. */
5886 fputs ("\tb\t", asm_out_file);
5887 assemble_name_raw (asm_out_file, loop_start_lab);
5888 fputc ('\n', asm_out_file);
5889
5890 /* No probe needed for the remaining adjustment; loop exit. */
5891 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5892
5893 /* BASE = BASE - ADJUSTMENT. */
5894 xops[0] = base;
5895 xops[1] = adjustment;
5896 output_asm_insn ("sub\t%0, %0, %1", xops);
5897 return "";
5898 }
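/* Putting the pieces above together, the emitted sequence is roughly:

   .SVLPSPL0:
        cmp     adjustment, guard
        b.lt    .SVLPEND0
        sub     base, base, guard
        str     xzr, [base]
        sub     adjustment, adjustment, guard
        b       .SVLPSPL0
   .SVLPEND0:
        sub     base, base, adjustment

   where "guard" stands for the clamped residual_probe_guard value.  */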
5899
5900 /* Determine whether a frame chain needs to be generated. */
5901 static bool
5902 aarch64_needs_frame_chain (void)
5903 {
5904 /* Force a frame chain for EH returns so the return address is at FP+8. */
5905 if (frame_pointer_needed || crtl->calls_eh_return)
5906 return true;
5907
5908 /* A leaf function cannot have calls or write LR. */
5909 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5910
5911 /* Don't use a frame chain in leaf functions if leaf frame pointers
5912 are disabled. */
5913 if (flag_omit_leaf_frame_pointer && is_leaf)
5914 return false;
5915
5916 return aarch64_use_frame_pointer;
5917 }
5918
5919 /* Mark the registers that need to be saved by the callee and calculate
5920 the size of the callee-saved registers area and frame record (both FP
5921 and LR may be omitted). */
5922 static void
5923 aarch64_layout_frame (void)
5924 {
5925 poly_int64 offset = 0;
5926 int regno, last_fp_reg = INVALID_REGNUM;
5927 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
5928 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
5929 bool frame_related_fp_reg_p = false;
5930 aarch64_frame &frame = cfun->machine->frame;
5931
5932 frame.emit_frame_chain = aarch64_needs_frame_chain ();
5933
5934 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5935 the mid-end is doing. */
5936 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5937
5938 #define SLOT_NOT_REQUIRED (-2)
5939 #define SLOT_REQUIRED (-1)
5940
5941 frame.wb_candidate1 = INVALID_REGNUM;
5942 frame.wb_candidate2 = INVALID_REGNUM;
5943 frame.spare_pred_reg = INVALID_REGNUM;
5944
5945 /* First mark all the registers that really need to be saved... */
5946 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5947 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5948
5949 /* ... that includes the eh data registers (if needed)... */
5950 if (crtl->calls_eh_return)
5951 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5952 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
5953
5954 /* ... and any callee saved register that dataflow says is live. */
5955 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5956 if (df_regs_ever_live_p (regno)
5957 && !fixed_regs[regno]
5958 && (regno == R30_REGNUM
5959 || !crtl->abi->clobbers_full_reg_p (regno)))
5960 frame.reg_offset[regno] = SLOT_REQUIRED;
5961
5962 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5963 if (df_regs_ever_live_p (regno)
5964 && !fixed_regs[regno]
5965 && !crtl->abi->clobbers_full_reg_p (regno))
5966 {
5967 frame.reg_offset[regno] = SLOT_REQUIRED;
5968 last_fp_reg = regno;
5969 if (aarch64_emit_cfi_for_reg_p (regno))
5970 frame_related_fp_reg_p = true;
5971 }
5972
5973 /* Big-endian SVE frames need a spare predicate register in order
5974 to save Z8-Z15. Decide which register they should use. Prefer
5975 an unused argument register if possible, so that we don't force P4
5976 to be saved unnecessarily. */
5977 if (frame_related_fp_reg_p
5978 && crtl->abi->id () == ARM_PCS_SVE
5979 && BYTES_BIG_ENDIAN)
5980 {
5981 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
5982 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
5983 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
5984 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
5985 break;
5986 gcc_assert (regno <= P7_REGNUM);
5987 frame.spare_pred_reg = regno;
5988 df_set_regs_ever_live (regno, true);
5989 }
5990
5991 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
5992 if (df_regs_ever_live_p (regno)
5993 && !fixed_regs[regno]
5994 && !crtl->abi->clobbers_full_reg_p (regno))
5995 frame.reg_offset[regno] = SLOT_REQUIRED;
5996
5997 /* With stack-clash, LR must be saved in non-leaf functions. */
5998 gcc_assert (crtl->is_leaf
5999 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
6000
6001 /* Now assign stack slots for the registers. Start with the predicate
6002 registers, since predicate LDR and STR have a relatively small
6003 offset range. These saves happen below the hard frame pointer. */
6004 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6005 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6006 {
6007 frame.reg_offset[regno] = offset;
6008 offset += BYTES_PER_SVE_PRED;
6009 }
6010
6011 if (maybe_ne (offset, 0))
6012 {
6013 /* If we have any vector registers to save above the predicate registers,
6014 the offset of the vector register save slots need to be a multiple
6015 of the vector size. This lets us use the immediate forms of LDR/STR
6016 (or LD1/ST1 for big-endian).
6017
6018 A vector register is 8 times the size of a predicate register,
6019 and we need to save a maximum of 12 predicate registers, so the
6020 first vector register will be at either #1, MUL VL or #2, MUL VL.
6021
6022 If we don't have any vector registers to save, and we know how
6023 big the predicate save area is, we can just round it up to the
6024 next 16-byte boundary. */
6025 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
6026 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6027 else
6028 {
6029 if (known_le (offset, vector_save_size))
6030 offset = vector_save_size;
6031 else if (known_le (offset, vector_save_size * 2))
6032 offset = vector_save_size * 2;
6033 else
6034 gcc_unreachable ();
6035 }
6036 }
6037
6038 /* If we need to save any SVE vector registers, add them next. */
6039 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6040 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6041 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6042 {
6043 frame.reg_offset[regno] = offset;
6044 offset += vector_save_size;
6045 }
6046
6047 /* OFFSET is now the offset of the hard frame pointer from the bottom
6048 of the callee save area. */
6049 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6050 frame.below_hard_fp_saved_regs_size = offset;
6051 if (frame.emit_frame_chain)
6052 {
6053 /* FP and LR are placed in the linkage record. */
6054 frame.reg_offset[R29_REGNUM] = offset;
6055 frame.wb_candidate1 = R29_REGNUM;
6056 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
6057 frame.wb_candidate2 = R30_REGNUM;
6058 offset += 2 * UNITS_PER_WORD;
6059 }
6060
6061 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6062 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6063 {
6064 frame.reg_offset[regno] = offset;
6065 if (frame.wb_candidate1 == INVALID_REGNUM)
6066 frame.wb_candidate1 = regno;
6067 else if (frame.wb_candidate2 == INVALID_REGNUM)
6068 frame.wb_candidate2 = regno;
6069 offset += UNITS_PER_WORD;
6070 }
6071
6072 poly_int64 max_int_offset = offset;
6073 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6074 bool has_align_gap = maybe_ne (offset, max_int_offset);
6075
6076 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6077 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6078 {
6079 /* If there is an alignment gap between integer and fp callee-saves,
6080 allocate the last fp register to it if possible. */
6081 if (regno == last_fp_reg
6082 && has_align_gap
6083 && known_eq (vector_save_size, 8)
6084 && multiple_p (offset, 16))
6085 {
6086 frame.reg_offset[regno] = max_int_offset;
6087 break;
6088 }
6089
6090 frame.reg_offset[regno] = offset;
6091 if (frame.wb_candidate1 == INVALID_REGNUM)
6092 frame.wb_candidate1 = regno;
6093 else if (frame.wb_candidate2 == INVALID_REGNUM
6094 && frame.wb_candidate1 >= V0_REGNUM)
6095 frame.wb_candidate2 = regno;
6096 offset += vector_save_size;
6097 }
6098
6099 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6100
6101 frame.saved_regs_size = offset;
6102
6103 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
6104
6105 poly_int64 above_outgoing_args
6106 = aligned_upper_bound (varargs_and_saved_regs_size
6107 + get_frame_size (),
6108 STACK_BOUNDARY / BITS_PER_UNIT);
6109
6110 frame.hard_fp_offset
6111 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6112
6113 /* Both these values are already aligned. */
6114 gcc_assert (multiple_p (crtl->outgoing_args_size,
6115 STACK_BOUNDARY / BITS_PER_UNIT));
6116 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
6117
6118 frame.locals_offset = frame.saved_varargs_size;
6119
6120 frame.initial_adjust = 0;
6121 frame.final_adjust = 0;
6122 frame.callee_adjust = 0;
6123 frame.sve_callee_adjust = 0;
6124 frame.callee_offset = 0;
6125
6126 HOST_WIDE_INT max_push_offset = 0;
6127 if (frame.wb_candidate2 != INVALID_REGNUM)
6128 max_push_offset = 512;
6129 else if (frame.wb_candidate1 != INVALID_REGNUM)
6130 max_push_offset = 256;
6131
6132 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
6133 HOST_WIDE_INT const_saved_regs_size;
6134 if (frame.frame_size.is_constant (&const_size)
6135 && const_size < max_push_offset
6136 && known_eq (frame.hard_fp_offset, const_size))
6137 {
6138 /* Simple, small frame with no outgoing arguments:
6139
6140 stp reg1, reg2, [sp, -frame_size]!
6141 stp reg3, reg4, [sp, 16] */
6142 frame.callee_adjust = const_size;
6143 }
6144 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
6145 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6146 && const_outgoing_args_size + const_saved_regs_size < 512
6147 /* We could handle this case even with outgoing args, provided
6148 that the number of args left us with valid offsets for all
6149 predicate and vector save slots. It's such a rare case that
6150 it hardly seems worth the effort though. */
6151 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
6152 && !(cfun->calls_alloca
6153 && frame.hard_fp_offset.is_constant (&const_fp_offset)
6154 && const_fp_offset < max_push_offset))
6155 {
6156 /* Frame with small outgoing arguments:
6157
6158 sub sp, sp, frame_size
6159 stp reg1, reg2, [sp, outgoing_args_size]
6160 stp reg3, reg4, [sp, outgoing_args_size + 16] */
6161 frame.initial_adjust = frame.frame_size;
6162 frame.callee_offset = const_outgoing_args_size;
6163 }
6164 else if (saves_below_hard_fp_p
6165 && known_eq (frame.saved_regs_size,
6166 frame.below_hard_fp_saved_regs_size))
6167 {
6168 /* Frame in which all saves are SVE saves:
6169
6170 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6171 save SVE registers relative to SP
6172 sub sp, sp, outgoing_args_size */
6173 frame.initial_adjust = (frame.hard_fp_offset
6174 + frame.below_hard_fp_saved_regs_size);
6175 frame.final_adjust = crtl->outgoing_args_size;
6176 }
6177 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6178 && const_fp_offset < max_push_offset)
6179 {
6180 /* Frame with large outgoing arguments or SVE saves, but with
6181 a small local area:
6182
6183 stp reg1, reg2, [sp, -hard_fp_offset]!
6184 stp reg3, reg4, [sp, 16]
6185 [sub sp, sp, below_hard_fp_saved_regs_size]
6186 [save SVE registers relative to SP]
6187 sub sp, sp, outgoing_args_size */
6188 frame.callee_adjust = const_fp_offset;
6189 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6190 frame.final_adjust = crtl->outgoing_args_size;
6191 }
6192 else
6193 {
6194 /* Frame with large local area and outgoing arguments or SVE saves,
6195 using frame pointer:
6196
6197 sub sp, sp, hard_fp_offset
6198 stp x29, x30, [sp, 0]
6199 add x29, sp, 0
6200 stp reg3, reg4, [sp, 16]
6201 [sub sp, sp, below_hard_fp_saved_regs_size]
6202 [save SVE registers relative to SP]
6203 sub sp, sp, outgoing_args_size */
6204 frame.initial_adjust = frame.hard_fp_offset;
6205 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6206 frame.final_adjust = crtl->outgoing_args_size;
6207 }
6208
6209 /* Make sure the individual adjustments add up to the full frame size. */
6210 gcc_assert (known_eq (frame.initial_adjust
6211 + frame.callee_adjust
6212 + frame.sve_callee_adjust
6213 + frame.final_adjust, frame.frame_size));
6214
6215 frame.laid_out = true;
6216 }
6217
6218 /* Return true if the register REGNO is saved on entry to
6219 the current function. */
6220
6221 static bool
6222 aarch64_register_saved_on_entry (int regno)
6223 {
6224 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6225 }
6226
6227 /* Return the next register, from REGNO up to LIMIT, that the callee
6228 needs to save. */
6229
6230 static unsigned
6231 aarch64_next_callee_save (unsigned regno, unsigned limit)
6232 {
6233 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6234 regno ++;
6235 return regno;
6236 }
6237
6238 /* Push the register number REGNO of mode MODE to the stack with write-back
6239 adjusting the stack by ADJUSTMENT. */
6240
6241 static void
6242 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6243 HOST_WIDE_INT adjustment)
6244 {
6245 rtx base_rtx = stack_pointer_rtx;
6246 rtx insn, reg, mem;
6247
6248 reg = gen_rtx_REG (mode, regno);
6249 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
6250 plus_constant (Pmode, base_rtx, -adjustment));
6251 mem = gen_frame_mem (mode, mem);
6252
6253 insn = emit_move_insn (mem, reg);
6254 RTX_FRAME_RELATED_P (insn) = 1;
6255 }
6256
6257 /* Generate and return an instruction to store the pair of registers
6258 REG and REG2 of mode MODE to location BASE with write-back adjusting
6259 the stack location BASE by ADJUSTMENT. */
6260
6261 static rtx
6262 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6263 HOST_WIDE_INT adjustment)
6264 {
6265 switch (mode)
6266 {
6267 case E_DImode:
6268 return gen_storewb_pairdi_di (base, base, reg, reg2,
6269 GEN_INT (-adjustment),
6270 GEN_INT (UNITS_PER_WORD - adjustment));
6271 case E_DFmode:
6272 return gen_storewb_pairdf_di (base, base, reg, reg2,
6273 GEN_INT (-adjustment),
6274 GEN_INT (UNITS_PER_WORD - adjustment));
6275 case E_TFmode:
6276 return gen_storewb_pairtf_di (base, base, reg, reg2,
6277 GEN_INT (-adjustment),
6278 GEN_INT (UNITS_PER_VREG - adjustment));
6279 default:
6280 gcc_unreachable ();
6281 }
6282 }
6283
6284 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6285 stack pointer by ADJUSTMENT. */
6286
6287 static void
6288 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
6289 {
6290 rtx_insn *insn;
6291 machine_mode mode = aarch64_reg_save_mode (regno1);
6292
6293 if (regno2 == INVALID_REGNUM)
6294 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
6295
6296 rtx reg1 = gen_rtx_REG (mode, regno1);
6297 rtx reg2 = gen_rtx_REG (mode, regno2);
6298
6299 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
6300 reg2, adjustment));
6301 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
6302 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6303 RTX_FRAME_RELATED_P (insn) = 1;
6304 }
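/* For instance, pushing x29 and x30 with an adjustment of 96 emits
   (roughly) "stp x29, x30, [sp, #-96]!", leaving the rest of the frame
   to be addressed at positive offsets from the new stack pointer.  */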
6305
6306 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
6307 adjusting it by ADJUSTMENT afterwards. */
6308
6309 static rtx
6310 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6311 HOST_WIDE_INT adjustment)
6312 {
6313 switch (mode)
6314 {
6315 case E_DImode:
6316 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
6317 GEN_INT (UNITS_PER_WORD));
6318 case E_DFmode:
6319 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
6320 GEN_INT (UNITS_PER_WORD));
6321 case E_TFmode:
6322 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
6323 GEN_INT (UNITS_PER_VREG));
6324 default:
6325 gcc_unreachable ();
6326 }
6327 }
6328
6329 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6330 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6331 into CFI_OPS. */
6332
6333 static void
6334 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
6335 rtx *cfi_ops)
6336 {
6337 machine_mode mode = aarch64_reg_save_mode (regno1);
6338 rtx reg1 = gen_rtx_REG (mode, regno1);
6339
6340 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
6341
6342 if (regno2 == INVALID_REGNUM)
6343 {
6344 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
6345 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
6346 emit_move_insn (reg1, gen_frame_mem (mode, mem));
6347 }
6348 else
6349 {
6350 rtx reg2 = gen_rtx_REG (mode, regno2);
6351 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6352 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
6353 reg2, adjustment));
6354 }
6355 }
6356
6357 /* Generate and return a store pair instruction of mode MODE to store
6358 register REG1 to MEM1 and register REG2 to MEM2. */
6359
6360 static rtx
6361 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
6362 rtx reg2)
6363 {
6364 switch (mode)
6365 {
6366 case E_DImode:
6367 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
6368
6369 case E_DFmode:
6370 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
6371
6372 case E_TFmode:
6373 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
6374
6375 default:
6376 gcc_unreachable ();
6377 }
6378 }
6379
6380 /* Generate and return a load pair instruction of mode MODE to load register
6381 REG1 from MEM1 and register REG2 from MEM2. */
6382
6383 static rtx
6384 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
6385 rtx mem2)
6386 {
6387 switch (mode)
6388 {
6389 case E_DImode:
6390 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
6391
6392 case E_DFmode:
6393 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
6394
6395 case E_TFmode:
6396 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
6397
6398 default:
6399 gcc_unreachable ();
6400 }
6401 }
6402
6403 /* Return TRUE if return address signing should be enabled for the current
6404 function, otherwise return FALSE. */
6405
6406 bool
6407 aarch64_return_address_signing_enabled (void)
6408 {
6409 /* This function should only be called after the frame is laid out. */
6410 gcc_assert (cfun->machine->frame.laid_out);
6411
6412 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we sign a leaf function
6413 only if its LR is pushed onto the stack. */
6414 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
6415 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
6416 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
6417 }
6418
6419 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
6420 bool
6421 aarch64_bti_enabled (void)
6422 {
6423 return (aarch64_enable_bti == 1);
6424 }
6425
6426 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6427 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6428 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6429
6430 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6431 or LD1D address
6432
6433 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
6434 if the variable isn't already nonnull
6435
6436 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6437 Handle this case using a temporary base register that is suitable for
6438 all offsets in that range. Use ANCHOR_REG as this base register if it
6439 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
6440
6441 static inline void
6442 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
6443 rtx &anchor_reg, poly_int64 &offset,
6444 rtx &ptrue)
6445 {
6446 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
6447 {
6448 /* This is the maximum valid offset of the anchor from the base.
6449 Lower values would be valid too. */
6450 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
6451 if (!anchor_reg)
6452 {
6453 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6454 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6455 gen_int_mode (anchor_offset, Pmode)));
6456 }
6457 base_rtx = anchor_reg;
6458 offset -= anchor_offset;
6459 }
6460 if (!ptrue)
6461 {
6462 int pred_reg = cfun->machine->frame.spare_pred_reg;
6463 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
6464 CONSTM1_RTX (VNx16BImode));
6465 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
6466 }
6467 }
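/* For example (a sketch): saving an SVE vector at offset 9 * VL first
   sets ANCHOR_REG = BASE + 16 * VL, after which the save itself can use
   a single ST1D with an immediate offset of -7, MUL VL, which is within
   the [-8, 7] range that ST1D/LD1D addressing accepts.  */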
6468
6469 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6470 is saved at BASE + OFFSET. */
6471
6472 static void
6473 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
6474 rtx base, poly_int64 offset)
6475 {
6476 rtx mem = gen_frame_mem (GET_MODE (reg),
6477 plus_constant (Pmode, base, offset));
6478 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
6479 }
6480
6481 /* Emit code to save the callee-saved registers from register number START
6482 to LIMIT to the stack at the location starting at offset START_OFFSET,
6483 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
6484 is true if the hard frame pointer has been set up. */
6485
6486 static void
6487 aarch64_save_callee_saves (poly_int64 start_offset,
6488 unsigned start, unsigned limit, bool skip_wb,
6489 bool hard_fp_valid_p)
6490 {
6491 rtx_insn *insn;
6492 unsigned regno;
6493 unsigned regno2;
6494 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6495
6496 for (regno = aarch64_next_callee_save (start, limit);
6497 regno <= limit;
6498 regno = aarch64_next_callee_save (regno + 1, limit))
6499 {
6500 rtx reg, mem;
6501 poly_int64 offset;
6502 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6503
6504 if (skip_wb
6505 && (regno == cfun->machine->frame.wb_candidate1
6506 || regno == cfun->machine->frame.wb_candidate2))
6507 continue;
6508
6509 if (cfun->machine->reg_is_wrapped_separately[regno])
6510 continue;
6511
6512 machine_mode mode = aarch64_reg_save_mode (regno);
6513 reg = gen_rtx_REG (mode, regno);
6514 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6515 rtx base_rtx = stack_pointer_rtx;
6516 poly_int64 sp_offset = offset;
6517
6518 HOST_WIDE_INT const_offset;
6519 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6520 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6521 offset, ptrue);
6522 else if (GP_REGNUM_P (regno)
6523 && (!offset.is_constant (&const_offset) || const_offset >= 512))
6524 {
6525 gcc_assert (known_eq (start_offset, 0));
6526 poly_int64 fp_offset
6527 = cfun->machine->frame.below_hard_fp_saved_regs_size;
6528 if (hard_fp_valid_p)
6529 base_rtx = hard_frame_pointer_rtx;
6530 else
6531 {
6532 if (!anchor_reg)
6533 {
6534 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6535 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6536 gen_int_mode (fp_offset, Pmode)));
6537 }
6538 base_rtx = anchor_reg;
6539 }
6540 offset -= fp_offset;
6541 }
6542 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6543 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
6544
6545 if (!aarch64_sve_mode_p (mode)
6546 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6547 && !cfun->machine->reg_is_wrapped_separately[regno2]
6548 && known_eq (GET_MODE_SIZE (mode),
6549 cfun->machine->frame.reg_offset[regno2]
6550 - cfun->machine->frame.reg_offset[regno]))
6551 {
6552 rtx reg2 = gen_rtx_REG (mode, regno2);
6553 rtx mem2;
6554
6555 offset += GET_MODE_SIZE (mode);
6556 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6557 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
6558 reg2));
6559
6560 /* The first part of a frame-related parallel insn is
6561 always assumed to be relevant to the frame
6562 calculations; subsequent parts are only
6563 frame-related if explicitly marked. */
6564 if (aarch64_emit_cfi_for_reg_p (regno2))
6565 {
6566 if (need_cfa_note_p)
6567 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
6568 sp_offset + GET_MODE_SIZE (mode));
6569 else
6570 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6571 }
6572
6573 regno = regno2;
6574 }
6575 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6576 {
6577 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
6578 need_cfa_note_p = true;
6579 }
6580 else if (aarch64_sve_mode_p (mode))
6581 insn = emit_insn (gen_rtx_SET (mem, reg));
6582 else
6583 insn = emit_move_insn (mem, reg);
6584
6585 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6586 if (frame_related_p && need_cfa_note_p)
6587 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
6588 }
6589 }
6590
6591 /* Emit code to restore the callee registers from register number START
6592 up to and including LIMIT. Restore from the stack offset START_OFFSET,
6593 skipping any write-back candidates if SKIP_WB is true. Write the
6594 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
6595
6596 static void
6597 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
6598 unsigned limit, bool skip_wb, rtx *cfi_ops)
6599 {
6600 unsigned regno;
6601 unsigned regno2;
6602 poly_int64 offset;
6603 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6604
6605 for (regno = aarch64_next_callee_save (start, limit);
6606 regno <= limit;
6607 regno = aarch64_next_callee_save (regno + 1, limit))
6608 {
6609 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6610 if (cfun->machine->reg_is_wrapped_separately[regno])
6611 continue;
6612
6613 rtx reg, mem;
6614
6615 if (skip_wb
6616 && (regno == cfun->machine->frame.wb_candidate1
6617 || regno == cfun->machine->frame.wb_candidate2))
6618 continue;
6619
6620 machine_mode mode = aarch64_reg_save_mode (regno);
6621 reg = gen_rtx_REG (mode, regno);
6622 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6623 rtx base_rtx = stack_pointer_rtx;
6624 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6625 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6626 offset, ptrue);
6627 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6628
6629 if (!aarch64_sve_mode_p (mode)
6630 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6631 && !cfun->machine->reg_is_wrapped_separately[regno2]
6632 && known_eq (GET_MODE_SIZE (mode),
6633 cfun->machine->frame.reg_offset[regno2]
6634 - cfun->machine->frame.reg_offset[regno]))
6635 {
6636 rtx reg2 = gen_rtx_REG (mode, regno2);
6637 rtx mem2;
6638
6639 offset += GET_MODE_SIZE (mode);
6640 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6641 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6642
6643 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6644 regno = regno2;
6645 }
6646 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6647 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
6648 else if (aarch64_sve_mode_p (mode))
6649 emit_insn (gen_rtx_SET (reg, mem));
6650 else
6651 emit_move_insn (reg, mem);
6652 if (frame_related_p)
6653 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
6654 }
6655 }
6656
6657 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
6658 of MODE. */
6659
6660 static inline bool
6661 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6662 {
6663 HOST_WIDE_INT multiple;
6664 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6665 && IN_RANGE (multiple, -8, 7));
6666 }
6667
6668 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
6669 of MODE. */
6670
6671 static inline bool
6672 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6673 {
6674 HOST_WIDE_INT multiple;
6675 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6676 && IN_RANGE (multiple, 0, 63));
6677 }
6678
6679 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
6680 of MODE. */
6681
6682 bool
6683 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6684 {
6685 HOST_WIDE_INT multiple;
6686 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6687 && IN_RANGE (multiple, -64, 63));
6688 }
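/* For DImode, for instance, this accepts byte offsets in [-512, 504] in
   steps of 8, matching the immediate range of LDP/STP of X registers;
   TFmode similarly gives [-1024, 1008] in steps of 16.  */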
6689
6690 /* Return true if OFFSET is a signed 9-bit value. */
6691
6692 bool
6693 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
6694 poly_int64 offset)
6695 {
6696 HOST_WIDE_INT const_offset;
6697 return (offset.is_constant (&const_offset)
6698 && IN_RANGE (const_offset, -256, 255));
6699 }
6700
6701 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
6702 of MODE. */
6703
6704 static inline bool
6705 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6706 {
6707 HOST_WIDE_INT multiple;
6708 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6709 && IN_RANGE (multiple, -256, 255));
6710 }
6711
6712 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
6713 of MODE. */
6714
6715 static inline bool
6716 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6717 {
6718 HOST_WIDE_INT multiple;
6719 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6720 && IN_RANGE (multiple, 0, 4095));
6721 }
6722
6723 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
6724
6725 static sbitmap
6726 aarch64_get_separate_components (void)
6727 {
6728 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6729 bitmap_clear (components);
6730
6731 /* The registers we need saved to the frame. */
6732 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6733 if (aarch64_register_saved_on_entry (regno))
6734 {
6735 /* Punt on saves and restores that use ST1D and LD1D. We could
6736 try to be smarter, but it would involve making sure that the
6737 spare predicate register itself is safe to use at the save
6738 and restore points. Also, when a frame pointer is being used,
6739 the slots are often out of reach of ST1D and LD1D anyway. */
6740 machine_mode mode = aarch64_reg_save_mode (regno);
6741 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6742 continue;
6743
6744 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6745
6746 /* If the register is saved in the first SVE save slot, we use
6747 it as a stack probe for -fstack-clash-protection. */
6748 if (flag_stack_clash_protection
6749 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
6750 && known_eq (offset, 0))
6751 continue;
6752
6753 /* Get the offset relative to the register we'll use. */
6754 if (frame_pointer_needed)
6755 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6756 else
6757 offset += crtl->outgoing_args_size;
6758
6759 /* Check that we can access the stack slot of the register with one
6760 direct load with no adjustments needed. */
6761 if (aarch64_sve_mode_p (mode)
6762 ? offset_9bit_signed_scaled_p (mode, offset)
6763 : offset_12bit_unsigned_scaled_p (mode, offset))
6764 bitmap_set_bit (components, regno);
6765 }
6766
6767 /* Don't mess with the hard frame pointer. */
6768 if (frame_pointer_needed)
6769 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
6770
6771 /* If the spare predicate register used by big-endian SVE code
6772 is call-preserved, it must be saved in the main prologue
6773 before any saves that use it. */
6774 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
6775 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
6776
6777 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6778 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6779 /* If registers have been chosen to be stored/restored with
6780 writeback don't interfere with them to avoid having to output explicit
6781 stack adjustment instructions. */
6782 if (reg2 != INVALID_REGNUM)
6783 bitmap_clear_bit (components, reg2);
6784 if (reg1 != INVALID_REGNUM)
6785 bitmap_clear_bit (components, reg1);
6786
6787 bitmap_clear_bit (components, LR_REGNUM);
6788 bitmap_clear_bit (components, SP_REGNUM);
6789
6790 return components;
6791 }
6792
6793 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
6794
6795 static sbitmap
6796 aarch64_components_for_bb (basic_block bb)
6797 {
6798 bitmap in = DF_LIVE_IN (bb);
6799 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
6800 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
6801
6802 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6803 bitmap_clear (components);
6804
6805 /* Clobbered registers don't generate values in any meaningful sense,
6806 since nothing after the clobber can rely on their value. And we can't
6807 say that partially-clobbered registers are unconditionally killed,
6808 because whether they're killed or not depends on the mode of the
6809 value they're holding. Thus partially call-clobbered registers
6810 appear in neither the kill set nor the gen set.
6811
6812 Check manually for any calls that clobber more of a register than the
6813 current function can. */
6814 function_abi_aggregator callee_abis;
6815 rtx_insn *insn;
6816 FOR_BB_INSNS (bb, insn)
6817 if (CALL_P (insn))
6818 callee_abis.note_callee_abi (insn_callee_abi (insn));
6819 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
6820
6821 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
6822 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6823 if (!fixed_regs[regno]
6824 && !crtl->abi->clobbers_full_reg_p (regno)
6825 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
6826 || bitmap_bit_p (in, regno)
6827 || bitmap_bit_p (gen, regno)
6828 || bitmap_bit_p (kill, regno)))
6829 {
6830 bitmap_set_bit (components, regno);
6831
6832 /* If there is a callee-save at an adjacent offset, add it too
6833 to increase the use of LDP/STP. */
6834 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6835 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
6836
6837 if (regno2 <= LAST_SAVED_REGNUM)
6838 {
6839 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6840 if (regno < regno2
6841 ? known_eq (offset + 8, offset2)
6842 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
6843 bitmap_set_bit (components, regno2);
6844 }
6845 }
6846
6847 return components;
6848 }
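/* A worked example of the pairing logic above (register choice and offsets
   are illustrative): if x22 is live in a block and is saved at offset 16,
   and x23 is saved at the adjacent offset 24, then selecting x22 for
   separate wrapping also selects x23, so that the two saves can later be
   emitted as a single STP/LDP. */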
6849
6850 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
6851 Nothing to do for aarch64. */
6852
6853 static void
6854 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
6855 {
6856 }
6857
6858 /* Return the next set bit in BMP from START onwards. Return the total number
6859 of bits in BMP if no set bit is found at or after START. */
6860
6861 static unsigned int
6862 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
6863 {
6864 unsigned int nbits = SBITMAP_SIZE (bmp);
6865 if (start == nbits)
6866 return start;
6867
6868 gcc_assert (start < nbits);
6869 for (unsigned int i = start; i < nbits; i++)
6870 if (bitmap_bit_p (bmp, i))
6871 return i;
6872
6873 return nbits;
6874 }
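/* For example, if BMP holds 40 bits and only bits 2 and 5 are set, then
   aarch64_get_next_set_bit (bmp, 3) returns 5, while
   aarch64_get_next_set_bit (bmp, 6) returns 40 (the size of BMP). */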
6875
6876 /* Do the work for aarch64_emit_prologue_components and
6877 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6878 to save/restore; PROLOGUE_P indicates whether to emit the prologue sequence
6879 for these components or the epilogue sequence. That is, it determines
6880 whether we should emit stores or loads and what kind of CFA notes to attach
6881 to the insns. Otherwise the logic for the two sequences is very
6882 similar. */
6883
6884 static void
6885 aarch64_process_components (sbitmap components, bool prologue_p)
6886 {
6887 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6888 ? HARD_FRAME_POINTER_REGNUM
6889 : STACK_POINTER_REGNUM);
6890
6891 unsigned last_regno = SBITMAP_SIZE (components);
6892 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6893 rtx_insn *insn = NULL;
6894
6895 while (regno != last_regno)
6896 {
6897 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6898 machine_mode mode = aarch64_reg_save_mode (regno);
6899
6900 rtx reg = gen_rtx_REG (mode, regno);
6901 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6902 if (frame_pointer_needed)
6903 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6904 else
6905 offset += crtl->outgoing_args_size;
6906
6907 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6908 rtx mem = gen_frame_mem (mode, addr);
6909
6910 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6911 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6912 /* No more registers to handle after REGNO.
6913 Emit a single save/restore and exit. */
6914 if (regno2 == last_regno)
6915 {
6916 insn = emit_insn (set);
6917 if (frame_related_p)
6918 {
6919 RTX_FRAME_RELATED_P (insn) = 1;
6920 if (prologue_p)
6921 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6922 else
6923 add_reg_note (insn, REG_CFA_RESTORE, reg);
6924 }
6925 break;
6926 }
6927
6928 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6929 /* The next register is not of the same class or its offset is not
6930 mergeable with the current one into a pair. */
6931 if (aarch64_sve_mode_p (mode)
6932 || !satisfies_constraint_Ump (mem)
6933 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6934 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
6935 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6936 GET_MODE_SIZE (mode)))
6937 {
6938 insn = emit_insn (set);
6939 if (frame_related_p)
6940 {
6941 RTX_FRAME_RELATED_P (insn) = 1;
6942 if (prologue_p)
6943 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6944 else
6945 add_reg_note (insn, REG_CFA_RESTORE, reg);
6946 }
6947
6948 regno = regno2;
6949 continue;
6950 }
6951
6952 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
6953
6954 /* REGNO2 can be saved/restored in a pair with REGNO. */
6955 rtx reg2 = gen_rtx_REG (mode, regno2);
6956 if (frame_pointer_needed)
6957 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6958 else
6959 offset2 += crtl->outgoing_args_size;
6960 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6961 rtx mem2 = gen_frame_mem (mode, addr2);
6962 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6963 : gen_rtx_SET (reg2, mem2);
6964
6965 if (prologue_p)
6966 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6967 else
6968 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6969
6970 if (frame_related_p || frame_related2_p)
6971 {
6972 RTX_FRAME_RELATED_P (insn) = 1;
6973 if (prologue_p)
6974 {
6975 if (frame_related_p)
6976 add_reg_note (insn, REG_CFA_OFFSET, set);
6977 if (frame_related2_p)
6978 add_reg_note (insn, REG_CFA_OFFSET, set2);
6979 }
6980 else
6981 {
6982 if (frame_related_p)
6983 add_reg_note (insn, REG_CFA_RESTORE, reg);
6984 if (frame_related2_p)
6985 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6986 }
6987 }
6988
6989 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6990 }
6991 }
6992
6993 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6994
6995 static void
6996 aarch64_emit_prologue_components (sbitmap components)
6997 {
6998 aarch64_process_components (components, true);
6999 }
7000
7001 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
7002
7003 static void
7004 aarch64_emit_epilogue_components (sbitmap components)
7005 {
7006 aarch64_process_components (components, false);
7007 }
7008
7009 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
7010
7011 static void
7012 aarch64_set_handled_components (sbitmap components)
7013 {
7014 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7015 if (bitmap_bit_p (components, regno))
7016 cfun->machine->reg_is_wrapped_separately[regno] = true;
7017 }
7018
7019 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
7020 determine the probe offset for alloca. */
7021
7022 static HOST_WIDE_INT
7023 aarch64_stack_clash_protection_alloca_probe_range (void)
7024 {
7025 return STACK_CLASH_CALLER_GUARD;
7026 }
7027
7028
7029 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7030 registers. If POLY_SIZE is not large enough to require a probe, this function
7031 will only adjust the stack. When allocating the stack space,
7032 FRAME_RELATED_P is used to indicate whether the allocation is frame related.
7033 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7034 arguments. If we are, then we ensure that any allocation larger than the ABI
7035 defined buffer needs a probe so that the invariant of having a 1KB buffer is
7036 maintained.
7037
7038 We emit barriers after each stack adjustment to prevent optimizations from
7039 breaking the invariant that we never drop the stack more than a page. This
7040 invariant is needed to make it easier to correctly handle asynchronous
7041 events: e.g. if we were to allow the stack to be dropped by more than a page
7042 and then probe it in multiple steps, and a signal were taken somewhere in
7043 between, the signal handler would not know the state of the stack and could
7044 make no assumptions about which pages have been probed. */
7045
7046 static void
7047 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7048 poly_int64 poly_size,
7049 bool frame_related_p,
7050 bool final_adjustment_p)
7051 {
7052 HOST_WIDE_INT guard_size
7053 = 1 << param_stack_clash_protection_guard_size;
7054 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7055 HOST_WIDE_INT min_probe_threshold
7056 = (final_adjustment_p
7057 ? guard_used_by_caller
7058 : guard_size - guard_used_by_caller);
7059 /* When doing the final adjustment for the outgoing arguments, take into
7060 account any unprobed space there is above the current SP. There are
7061 two cases:
7062
7063 - When saving SVE registers below the hard frame pointer, we force
7064 the lowest save to take place in the prologue before doing the final
7065 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7066 This acts as a probe at SP, so there is no unprobed space.
7067
7068 - When there are no SVE register saves, we use the store of the link
7069 register as a probe. We can't assume that LR was saved at position 0
7070 though, so treat any space below it as unprobed. */
7071 if (final_adjustment_p
7072 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7073 {
7074 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7075 if (known_ge (lr_offset, 0))
7076 min_probe_threshold -= lr_offset.to_constant ();
7077 else
7078 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7079 }
7080
7081 poly_int64 frame_size = cfun->machine->frame.frame_size;
7082
7083 /* We should always have a positive probe threshold. */
7084 gcc_assert (min_probe_threshold > 0);
7085
7086 if (flag_stack_clash_protection && !final_adjustment_p)
7087 {
7088 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7089 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7090 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7091
7092 if (known_eq (frame_size, 0))
7093 {
7094 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7095 }
7096 else if (known_lt (initial_adjust + sve_callee_adjust,
7097 guard_size - guard_used_by_caller)
7098 && known_lt (final_adjust, guard_used_by_caller))
7099 {
7100 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7101 }
7102 }
7103
7104 /* If SIZE is not large enough to require probing, just adjust the stack and
7105 exit. */
7106 if (known_lt (poly_size, min_probe_threshold)
7107 || !flag_stack_clash_protection)
7108 {
7109 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7110 return;
7111 }
7112
7113 HOST_WIDE_INT size;
7114 /* Handle the SVE non-constant case first. */
7115 if (!poly_size.is_constant (&size))
7116 {
7117 if (dump_file)
7118 {
7119 fprintf (dump_file, "Stack clash SVE prologue: ");
7120 print_dec (poly_size, dump_file);
7121 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7122 }
7123
7124 /* First calculate the number of bytes we're actually spilling. */
7125 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7126 poly_size, temp1, temp2, false, true);
7127
7128 rtx_insn *insn = get_last_insn ();
7129
7130 if (frame_related_p)
7131 {
7132 /* This is done to provide unwinding information for the stack
7133 adjustments we're about to do. However, to prevent the optimizers
7134 from removing the R11 move and leaving the CFA note (which would be
7135 very wrong), we tie the old and new stack pointer together.
7136 The tie will expand to nothing, but the optimizers will not touch
7137 the instruction. */
7138 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7139 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7140 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7141
7142 /* We want the CFA independent of the stack pointer for the
7143 duration of the loop. */
7144 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7145 RTX_FRAME_RELATED_P (insn) = 1;
7146 }
7147
7148 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7149 rtx guard_const = gen_int_mode (guard_size, Pmode);
7150
7151 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7152 stack_pointer_rtx, temp1,
7153 probe_const, guard_const));
7154
7155 /* Now reset the CFA register if needed. */
7156 if (frame_related_p)
7157 {
7158 add_reg_note (insn, REG_CFA_DEF_CFA,
7159 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7160 gen_int_mode (poly_size, Pmode)));
7161 RTX_FRAME_RELATED_P (insn) = 1;
7162 }
7163
7164 return;
7165 }
7166
7167 if (dump_file)
7168 fprintf (dump_file,
7169 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7170 " bytes, probing will be required.\n", size);
7171
7172 /* Round size down to a multiple of guard_size, and calculate the
7173 residual as the difference between the original size and the rounded
7174 size. */
7175 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7176 HOST_WIDE_INT residual = size - rounded_size;
7177
7178 /* We can handle a small number of allocations/probes inline. Otherwise
7179 punt to a loop. */
7180 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7181 {
7182 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7183 {
7184 aarch64_sub_sp (NULL, temp2, guard_size, true);
7185 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7186 guard_used_by_caller));
7187 emit_insn (gen_blockage ());
7188 }
7189 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7190 }
7191 else
7192 {
7193 /* Compute the ending address. */
7194 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7195 temp1, NULL, false, true);
7196 rtx_insn *insn = get_last_insn ();
7197
7198 /* For the initial allocation, we don't have a frame pointer
7199 set up, so we always need CFI notes. If we're doing the
7200 final allocation, then we may have a frame pointer, in which
7201 case it is the CFA, otherwise we need CFI notes.
7202
7203 We can determine which allocation we are doing by looking at
7204 the value of FRAME_RELATED_P since the final allocations are not
7205 frame related. */
7206 if (frame_related_p)
7207 {
7208 /* We want the CFA independent of the stack pointer for the
7209 duration of the loop. */
7210 add_reg_note (insn, REG_CFA_DEF_CFA,
7211 plus_constant (Pmode, temp1, rounded_size));
7212 RTX_FRAME_RELATED_P (insn) = 1;
7213 }
7214
7215 /* This allocates and probes the stack. Note that this re-uses some of
7216 the existing Ada stack protection code. However, we are guaranteed not
7217 to enter the non-loop or residual branches of that code.
7218
7219 The non-loop part won't be entered because if our allocation amount
7220 doesn't require a loop, the case above would handle it.
7221
7222 The residual branch won't be entered because TEMP1 is a multiple of
7223 the allocation size. The residual will always be 0. As such, the only
7224 part we are actually using from that code is the loop setup. The
7225 actual probing is done in aarch64_output_probe_stack_range. */
7226 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7227 stack_pointer_rtx, temp1));
7228
7229 /* Now reset the CFA register if needed. */
7230 if (frame_related_p)
7231 {
7232 add_reg_note (insn, REG_CFA_DEF_CFA,
7233 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
7234 RTX_FRAME_RELATED_P (insn) = 1;
7235 }
7236
7237 emit_insn (gen_blockage ());
7238 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
7239 }
7240
7241 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7242 be probed. This maintains the requirement that each page is probed at
7243 least once. For initial probing we probe only if the allocation is
7244 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7245 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
7246 GUARD_SIZE. This means that for any allocation that is large enough to
7247 trigger a probe here, we'll have at least one, and for any allocation that
7248 is not large enough for this code to emit anything, the page will have been
7249 probed by the saving of FP/LR, either by this function or by any callees. If
7250 we don't have any callees then we won't have more stack adjustments and so
7251 are still safe. */
7252 if (residual)
7253 {
7254 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
7255 /* If we're doing final adjustments, and we've done any full page
7256 allocations then any residual needs to be probed. */
7257 if (final_adjustment_p && rounded_size != 0)
7258 min_probe_threshold = 0;
7259 /* If doing a small final adjustment, we always probe at offset 0.
7260 This is done to avoid issues when LR is not at position 0 or when
7261 the final adjustment is smaller than the probing offset. */
7262 else if (final_adjustment_p && rounded_size == 0)
7263 residual_probe_offset = 0;
7264
7265 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
7266 if (residual >= min_probe_threshold)
7267 {
7268 if (dump_file)
7269 fprintf (dump_file,
7270 "Stack clash AArch64 prologue residuals: "
7271 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
7272 "\n", residual);
7273
7274 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7275 residual_probe_offset));
7276 emit_insn (gen_blockage ());
7277 }
7278 }
7279 }
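/* A rough illustration of the arithmetic above, assuming the default 64KiB
   guard and the 1KiB STACK_CLASH_CALLER_GUARD: an initial allocation of
   200000 bytes gives rounded_size = 196608, which is allocated and probed in
   guard-sized steps (inline or via the loop, depending on
   STACK_CLASH_MAX_UNROLL_PAGES), and residual = 3392, which is below the
   probe threshold and therefore relies on the subsequent FP/LR saves acting
   as probes. */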
7280
7281 /* Return 1 if the register is used by the epilogue. We need to say the
7282 return register is used, but only after epilogue generation is complete.
7283 Note that in the case of sibcalls, the values "used by the epilogue" are
7284 considered live at the start of the called function.
7285
7286 For SIMD functions we need to return 1 for FP registers that are saved and
7287 restored by a function but are not zero in call_used_regs. If we do not do
7288 this, optimizations may remove the restore of the register. */
7289
7290 int
7291 aarch64_epilogue_uses (int regno)
7292 {
7293 if (epilogue_completed)
7294 {
7295 if (regno == LR_REGNUM)
7296 return 1;
7297 }
7298 return 0;
7299 }
7300
7301 /* AArch64 stack frames generated by this compiler look like:
7302
7303 +-------------------------------+
7304 | |
7305 | incoming stack arguments |
7306 | |
7307 +-------------------------------+
7308 | | <-- incoming stack pointer (aligned)
7309 | callee-allocated save area |
7310 | for register varargs |
7311 | |
7312 +-------------------------------+
7313 | local variables | <-- frame_pointer_rtx
7314 | |
7315 +-------------------------------+
7316 | padding | \
7317 +-------------------------------+ |
7318 | callee-saved registers | | frame.saved_regs_size
7319 +-------------------------------+ |
7320 | LR' | |
7321 +-------------------------------+ |
7322 | FP' | |
7323 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
7324 | SVE vector registers | | \
7325 +-------------------------------+ | | below_hard_fp_saved_regs_size
7326 | SVE predicate registers | / /
7327 +-------------------------------+
7328 | dynamic allocation |
7329 +-------------------------------+
7330 | padding |
7331 +-------------------------------+
7332 | outgoing stack arguments | <-- arg_pointer
7333 | |
7334 +-------------------------------+
7335 | | <-- stack_pointer_rtx (aligned)
7336
7337 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7338 but leave frame_pointer_rtx and hard_frame_pointer_rtx
7339 unchanged.
7340
7341 By default for stack-clash we assume the guard is at least 64KB, but this
7342 value is configurable to either 4KB or 64KB. We also force the guard size to
7343 be the same as the probing interval and both values are kept in sync.
7344
7345 With those assumptions the callee can allocate up to 63KB (or 3KB depending
7346 on the guard size) of stack space without probing.
7347
7348 When probing is needed, we emit a probe at the start of the prologue
7349 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7350
7351 We have to track how much space has been allocated and the only stores
7352 to the stack we track as implicit probes are the FP/LR stores.
7353
7354 For outgoing arguments we probe if the size is larger than 1KB, such that
7355 the ABI specified buffer is maintained for the next callee.
7356
7357 The following registers are reserved during frame layout and should not be
7358 used for any other purpose:
7359
7360 - r11: Used by stack clash protection when SVE is enabled, and also
7361 as an anchor register when saving and restoring registers
7362 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7363 - r14 and r15: Used for speculation tracking.
7364 - r16(IP0), r17(IP1): Used by indirect tailcalls.
7365 - r30(LR), r29(FP): Used by standard frame layout.
7366
7367 These registers must be avoided in frame layout related code unless the
7368 explicit intention is to interact with one of the features listed above. */
7369
7370 /* Generate the prologue instructions for entry into a function.
7371 Establish the stack frame by decreasing the stack pointer with a
7372 properly calculated size and, if necessary, create a frame record
7373 filled with the values of LR and previous frame pointer. The
7374 current FP is also set up if it is in use. */
7375
7376 void
7377 aarch64_expand_prologue (void)
7378 {
7379 poly_int64 frame_size = cfun->machine->frame.frame_size;
7380 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7381 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7382 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7383 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7384 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7385 poly_int64 below_hard_fp_saved_regs_size
7386 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7387 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7388 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7389 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
7390 rtx_insn *insn;
7391
7392 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
7393 {
7394 /* Fold the SVE allocation into the initial allocation.
7395 We don't do this in aarch64_layout_frame to avoid pessimizing
7396 the epilogue code. */
7397 initial_adjust += sve_callee_adjust;
7398 sve_callee_adjust = 0;
7399 }
7400
7401 /* Sign return address for functions. */
7402 if (aarch64_return_address_signing_enabled ())
7403 {
7404 switch (aarch64_ra_sign_key)
7405 {
7406 case AARCH64_KEY_A:
7407 insn = emit_insn (gen_paciasp ());
7408 break;
7409 case AARCH64_KEY_B:
7410 insn = emit_insn (gen_pacibsp ());
7411 break;
7412 default:
7413 gcc_unreachable ();
7414 }
7415 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7416 RTX_FRAME_RELATED_P (insn) = 1;
7417 }
7418
7419 if (flag_stack_usage_info)
7420 current_function_static_stack_size = constant_lower_bound (frame_size);
7421
7422 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7423 {
7424 if (crtl->is_leaf && !cfun->calls_alloca)
7425 {
7426 if (maybe_gt (frame_size, PROBE_INTERVAL)
7427 && maybe_gt (frame_size, get_stack_check_protect ()))
7428 aarch64_emit_probe_stack_range (get_stack_check_protect (),
7429 (frame_size
7430 - get_stack_check_protect ()));
7431 }
7432 else if (maybe_gt (frame_size, 0))
7433 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
7434 }
7435
7436 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7437 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7438
7439 /* In theory we should never have both an initial adjustment
7440 and a callee save adjustment. Verify that is the case since the
7441 code below does not handle it for -fstack-clash-protection. */
7442 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
7443
7444 /* Will only probe if the initial adjustment is larger than the guard
7445 less the amount of the guard reserved for use by the caller's
7446 outgoing args. */
7447 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
7448 true, false);
7449
7450 if (callee_adjust != 0)
7451 aarch64_push_regs (reg1, reg2, callee_adjust);
7452
7453 /* The offset of the frame chain record (if any) from the current SP. */
7454 poly_int64 chain_offset = (initial_adjust + callee_adjust
7455 - cfun->machine->frame.hard_fp_offset);
7456 gcc_assert (known_ge (chain_offset, 0));
7457
7458 /* The offset of the bottom of the save area from the current SP. */
7459 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
7460
7461 if (emit_frame_chain)
7462 {
7463 if (callee_adjust == 0)
7464 {
7465 reg1 = R29_REGNUM;
7466 reg2 = R30_REGNUM;
7467 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
7468 false, false);
7469 }
7470 else
7471 gcc_assert (known_eq (chain_offset, 0));
7472 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
7473 stack_pointer_rtx, chain_offset,
7474 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
7475 if (frame_pointer_needed && !frame_size.is_constant ())
7476 {
7477 /* Variable-sized frames need to describe the save slot
7478 address using DW_CFA_expression rather than DW_CFA_offset.
7479 This means that, without taking further action, the
7480 locations of the registers that we've already saved would
7481 remain based on the stack pointer even after we redefine
7482 the CFA based on the frame pointer. We therefore need new
7483 DW_CFA_expressions to re-express the save slots with addresses
7484 based on the frame pointer. */
7485 rtx_insn *insn = get_last_insn ();
7486 gcc_assert (RTX_FRAME_RELATED_P (insn));
7487
7488 /* Add an explicit CFA definition if this was previously
7489 implicit. */
7490 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
7491 {
7492 rtx src = plus_constant (Pmode, stack_pointer_rtx,
7493 callee_offset);
7494 add_reg_note (insn, REG_CFA_ADJUST_CFA,
7495 gen_rtx_SET (hard_frame_pointer_rtx, src));
7496 }
7497
7498 /* Change the save slot expressions for the registers that
7499 we've already saved. */
7500 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
7501 hard_frame_pointer_rtx, UNITS_PER_WORD);
7502 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
7503 hard_frame_pointer_rtx, 0);
7504 }
7505 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
7506 }
7507
7508 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
7509 callee_adjust != 0 || emit_frame_chain,
7510 emit_frame_chain);
7511 if (maybe_ne (sve_callee_adjust, 0))
7512 {
7513 gcc_assert (!flag_stack_clash_protection
7514 || known_eq (initial_adjust, 0));
7515 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
7516 sve_callee_adjust,
7517 !frame_pointer_needed, false);
7518 saved_regs_offset += sve_callee_adjust;
7519 }
7520 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
7521 false, emit_frame_chain);
7522 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
7523 callee_adjust != 0 || emit_frame_chain,
7524 emit_frame_chain);
7525
7526 /* We may need to probe the final adjustment if it is larger than the guard
7527 that is assumed by the callee. */
7528 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
7529 !frame_pointer_needed, true);
7530 }
7531
7532 /* Return TRUE if we can use a simple_return insn.
7533
7534 This function checks whether the callee saved stack is empty, which
7535 means no restore actions are needed. The pro_and_epilogue pass will use
7536 this to check whether the shrink-wrapping optimization is feasible. */
7537
7538 bool
7539 aarch64_use_return_insn_p (void)
7540 {
7541 if (!reload_completed)
7542 return false;
7543
7544 if (crtl->profile)
7545 return false;
7546
7547 return known_eq (cfun->machine->frame.frame_size, 0);
7548 }
7549
7550 /* Generate the epilogue instructions for returning from a function.
7551 This is almost exactly the reverse of the prologue sequence, except
7552 that we need to insert barriers to avoid scheduling loads that read
7553 from a deallocated stack, and we optimize the unwind records by
7554 emitting them all together if possible. */
7555 void
7556 aarch64_expand_epilogue (bool for_sibcall)
7557 {
7558 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7559 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7560 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7561 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7562 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7563 poly_int64 below_hard_fp_saved_regs_size
7564 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7565 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7566 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7567 rtx cfi_ops = NULL;
7568 rtx_insn *insn;
7569 /* A stack clash protection prologue may not have left EP0_REGNUM or
7570 EP1_REGNUM in a usable state. The same is true for allocations
7571 with an SVE component, since we then need both temporary registers
7572 for each allocation. For stack clash we are in a usable state if
7573 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
7574 HOST_WIDE_INT guard_size
7575 = 1 << param_stack_clash_protection_guard_size;
7576 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7577
7578 /* We can re-use the registers when:
7579
7580 (a) the deallocation amount is the same as the corresponding
7581 allocation amount (which is false if we combine the initial
7582 and SVE callee save allocations in the prologue); and
7583
7584 (b) the allocation amount doesn't need a probe (which is false
7585 if the amount is guard_size - guard_used_by_caller or greater).
7586
7587 In such situations the register should remain live with the correct
7588 value. */
7589 bool can_inherit_p = (initial_adjust.is_constant ()
7590 && final_adjust.is_constant ()
7591 && (!flag_stack_clash_protection
7592 || (known_lt (initial_adjust,
7593 guard_size - guard_used_by_caller)
7594 && known_eq (sve_callee_adjust, 0))));
7595
7596 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
7597 bool need_barrier_p
7598 = maybe_ne (get_frame_size ()
7599 + cfun->machine->frame.saved_varargs_size, 0);
7600
7601 /* Emit a barrier to prevent loads from a deallocated stack. */
7602 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
7603 || cfun->calls_alloca
7604 || crtl->calls_eh_return)
7605 {
7606 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7607 need_barrier_p = false;
7608 }
7609
7610 /* Restore the stack pointer from the frame pointer if it may not
7611 be the same as the stack pointer. */
7612 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7613 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7614 if (frame_pointer_needed
7615 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
7616 /* If writeback is used when restoring callee-saves, the CFA
7617 is restored on the instruction doing the writeback. */
7618 aarch64_add_offset (Pmode, stack_pointer_rtx,
7619 hard_frame_pointer_rtx,
7620 -callee_offset - below_hard_fp_saved_regs_size,
7621 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
7622 else
7623 /* The case where we need to re-use the register here is very rare, so
7624 avoid the complicated condition and just always emit a move if the
7625 immediate doesn't fit. */
7626 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
7627
7628 /* Restore the vector registers before the predicate registers,
7629 so that we can use P4 as a temporary for big-endian SVE frames. */
7630 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
7631 callee_adjust != 0, &cfi_ops);
7632 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
7633 false, &cfi_ops);
7634 if (maybe_ne (sve_callee_adjust, 0))
7635 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
7636 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
7637 R0_REGNUM, R30_REGNUM,
7638 callee_adjust != 0, &cfi_ops);
7639
7640 if (need_barrier_p)
7641 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7642
7643 if (callee_adjust != 0)
7644 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
7645
7646 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
7647 {
7648 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
7649 insn = get_last_insn ();
7650 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
7651 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
7652 RTX_FRAME_RELATED_P (insn) = 1;
7653 cfi_ops = NULL;
7654 }
7655
7656 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
7657 restrict the emit_move optimization to leaf functions. */
7658 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
7659 (!can_inherit_p || !crtl->is_leaf
7660 || df_regs_ever_live_p (EP0_REGNUM)));
7661
7662 if (cfi_ops)
7663 {
7664 /* Emit delayed restores and reset the CFA to be SP. */
7665 insn = get_last_insn ();
7666 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
7667 REG_NOTES (insn) = cfi_ops;
7668 RTX_FRAME_RELATED_P (insn) = 1;
7669 }
7670
7671 /* We prefer to emit the combined return/authenticate instruction RETAA,
7672 however, there are three cases in which we must instead emit an explicit
7673 authentication instruction.
7674
7675 1) Sibcalls don't return in a normal way, so if we're about to call one
7676 we must authenticate.
7677
7678 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
7679 generating code for !TARGET_ARMV8_3 we can't use it and must
7680 explicitly authenticate.
7681
7682 3) On an eh_return path we make extra stack adjustments to update the
7683 canonical frame address to be the exception handler's CFA. We want
7684 to authenticate using the CFA of the function which calls eh_return.
7685 */
7686 if (aarch64_return_address_signing_enabled ()
7687 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
7688 {
7689 switch (aarch64_ra_sign_key)
7690 {
7691 case AARCH64_KEY_A:
7692 insn = emit_insn (gen_autiasp ());
7693 break;
7694 case AARCH64_KEY_B:
7695 insn = emit_insn (gen_autibsp ());
7696 break;
7697 default:
7698 gcc_unreachable ();
7699 }
7700 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7701 RTX_FRAME_RELATED_P (insn) = 1;
7702 }
7703
7704 /* Stack adjustment for exception handler. */
7705 if (crtl->calls_eh_return && !for_sibcall)
7706 {
7707 /* We need to unwind the stack by the offset computed by
7708 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
7709 to be SP; letting the CFA move during this adjustment
7710 is just as correct as retaining the CFA from the body
7711 of the function. Therefore, do nothing special. */
7712 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
7713 }
7714
7715 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
7716 if (!for_sibcall)
7717 emit_jump_insn (ret_rtx);
7718 }
7719
7720 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
7721 normally or return to a previous frame after unwinding.
7722
7723 An EH return uses a single shared return sequence. The epilogue is
7724 exactly like a normal epilogue except that it has an extra input
7725 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
7726 that must be applied after the frame has been destroyed. An extra label
7727 is inserted before the epilogue which initializes this register to zero,
7728 and this is the entry point for a normal return.
7729
7730 An actual EH return updates the return address, initializes the stack
7731 adjustment and jumps directly into the epilogue (bypassing the zeroing
7732 of the adjustment). Since the return address is typically saved on the
7733 stack when a function makes a call, the saved LR must be updated outside
7734 the epilogue.
7735
7736 This poses problems as the store is generated well before the epilogue,
7737 so the offset of LR is not known yet. Also optimizations will remove the
7738 store as it appears dead, even after the epilogue is generated (as the
7739 base or offset for loading LR is different in many cases).
7740
7741 To avoid these problems this implementation forces the frame pointer
7742 in eh_return functions so that the location of LR is fixed and known early.
7743 It also marks the store volatile, so no optimization is permitted to
7744 remove the store. */
7745 rtx
7746 aarch64_eh_return_handler_rtx (void)
7747 {
7748 rtx tmp = gen_frame_mem (Pmode,
7749 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
7750
7751 /* Mark the store volatile, so no optimization is permitted to remove it. */
7752 MEM_VOLATILE_P (tmp) = true;
7753 return tmp;
7754 }
7755
7756 /* Output code to add DELTA to the first argument, and then jump
7757 to FUNCTION. Used for C++ multiple inheritance. */
7758 static void
7759 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
7760 HOST_WIDE_INT delta,
7761 HOST_WIDE_INT vcall_offset,
7762 tree function)
7763 {
7764 /* The this pointer is always in x0. Note that this differs from
7765 Arm where the this pointer may be bumped to r1 if r0 is required
7766 to return a pointer to an aggregate. On AArch64 a result value
7767 pointer will be in x8. */
7768 int this_regno = R0_REGNUM;
7769 rtx this_rtx, temp0, temp1, addr, funexp;
7770 rtx_insn *insn;
7771 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
7772
7773 if (aarch64_bti_enabled ())
7774 emit_insn (gen_bti_c());
7775
7776 reload_completed = 1;
7777 emit_note (NOTE_INSN_PROLOGUE_END);
7778
7779 this_rtx = gen_rtx_REG (Pmode, this_regno);
7780 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
7781 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
7782
7783 if (vcall_offset == 0)
7784 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
7785 else
7786 {
7787 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
7788
7789 addr = this_rtx;
7790 if (delta != 0)
7791 {
7792 if (delta >= -256 && delta < 256)
7793 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
7794 plus_constant (Pmode, this_rtx, delta));
7795 else
7796 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
7797 temp1, temp0, false);
7798 }
7799
7800 if (Pmode == ptr_mode)
7801 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
7802 else
7803 aarch64_emit_move (temp0,
7804 gen_rtx_ZERO_EXTEND (Pmode,
7805 gen_rtx_MEM (ptr_mode, addr)));
7806
7807 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
7808 addr = plus_constant (Pmode, temp0, vcall_offset);
7809 else
7810 {
7811 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
7812 Pmode);
7813 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
7814 }
7815
7816 if (Pmode == ptr_mode)
7817 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
7818 else
7819 aarch64_emit_move (temp1,
7820 gen_rtx_SIGN_EXTEND (Pmode,
7821 gen_rtx_MEM (ptr_mode, addr)));
7822
7823 emit_insn (gen_add2_insn (this_rtx, temp1));
7824 }
7825
7826 /* Generate a tail call to the target function. */
7827 if (!TREE_USED (function))
7828 {
7829 assemble_external (function);
7830 TREE_USED (function) = 1;
7831 }
7832 funexp = XEXP (DECL_RTL (function), 0);
7833 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
7834 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
7835 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
7836 SIBLING_CALL_P (insn) = 1;
7837
7838 insn = get_insns ();
7839 shorten_branches (insn);
7840
7841 assemble_start_function (thunk, fnname);
7842 final_start_function (insn, file, 1);
7843 final (insn, file, 1);
7844 final_end_function ();
7845 assemble_end_function (thunk, fnname);
7846
7847 /* Stop pretending to be a post-reload pass. */
7848 reload_completed = 0;
7849 }
7850
7851 static bool
7852 aarch64_tls_referenced_p (rtx x)
7853 {
7854 if (!TARGET_HAVE_TLS)
7855 return false;
7856 subrtx_iterator::array_type array;
7857 FOR_EACH_SUBRTX (iter, array, x, ALL)
7858 {
7859 const_rtx x = *iter;
7860 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
7861 return true;
7862 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
7863 TLS offsets, not real symbol references. */
7864 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7865 iter.skip_subrtxes ();
7866 }
7867 return false;
7868 }
7869
7870
7871 /* Return true if val can be encoded as a 12-bit unsigned immediate with
7872 a left shift of 0 or 12 bits. */
7873 bool
7874 aarch64_uimm12_shift (HOST_WIDE_INT val)
7875 {
7876 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
7877 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
7878 );
7879 }
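/* For example, 0xabc (contained in the low 12 bits) and 0xabc000 (the same
   value shifted left by 12) satisfy aarch64_uimm12_shift, whereas 0xabc0
   does not, because its set bits straddle the two 12-bit fields. */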
7880
7881 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
7882 that can be created with a left shift of 0 or 12. */
7883 static HOST_WIDE_INT
7884 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
7885 {
7886 /* Check to see if the value fits in 24 bits, as that is the maximum we can
7887 handle correctly. */
7888 gcc_assert ((val & 0xffffff) == val);
7889
7890 if (((val & 0xfff) << 0) == val)
7891 return val;
7892
7893 return val & (0xfff << 12);
7894 }
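/* For example, aarch64_clamp_to_uimm12_shift (0x123456) returns 0x123000:
   the value does not fit in the low 12 bits, so only the part representable
   with a shift of 12 is kept. */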
7895
7896 /* Return true if val is an immediate that can be loaded into a
7897 register by a MOVZ instruction. */
7898 static bool
7899 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
7900 {
7901 if (GET_MODE_SIZE (mode) > 4)
7902 {
7903 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
7904 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
7905 return 1;
7906 }
7907 else
7908 {
7909 /* Ignore sign extension. */
7910 val &= (HOST_WIDE_INT) 0xffffffff;
7911 }
7912 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
7913 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
7914 }
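/* For example, in DImode, 0xbeef, 0xbeef0000, 0xbeef << 32 and 0xbeef << 48
   are all accepted, since each is a single 16-bit field at a 16-bit-aligned
   position, whereas 0x10000ffff is rejected because its set bits span two
   such fields. */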
7915
7916 /* Test whether:
7917
7918 X = (X & AND_VAL) | IOR_VAL;
7919
7920 can be implemented using:
7921
7922 MOVK X, #(IOR_VAL >> shift), LSL #shift
7923
7924 Return the shift if so, otherwise return -1. */
7925 int
7926 aarch64_movk_shift (const wide_int_ref &and_val,
7927 const wide_int_ref &ior_val)
7928 {
7929 unsigned int precision = and_val.get_precision ();
7930 unsigned HOST_WIDE_INT mask = 0xffff;
7931 for (unsigned int shift = 0; shift < precision; shift += 16)
7932 {
7933 if (and_val == ~mask && (ior_val & mask) == ior_val)
7934 return shift;
7935 mask <<= 16;
7936 }
7937 return -1;
7938 }
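/* For example, with 64-bit precision, AND_VAL == 0xffffffff0000ffff and
   IOR_VAL == 0x12340000 give a shift of 16, corresponding to
   MOVK X, #0x1234, LSL #16. */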
7939
7940 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
7941 64-bit (DImode) integer. */
7942
7943 static unsigned HOST_WIDE_INT
7944 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
7945 {
7946 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
7947 while (size < 64)
7948 {
7949 val &= (HOST_WIDE_INT_1U << size) - 1;
7950 val |= val << size;
7951 size *= 2;
7952 }
7953 return val;
7954 }
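/* For example, the QImode value 0xab is replicated to 0xabababababababab
   and the SImode value 0x00ff00ff becomes 0x00ff00ff00ff00ff. */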
7955
7956 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7957
7958 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
7959 {
7960 0x0000000100000001ull,
7961 0x0001000100010001ull,
7962 0x0101010101010101ull,
7963 0x1111111111111111ull,
7964 0x5555555555555555ull,
7965 };
7966
7967
7968 /* Return true if val is a valid bitmask immediate. */
7969
7970 bool
7971 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7972 {
7973 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7974 int bits;
7975
7976 /* Check for a single sequence of one bits and return quickly if so.
7977 The special cases of all ones and all zeroes return false. */
7978 val = aarch64_replicate_bitmask_imm (val_in, mode);
7979 tmp = val + (val & -val);
7980
7981 if (tmp == (tmp & -tmp))
7982 return (val + 1) > 1;
7983
7984 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7985 if (mode == SImode)
7986 val = (val << 32) | (val & 0xffffffff);
7987
7988 /* Invert if the immediate doesn't start with a zero bit - this means we
7989 only need to search for sequences of one bits. */
7990 if (val & 1)
7991 val = ~val;
7992
7993 /* Find the first set bit and set tmp to val with the first sequence of one
7994 bits removed. Return success if there is a single sequence of ones. */
7995 first_one = val & -val;
7996 tmp = val & (val + first_one);
7997
7998 if (tmp == 0)
7999 return true;
8000
8001 /* Find the next set bit and compute the difference in bit position. */
8002 next_one = tmp & -tmp;
8003 bits = clz_hwi (first_one) - clz_hwi (next_one);
8004 mask = val ^ tmp;
8005
8006 /* Check the bit position difference is a power of 2, and that the first
8007 sequence of one bits fits within 'bits' bits. */
8008 if ((mask >> bits) != 0 || bits != (bits & -bits))
8009 return false;
8010
8011 /* Check the sequence of one bits is repeated 64/bits times. */
8012 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
8013 }
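/* For example, for DImode, 0x0000000000fffff0 (a single contiguous run of
   ones) and 0x00ff00ff00ff00ff (an 8-bit run repeated every 16 bits) are
   valid bitmask immediates, whereas 0x0000000012345678 is not, because its
   set bits do not form a repeating pattern of contiguous ones. */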
8014
8015 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
8016 Assumed precondition: VAL_IN is not zero. */
8017
8018 unsigned HOST_WIDE_INT
8019 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
8020 {
8021 int lowest_bit_set = ctz_hwi (val_in);
8022 int highest_bit_set = floor_log2 (val_in);
8023 gcc_assert (val_in != 0);
8024
8025 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
8026 (HOST_WIDE_INT_1U << lowest_bit_set));
8027 }
8028
8029 /* Create a constant in which all bits outside the range from the lowest set
8030 bit to the highest set bit of VAL_IN are set to 1. */
8031
8032 unsigned HOST_WIDE_INT
8033 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
8034 {
8035 return val_in | ~aarch64_and_split_imm1 (val_in);
8036 }
8037
8038 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
8039
8040 bool
8041 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
8042 {
8043 scalar_int_mode int_mode;
8044 if (!is_a <scalar_int_mode> (mode, &int_mode))
8045 return false;
8046
8047 if (aarch64_bitmask_imm (val_in, int_mode))
8048 return false;
8049
8050 if (aarch64_move_imm (val_in, int_mode))
8051 return false;
8052
8053 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8054
8055 return aarch64_bitmask_imm (imm2, int_mode);
8056 }
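/* A sketch of the split that the helpers above enable (the value is
   illustrative): an AND with 0x0ff000f0, which is neither a bitmask
   immediate nor a MOV immediate, can instead be done as an AND with
   aarch64_and_split_imm1 (0x0ff000f0) == 0x0ffffff0 followed by an AND with
   aarch64_and_split_imm2 (0x0ff000f0) == 0xfffffffffff000ff, both of which
   are valid bitmask immediates. */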
8057
8058 /* Return true if val is an immediate that can be loaded into a
8059 register in a single instruction. */
8060 bool
8061 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
8062 {
8063 scalar_int_mode int_mode;
8064 if (!is_a <scalar_int_mode> (mode, &int_mode))
8065 return false;
8066
8067 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
8068 return 1;
8069 return aarch64_bitmask_imm (val, int_mode);
8070 }
8071
8072 static bool
8073 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
8074 {
8075 rtx base, offset;
8076
8077 if (GET_CODE (x) == HIGH)
8078 return true;
8079
8080 /* There's no way to calculate VL-based values using relocations. */
8081 subrtx_iterator::array_type array;
8082 FOR_EACH_SUBRTX (iter, array, x, ALL)
8083 if (GET_CODE (*iter) == CONST_POLY_INT)
8084 return true;
8085
8086 split_const (x, &base, &offset);
8087 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
8088 {
8089 if (aarch64_classify_symbol (base, INTVAL (offset))
8090 != SYMBOL_FORCE_TO_MEM)
8091 return true;
8092 else
8093 /* Avoid generating a 64-bit relocation in ILP32; leave
8094 to aarch64_expand_mov_immediate to handle it properly. */
8095 return mode != ptr_mode;
8096 }
8097
8098 return aarch64_tls_referenced_p (x);
8099 }
8100
8101 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8102 The expansion for a table switch is quite expensive due to the number
8103 of instructions, the table lookup and the hard-to-predict indirect jump.
8104 When optimizing for speed, and -O3 enabled, use the per-core tuning if
8105 set, otherwise use tables for > 16 cases as a tradeoff between size and
8106 performance. When optimizing for size, use the default setting. */
8107
8108 static unsigned int
8109 aarch64_case_values_threshold (void)
8110 {
8111 /* Use the specified limit for the number of cases before using jump
8112 tables at higher optimization levels. */
8113 if (optimize > 2
8114 && selected_cpu->tune->max_case_values != 0)
8115 return selected_cpu->tune->max_case_values;
8116 else
8117 return optimize_size ? default_case_values_threshold () : 17;
8118 }
8119
8120 /* Return true if register REGNO is a valid index register.
8121 STRICT_P is true if REG_OK_STRICT is in effect. */
8122
8123 bool
8124 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8125 {
8126 if (!HARD_REGISTER_NUM_P (regno))
8127 {
8128 if (!strict_p)
8129 return true;
8130
8131 if (!reg_renumber)
8132 return false;
8133
8134 regno = reg_renumber[regno];
8135 }
8136 return GP_REGNUM_P (regno);
8137 }
8138
8139 /* Return true if register REGNO is a valid base register for mode MODE.
8140 STRICT_P is true if REG_OK_STRICT is in effect. */
8141
8142 bool
8143 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8144 {
8145 if (!HARD_REGISTER_NUM_P (regno))
8146 {
8147 if (!strict_p)
8148 return true;
8149
8150 if (!reg_renumber)
8151 return false;
8152
8153 regno = reg_renumber[regno];
8154 }
8155
8156 /* The fake registers will be eliminated to either the stack or
8157 hard frame pointer, both of which are usually valid base registers.
8158 Reload deals with the cases where the eliminated form isn't valid. */
8159 return (GP_REGNUM_P (regno)
8160 || regno == SP_REGNUM
8161 || regno == FRAME_POINTER_REGNUM
8162 || regno == ARG_POINTER_REGNUM);
8163 }
8164
8165 /* Return true if X is a valid base register for mode MODE.
8166 STRICT_P is true if REG_OK_STRICT is in effect. */
8167
8168 static bool
8169 aarch64_base_register_rtx_p (rtx x, bool strict_p)
8170 {
8171 if (!strict_p
8172 && GET_CODE (x) == SUBREG
8173 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
8174 x = SUBREG_REG (x);
8175
8176 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8177 }
8178
8179 /* Return true if address offset is a valid index. If it is, fill in INFO
8180 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8181
8182 static bool
8183 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
8184 machine_mode mode, bool strict_p)
8185 {
8186 enum aarch64_address_type type;
8187 rtx index;
8188 int shift;
8189
8190 /* (reg:P) */
8191 if ((REG_P (x) || GET_CODE (x) == SUBREG)
8192 && GET_MODE (x) == Pmode)
8193 {
8194 type = ADDRESS_REG_REG;
8195 index = x;
8196 shift = 0;
8197 }
8198 /* (sign_extend:DI (reg:SI)) */
8199 else if ((GET_CODE (x) == SIGN_EXTEND
8200 || GET_CODE (x) == ZERO_EXTEND)
8201 && GET_MODE (x) == DImode
8202 && GET_MODE (XEXP (x, 0)) == SImode)
8203 {
8204 type = (GET_CODE (x) == SIGN_EXTEND)
8205 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8206 index = XEXP (x, 0);
8207 shift = 0;
8208 }
8209 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8210 else if (GET_CODE (x) == MULT
8211 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8212 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8213 && GET_MODE (XEXP (x, 0)) == DImode
8214 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8215 && CONST_INT_P (XEXP (x, 1)))
8216 {
8217 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8218 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8219 index = XEXP (XEXP (x, 0), 0);
8220 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8221 }
8222 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8223 else if (GET_CODE (x) == ASHIFT
8224 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8225 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8226 && GET_MODE (XEXP (x, 0)) == DImode
8227 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8228 && CONST_INT_P (XEXP (x, 1)))
8229 {
8230 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8231 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8232 index = XEXP (XEXP (x, 0), 0);
8233 shift = INTVAL (XEXP (x, 1));
8234 }
8235 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8236 else if ((GET_CODE (x) == SIGN_EXTRACT
8237 || GET_CODE (x) == ZERO_EXTRACT)
8238 && GET_MODE (x) == DImode
8239 && GET_CODE (XEXP (x, 0)) == MULT
8240 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8241 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8242 {
8243 type = (GET_CODE (x) == SIGN_EXTRACT)
8244 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8245 index = XEXP (XEXP (x, 0), 0);
8246 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8247 if (INTVAL (XEXP (x, 1)) != 32 + shift
8248 || INTVAL (XEXP (x, 2)) != 0)
8249 shift = -1;
8250 }
8251 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8252 (const_int 0xffffffff<<shift)) */
8253 else if (GET_CODE (x) == AND
8254 && GET_MODE (x) == DImode
8255 && GET_CODE (XEXP (x, 0)) == MULT
8256 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8257 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8258 && CONST_INT_P (XEXP (x, 1)))
8259 {
8260 type = ADDRESS_REG_UXTW;
8261 index = XEXP (XEXP (x, 0), 0);
8262 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8263 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8264 shift = -1;
8265 }
8266 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8267 else if ((GET_CODE (x) == SIGN_EXTRACT
8268 || GET_CODE (x) == ZERO_EXTRACT)
8269 && GET_MODE (x) == DImode
8270 && GET_CODE (XEXP (x, 0)) == ASHIFT
8271 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8272 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8273 {
8274 type = (GET_CODE (x) == SIGN_EXTRACT)
8275 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8276 index = XEXP (XEXP (x, 0), 0);
8277 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8278 if (INTVAL (XEXP (x, 1)) != 32 + shift
8279 || INTVAL (XEXP (x, 2)) != 0)
8280 shift = -1;
8281 }
8282 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8283 (const_int 0xffffffff<<shift)) */
8284 else if (GET_CODE (x) == AND
8285 && GET_MODE (x) == DImode
8286 && GET_CODE (XEXP (x, 0)) == ASHIFT
8287 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8288 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8289 && CONST_INT_P (XEXP (x, 1)))
8290 {
8291 type = ADDRESS_REG_UXTW;
8292 index = XEXP (XEXP (x, 0), 0);
8293 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8294 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8295 shift = -1;
8296 }
8297 /* (mult:P (reg:P) (const_int scale)) */
8298 else if (GET_CODE (x) == MULT
8299 && GET_MODE (x) == Pmode
8300 && GET_MODE (XEXP (x, 0)) == Pmode
8301 && CONST_INT_P (XEXP (x, 1)))
8302 {
8303 type = ADDRESS_REG_REG;
8304 index = XEXP (x, 0);
8305 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8306 }
8307 /* (ashift:P (reg:P) (const_int shift)) */
8308 else if (GET_CODE (x) == ASHIFT
8309 && GET_MODE (x) == Pmode
8310 && GET_MODE (XEXP (x, 0)) == Pmode
8311 && CONST_INT_P (XEXP (x, 1)))
8312 {
8313 type = ADDRESS_REG_REG;
8314 index = XEXP (x, 0);
8315 shift = INTVAL (XEXP (x, 1));
8316 }
8317 else
8318 return false;
8319
8320 if (!strict_p
8321 && GET_CODE (index) == SUBREG
8322 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
8323 index = SUBREG_REG (index);
8324
8325 if (aarch64_sve_data_mode_p (mode))
8326 {
8327 if (type != ADDRESS_REG_REG
8328 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
8329 return false;
8330 }
8331 else
8332 {
8333 if (shift != 0
8334 && !(IN_RANGE (shift, 1, 3)
8335 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
8336 return false;
8337 }
8338
8339 if (REG_P (index)
8340 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
8341 {
8342 info->type = type;
8343 info->offset = index;
8344 info->shift = shift;
8345 return true;
8346 }
8347
8348 return false;
8349 }
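/* For example, an index of the form (ashift:DI (reg:DI) (const_int 3)) used
   with an 8-byte mode such as DImode is classified above as ADDRESS_REG_REG
   with a shift of 3, corresponding to the [Xn, Xm, LSL #3] addressing
   form. */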
8350
8351 /* Return true if MODE is one of the modes for which we
8352 support LDP/STP operations. */
8353
8354 static bool
8355 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
8356 {
8357 return mode == SImode || mode == DImode
8358 || mode == SFmode || mode == DFmode
8359 || (aarch64_vector_mode_supported_p (mode)
8360 && (known_eq (GET_MODE_SIZE (mode), 8)
8361 || (known_eq (GET_MODE_SIZE (mode), 16)
8362 && (aarch64_tune_params.extra_tuning_flags
8363 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
8364 }
8365
8366 /* Return true if REGNO is a virtual pointer register, or an eliminable
8367 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
8368 include stack_pointer or hard_frame_pointer. */
8369 static bool
8370 virt_or_elim_regno_p (unsigned regno)
8371 {
8372 return ((regno >= FIRST_VIRTUAL_REGISTER
8373 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
8374 || regno == FRAME_POINTER_REGNUM
8375 || regno == ARG_POINTER_REGNUM);
8376 }
8377
8378 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8379 If it is, fill in INFO appropriately. STRICT_P is true if
8380 REG_OK_STRICT is in effect. */
8381
8382 bool
8383 aarch64_classify_address (struct aarch64_address_info *info,
8384 rtx x, machine_mode mode, bool strict_p,
8385 aarch64_addr_query_type type)
8386 {
8387 enum rtx_code code = GET_CODE (x);
8388 rtx op0, op1;
8389 poly_int64 offset;
8390
8391 HOST_WIDE_INT const_size;
8392
8393 /* Whether a vector mode is partial doesn't affect address legitimacy.
8394 Partial vectors like VNx8QImode allow the same indexed addressing
8395 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8396 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8397 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8398 vec_flags &= ~VEC_PARTIAL;
8399
8400 /* On BE, we use load/store pair for all large int mode load/stores.
8401 TI/TFmode may also use a load/store pair. */
8402 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
8403 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
8404 || type == ADDR_QUERY_LDP_STP_N
8405 || mode == TImode
8406 || mode == TFmode
8407 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
8408
8409   /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
8410      corresponds to the actual size of the memory being loaded/stored and the
8411      mode of the corresponding addressing mode is half of that.

8412 if (type == ADDR_QUERY_LDP_STP_N
8413 && known_eq (GET_MODE_SIZE (mode), 16))
8414 mode = DFmode;
8415
8416 bool allow_reg_index_p = (!load_store_pair_p
8417 && (known_lt (GET_MODE_SIZE (mode), 16)
8418 || vec_flags == VEC_ADVSIMD
8419 || vec_flags & VEC_SVE_DATA));
8420
8421 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8422 [Rn, #offset, MUL VL]. */
8423 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
8424 && (code != REG && code != PLUS))
8425 return false;
8426
8427 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8428 REG addressing. */
8429 if (advsimd_struct_p
8430 && !BYTES_BIG_ENDIAN
8431 && (code != POST_INC && code != REG))
8432 return false;
8433
8434 gcc_checking_assert (GET_MODE (x) == VOIDmode
8435 || SCALAR_INT_MODE_P (GET_MODE (x)));
8436
8437 switch (code)
8438 {
8439 case REG:
8440 case SUBREG:
8441 info->type = ADDRESS_REG_IMM;
8442 info->base = x;
8443 info->offset = const0_rtx;
8444 info->const_offset = 0;
8445 return aarch64_base_register_rtx_p (x, strict_p);
8446
8447 case PLUS:
8448 op0 = XEXP (x, 0);
8449 op1 = XEXP (x, 1);
8450
8451 if (! strict_p
8452 && REG_P (op0)
8453 && virt_or_elim_regno_p (REGNO (op0))
8454 && poly_int_rtx_p (op1, &offset))
8455 {
8456 info->type = ADDRESS_REG_IMM;
8457 info->base = op0;
8458 info->offset = op1;
8459 info->const_offset = offset;
8460
8461 return true;
8462 }
8463
8464 if (maybe_ne (GET_MODE_SIZE (mode), 0)
8465 && aarch64_base_register_rtx_p (op0, strict_p)
8466 && poly_int_rtx_p (op1, &offset))
8467 {
8468 info->type = ADDRESS_REG_IMM;
8469 info->base = op0;
8470 info->offset = op1;
8471 info->const_offset = offset;
8472
8473 /* TImode and TFmode values are allowed in both pairs of X
8474 registers and individual Q registers. The available
8475 address modes are:
8476 X,X: 7-bit signed scaled offset
8477 Q: 9-bit signed offset
8478 We conservatively require an offset representable in either mode.
8479 When performing the check for pairs of X registers i.e. LDP/STP
8480 pass down DImode since that is the natural size of the LDP/STP
8481 instruction memory accesses. */
8482 if (mode == TImode || mode == TFmode)
8483 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
8484 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8485 || offset_12bit_unsigned_scaled_p (mode, offset)));
8486
8487 	  /* A 7-bit offset check because OImode will emit an ldp/stp
8488 instruction (only big endian will get here).
8489 For ldp/stp instructions, the offset is scaled for the size of a
8490 single element of the pair. */
8491 if (mode == OImode)
8492 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
8493
8494 	  /* Three 9/12-bit offset checks because CImode will emit three
8495 ldr/str instructions (only big endian will get here). */
8496 if (mode == CImode)
8497 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8498 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
8499 offset + 32)
8500 || offset_12bit_unsigned_scaled_p (V16QImode,
8501 offset + 32)));
8502
8503 	  /* Two 7-bit offset checks because XImode will emit two ldp/stp
8504 instructions (only big endian will get here). */
8505 if (mode == XImode)
8506 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8507 && aarch64_offset_7bit_signed_scaled_p (TImode,
8508 offset + 32));
8509
8510 /* Make "m" use the LD1 offset range for SVE data modes, so
8511 that pre-RTL optimizers like ivopts will work to that
8512 instead of the wider LDR/STR range. */
8513 if (vec_flags == VEC_SVE_DATA)
8514 return (type == ADDR_QUERY_M
8515 ? offset_4bit_signed_scaled_p (mode, offset)
8516 : offset_9bit_signed_scaled_p (mode, offset));
8517
8518 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
8519 {
8520 poly_int64 end_offset = (offset
8521 + GET_MODE_SIZE (mode)
8522 - BYTES_PER_SVE_VECTOR);
8523 return (type == ADDR_QUERY_M
8524 ? offset_4bit_signed_scaled_p (mode, offset)
8525 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
8526 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
8527 end_offset)));
8528 }
8529
8530 if (vec_flags == VEC_SVE_PRED)
8531 return offset_9bit_signed_scaled_p (mode, offset);
8532
8533 if (load_store_pair_p)
8534 return ((known_eq (GET_MODE_SIZE (mode), 4)
8535 || known_eq (GET_MODE_SIZE (mode), 8)
8536 || known_eq (GET_MODE_SIZE (mode), 16))
8537 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8538 else
8539 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8540 || offset_12bit_unsigned_scaled_p (mode, offset));
8541 }
8542
8543 if (allow_reg_index_p)
8544 {
8545 /* Look for base + (scaled/extended) index register. */
8546 if (aarch64_base_register_rtx_p (op0, strict_p)
8547 && aarch64_classify_index (info, op1, mode, strict_p))
8548 {
8549 info->base = op0;
8550 return true;
8551 }
8552 if (aarch64_base_register_rtx_p (op1, strict_p)
8553 && aarch64_classify_index (info, op0, mode, strict_p))
8554 {
8555 info->base = op1;
8556 return true;
8557 }
8558 }
8559
8560 return false;
8561
8562 case POST_INC:
8563 case POST_DEC:
8564 case PRE_INC:
8565 case PRE_DEC:
8566 info->type = ADDRESS_REG_WB;
8567 info->base = XEXP (x, 0);
8568 info->offset = NULL_RTX;
8569 return aarch64_base_register_rtx_p (info->base, strict_p);
8570
8571 case POST_MODIFY:
8572 case PRE_MODIFY:
8573 info->type = ADDRESS_REG_WB;
8574 info->base = XEXP (x, 0);
8575 if (GET_CODE (XEXP (x, 1)) == PLUS
8576 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
8577 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
8578 && aarch64_base_register_rtx_p (info->base, strict_p))
8579 {
8580 info->offset = XEXP (XEXP (x, 1), 1);
8581 info->const_offset = offset;
8582
8583 /* TImode and TFmode values are allowed in both pairs of X
8584 registers and individual Q registers. The available
8585 address modes are:
8586 X,X: 7-bit signed scaled offset
8587 Q: 9-bit signed offset
8588 We conservatively require an offset representable in either mode.
8589 */
8590 if (mode == TImode || mode == TFmode)
8591 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
8592 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
8593
8594 if (load_store_pair_p)
8595 return ((known_eq (GET_MODE_SIZE (mode), 4)
8596 || known_eq (GET_MODE_SIZE (mode), 8)
8597 || known_eq (GET_MODE_SIZE (mode), 16))
8598 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8599 else
8600 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
8601 }
8602 return false;
8603
8604 case CONST:
8605 case SYMBOL_REF:
8606 case LABEL_REF:
8607 /* load literal: pc-relative constant pool entry. Only supported
8608 for SI mode or larger. */
8609 info->type = ADDRESS_SYMBOLIC;
8610
8611 if (!load_store_pair_p
8612 && GET_MODE_SIZE (mode).is_constant (&const_size)
8613 && const_size >= 4)
8614 {
8615 rtx sym, addend;
8616
8617 split_const (x, &sym, &addend);
8618 return ((GET_CODE (sym) == LABEL_REF
8619 || (GET_CODE (sym) == SYMBOL_REF
8620 && CONSTANT_POOL_ADDRESS_P (sym)
8621 && aarch64_pcrelative_literal_loads)));
8622 }
8623 return false;
8624
8625 case LO_SUM:
8626 info->type = ADDRESS_LO_SUM;
8627 info->base = XEXP (x, 0);
8628 info->offset = XEXP (x, 1);
8629 if (allow_reg_index_p
8630 && aarch64_base_register_rtx_p (info->base, strict_p))
8631 {
8632 rtx sym, offs;
8633 split_const (info->offset, &sym, &offs);
8634 if (GET_CODE (sym) == SYMBOL_REF
8635 && (aarch64_classify_symbol (sym, INTVAL (offs))
8636 == SYMBOL_SMALL_ABSOLUTE))
8637 {
8638 /* The symbol and offset must be aligned to the access size. */
8639 unsigned int align;
8640
8641 if (CONSTANT_POOL_ADDRESS_P (sym))
8642 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
8643 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
8644 {
8645 tree exp = SYMBOL_REF_DECL (sym);
8646 align = TYPE_ALIGN (TREE_TYPE (exp));
8647 align = aarch64_constant_alignment (exp, align);
8648 }
8649 else if (SYMBOL_REF_DECL (sym))
8650 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
8651 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
8652 && SYMBOL_REF_BLOCK (sym) != NULL)
8653 align = SYMBOL_REF_BLOCK (sym)->alignment;
8654 else
8655 align = BITS_PER_UNIT;
8656
8657 poly_int64 ref_size = GET_MODE_SIZE (mode);
8658 if (known_eq (ref_size, 0))
8659 ref_size = GET_MODE_SIZE (DImode);
8660
8661 return (multiple_p (INTVAL (offs), ref_size)
8662 && multiple_p (align / BITS_PER_UNIT, ref_size));
8663 }
8664 }
8665 return false;
8666
8667 default:
8668 return false;
8669 }
8670 }
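
/* Illustrative sketch (editorial, assuming LP64; register names are
   arbitrary): the classifier above accepts, among others, the forms

     ADDRESS_REG_IMM	[x0] or [x0, 16]
     ADDRESS_REG_REG	[x0, x1] or [x0, x1, lsl 3]
     ADDRESS_REG_UXTW	[x0, w1, uxtw 2]
     ADDRESS_REG_SXTW	[x0, w1, sxtw 2]
     ADDRESS_REG_WB	[x0, 16]! or [x0], 16
     ADDRESS_LO_SUM	[x0, #:lo12:symbol]
     ADDRESS_SYMBOLIC	a PC-relative literal-pool reference

   with SVE data and predicate modes additionally using scaled
   [x0, #imm, mul vl] offsets.  */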
8671
8672 /* Return true if the address X is valid for a PRFM instruction.
8673 STRICT_P is true if we should do strict checking with
8674 aarch64_classify_address. */
8675
8676 bool
8677 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
8678 {
8679 struct aarch64_address_info addr;
8680
8681 /* PRFM accepts the same addresses as DImode... */
8682 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
8683 if (!res)
8684 return false;
8685
8686 /* ... except writeback forms. */
8687 return addr.type != ADDRESS_REG_WB;
8688 }
8689
8690 bool
8691 aarch64_symbolic_address_p (rtx x)
8692 {
8693 rtx offset;
8694
8695 split_const (x, &x, &offset);
8696 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
8697 }
8698
8699 /* Classify the base of symbolic expression X. */
8700
8701 enum aarch64_symbol_type
8702 aarch64_classify_symbolic_expression (rtx x)
8703 {
8704 rtx offset;
8705
8706 split_const (x, &x, &offset);
8707 return aarch64_classify_symbol (x, INTVAL (offset));
8708 }
8709
8710
8711 /* Return TRUE if X is a legitimate address for accessing memory in
8712 mode MODE. */
8713 static bool
8714 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
8715 {
8716 struct aarch64_address_info addr;
8717
8718 return aarch64_classify_address (&addr, x, mode, strict_p);
8719 }
8720
8721 /* Return TRUE if X is a legitimate address of type TYPE for accessing
8722 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
8723 bool
8724 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
8725 aarch64_addr_query_type type)
8726 {
8727 struct aarch64_address_info addr;
8728
8729 return aarch64_classify_address (&addr, x, mode, strict_p, type);
8730 }
8731
8732 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
8733
8734 static bool
8735 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
8736 poly_int64 orig_offset,
8737 machine_mode mode)
8738 {
8739 HOST_WIDE_INT size;
8740 if (GET_MODE_SIZE (mode).is_constant (&size))
8741 {
8742 HOST_WIDE_INT const_offset, second_offset;
8743
8744 /* A general SVE offset is A * VQ + B. Remove the A component from
8745 coefficient 0 in order to get the constant B. */
8746 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
8747
8748 /* Split an out-of-range address displacement into a base and
8749 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
8750 range otherwise to increase opportunities for sharing the base
8751 address of different sizes. Unaligned accesses use the signed
8752 9-bit range, TImode/TFmode use the intersection of signed
8753 scaled 7-bit and signed 9-bit offset. */
8754 if (mode == TImode || mode == TFmode)
8755 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
8756 else if ((const_offset & (size - 1)) != 0)
8757 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
8758 else
8759 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
8760
8761 if (second_offset == 0 || known_eq (orig_offset, second_offset))
8762 return false;
8763
8764 /* Split the offset into second_offset and the rest. */
8765 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8766 *offset2 = gen_int_mode (second_offset, Pmode);
8767 return true;
8768 }
8769 else
8770 {
8771 /* Get the mode we should use as the basis of the range. For structure
8772 modes this is the mode of one vector. */
8773 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8774 machine_mode step_mode
8775 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
8776
8777 /* Get the "mul vl" multiplier we'd like to use. */
8778 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
8779 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
8780 if (vec_flags & VEC_SVE_DATA)
8781 /* LDR supports a 9-bit range, but the move patterns for
8782 structure modes require all vectors to be in range of the
8783 	   same base.  The simplest way of accommodating that while still
8784 promoting reuse of anchor points between different modes is
8785 to use an 8-bit range unconditionally. */
8786 vnum = ((vnum + 128) & 255) - 128;
8787 else
8788 /* Predicates are only handled singly, so we might as well use
8789 the full range. */
8790 vnum = ((vnum + 256) & 511) - 256;
8791 if (vnum == 0)
8792 return false;
8793
8794 /* Convert the "mul vl" multiplier into a byte offset. */
8795 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
8796 if (known_eq (second_offset, orig_offset))
8797 return false;
8798
8799 /* Split the offset into second_offset and the rest. */
8800 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8801 *offset2 = gen_int_mode (second_offset, Pmode);
8802 return true;
8803 }
8804 }
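
/* Worked example (editorial, illustrative only): for a constant-size
   DImode access at offset 0x10008, the offset is a multiple of the access
   size, so second_offset = 0x10008 & 0x3ffc = 0x8 and the displacement is
   split as 0x10000 + 0x8; the large part can then be shared between
   neighbouring accesses while the small part stays within the scaled
   LDR/STR immediate range.  */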
8805
8806 /* Return the binary representation of floating point constant VALUE in INTVAL.
8807 If the value cannot be converted, return false without setting INTVAL.
8808 The conversion is done in the given MODE. */
8809 bool
8810 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
8811 {
8812
8813 /* We make a general exception for 0. */
8814 if (aarch64_float_const_zero_rtx_p (value))
8815 {
8816 *intval = 0;
8817 return true;
8818 }
8819
8820 scalar_float_mode mode;
8821 if (GET_CODE (value) != CONST_DOUBLE
8822 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
8823 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
8824 /* Only support up to DF mode. */
8825 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
8826 return false;
8827
8828 unsigned HOST_WIDE_INT ival = 0;
8829
8830 long res[2];
8831 real_to_target (res,
8832 CONST_DOUBLE_REAL_VALUE (value),
8833 REAL_MODE_FORMAT (mode));
8834
8835 if (mode == DFmode)
8836 {
8837 int order = BYTES_BIG_ENDIAN ? 1 : 0;
8838 ival = zext_hwi (res[order], 32);
8839 ival |= (zext_hwi (res[1 - order], 32) << 32);
8840 }
8841 else
8842 ival = zext_hwi (res[0], 32);
8843
8844 *intval = ival;
8845 return true;
8846 }
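
/* Illustrative example (editorial): for the DFmode constant 1.0 the
   routine above stores 0x3ff0000000000000 in *INTVAL, and for the SFmode
   constant 1.0f it stores 0x3f800000, i.e. the IEEE-754 bit patterns of
   the values.  */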
8847
8848 /* Return TRUE if rtx X is an immediate constant that can be moved using a
8849 single MOV(+MOVK) followed by an FMOV. */
8850 bool
8851 aarch64_float_const_rtx_p (rtx x)
8852 {
8853 machine_mode mode = GET_MODE (x);
8854 if (mode == VOIDmode)
8855 return false;
8856
8857 /* Determine whether it's cheaper to write float constants as
8858      mov/movk pairs than as adrp/ldr pairs.  */
8859 unsigned HOST_WIDE_INT ival;
8860
8861 if (GET_CODE (x) == CONST_DOUBLE
8862 && SCALAR_FLOAT_MODE_P (mode)
8863 && aarch64_reinterpret_float_as_int (x, &ival))
8864 {
8865 scalar_int_mode imode = (mode == HFmode
8866 ? SImode
8867 : int_mode_for_mode (mode).require ());
8868 int num_instr = aarch64_internal_mov_immediate
8869 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8870 return num_instr < 3;
8871 }
8872
8873 return false;
8874 }
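
/* Illustrative example (editorial, not authoritative): the DFmode
   constant 1.5 has the bit pattern 0x3ff8000000000000, which a single
   "mov x0, #0x3ff8000000000000" (a MOVZ with shift) can materialize, so
   the predicate above returns true for it; constants whose bit patterns
   need three or more MOV/MOVK instructions return false.  */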
8875
8876 /* Return TRUE if rtx X is immediate constant 0.0 */
8877 bool
8878 aarch64_float_const_zero_rtx_p (rtx x)
8879 {
8880 if (GET_MODE (x) == VOIDmode)
8881 return false;
8882
8883 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
8884 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
8885 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
8886 }
8887
8888 /* Return TRUE if rtx X is immediate constant that fits in a single
8889 MOVI immediate operation. */
8890 bool
8891 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
8892 {
8893 if (!TARGET_SIMD)
8894 return false;
8895
8896 machine_mode vmode;
8897 scalar_int_mode imode;
8898 unsigned HOST_WIDE_INT ival;
8899
8900 if (GET_CODE (x) == CONST_DOUBLE
8901 && SCALAR_FLOAT_MODE_P (mode))
8902 {
8903 if (!aarch64_reinterpret_float_as_int (x, &ival))
8904 return false;
8905
8906 /* We make a general exception for 0. */
8907 if (aarch64_float_const_zero_rtx_p (x))
8908 return true;
8909
8910 imode = int_mode_for_mode (mode).require ();
8911 }
8912 else if (GET_CODE (x) == CONST_INT
8913 && is_a <scalar_int_mode> (mode, &imode))
8914 ival = INTVAL (x);
8915 else
8916 return false;
8917
8918   /* Use a 64-bit mode for everything except for DI/DF mode, where we use
8919      a 128-bit vector mode.  */
8920 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
8921
8922 vmode = aarch64_simd_container_mode (imode, width);
8923 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
8924
8925 return aarch64_simd_valid_immediate (v_op, NULL);
8926 }
8927
8928
8929 /* Return the fixed registers used for condition codes. */
8930
8931 static bool
8932 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
8933 {
8934 *p1 = CC_REGNUM;
8935 *p2 = INVALID_REGNUM;
8936 return true;
8937 }
8938
8939 /* This function is used by the call expanders of the machine description.
8940 RESULT is the register in which the result is returned. It's NULL for
8941 "call" and "sibcall".
8942 MEM is the location of the function call.
8943 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
8944    SIBCALL indicates whether this function call is a normal call or a sibling
8945    call.  It will generate a different pattern accordingly.  */
8946
8947 void
8948 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
8949 {
8950 rtx call, callee, tmp;
8951 rtvec vec;
8952 machine_mode mode;
8953
8954 gcc_assert (MEM_P (mem));
8955 callee = XEXP (mem, 0);
8956 mode = GET_MODE (callee);
8957 gcc_assert (mode == Pmode);
8958
8959 /* Decide if we should generate indirect calls by loading the
8960 address of the callee into a register before performing
8961 the branch-and-link. */
8962 if (SYMBOL_REF_P (callee)
8963 ? (aarch64_is_long_call_p (callee)
8964 || aarch64_is_noplt_call_p (callee))
8965 : !REG_P (callee))
8966 XEXP (mem, 0) = force_reg (mode, callee);
8967
8968 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8969
8970 if (result != NULL_RTX)
8971 call = gen_rtx_SET (result, call);
8972
8973 if (sibcall)
8974 tmp = ret_rtx;
8975 else
8976 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8977
8978 gcc_assert (CONST_INT_P (callee_abi));
8979 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
8980 UNSPEC_CALLEE_ABI);
8981
8982 vec = gen_rtvec (3, call, callee_abi, tmp);
8983 call = gen_rtx_PARALLEL (VOIDmode, vec);
8984
8985 aarch64_emit_call_insn (call);
8986 }
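
/* Illustrative sketch (editorial): for a plain call the pattern emitted
   above has roughly the shape

     (parallel [(call (mem (symbol_ref "foo")) (const_int 0))
		(unspec:DI [(const_int 0)] UNSPEC_CALLEE_ABI)
		(clobber (reg:DI LR_REGNUM))])

   where the CALL is wrapped in a SET of the result register when RESULT
   is given, and a RETURN replaces the LR clobber for sibling calls.  */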
8987
8988 /* Emit call insn with PAT and do aarch64-specific handling. */
8989
8990 void
8991 aarch64_emit_call_insn (rtx pat)
8992 {
8993 rtx insn = emit_call_insn (pat);
8994
8995 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8996 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8997 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8998 }
8999
9000 machine_mode
9001 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
9002 {
9003 machine_mode mode_x = GET_MODE (x);
9004 rtx_code code_x = GET_CODE (x);
9005
9006 /* All floating point compares return CCFP if it is an equality
9007 comparison, and CCFPE otherwise. */
9008 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
9009 {
9010 switch (code)
9011 {
9012 case EQ:
9013 case NE:
9014 case UNORDERED:
9015 case ORDERED:
9016 case UNLT:
9017 case UNLE:
9018 case UNGT:
9019 case UNGE:
9020 case UNEQ:
9021 return CCFPmode;
9022
9023 case LT:
9024 case LE:
9025 case GT:
9026 case GE:
9027 case LTGT:
9028 return CCFPEmode;
9029
9030 default:
9031 gcc_unreachable ();
9032 }
9033 }
9034
9035 /* Equality comparisons of short modes against zero can be performed
9036 using the TST instruction with the appropriate bitmask. */
9037 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
9038 && (code == EQ || code == NE)
9039 && (mode_x == HImode || mode_x == QImode))
9040 return CC_NZmode;
9041
9042 /* Similarly, comparisons of zero_extends from shorter modes can
9043 be performed using an ANDS with an immediate mask. */
9044 if (y == const0_rtx && code_x == ZERO_EXTEND
9045 && (mode_x == SImode || mode_x == DImode)
9046 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
9047 && (code == EQ || code == NE))
9048 return CC_NZmode;
9049
9050 if ((mode_x == SImode || mode_x == DImode)
9051 && y == const0_rtx
9052 && (code == EQ || code == NE || code == LT || code == GE)
9053 && (code_x == PLUS || code_x == MINUS || code_x == AND
9054 || code_x == NEG
9055 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
9056 && CONST_INT_P (XEXP (x, 2)))))
9057 return CC_NZmode;
9058
9059 /* A compare with a shifted operand. Because of canonicalization,
9060 the comparison will have to be swapped when we emit the assembly
9061 code. */
9062 if ((mode_x == SImode || mode_x == DImode)
9063 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
9064 && (code_x == ASHIFT || code_x == ASHIFTRT
9065 || code_x == LSHIFTRT
9066 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
9067 return CC_SWPmode;
9068
9069 /* Similarly for a negated operand, but we can only do this for
9070 equalities. */
9071 if ((mode_x == SImode || mode_x == DImode)
9072 && (REG_P (y) || GET_CODE (y) == SUBREG)
9073 && (code == EQ || code == NE)
9074 && code_x == NEG)
9075 return CC_Zmode;
9076
9077 /* A test for unsigned overflow from an addition. */
9078 if ((mode_x == DImode || mode_x == TImode)
9079 && (code == LTU || code == GEU)
9080 && code_x == PLUS
9081 && rtx_equal_p (XEXP (x, 0), y))
9082 return CC_Cmode;
9083
9084 /* A test for unsigned overflow from an add with carry. */
9085 if ((mode_x == DImode || mode_x == TImode)
9086 && (code == LTU || code == GEU)
9087 && code_x == PLUS
9088 && CONST_SCALAR_INT_P (y)
9089 && (rtx_mode_t (y, mode_x)
9090 == (wi::shwi (1, mode_x)
9091 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9092 return CC_ADCmode;
9093
9094 /* A test for signed overflow. */
9095 if ((mode_x == DImode || mode_x == TImode)
9096 && code == NE
9097 && code_x == PLUS
9098 && GET_CODE (y) == SIGN_EXTEND)
9099 return CC_Vmode;
9100
9101 /* For everything else, return CCmode. */
9102 return CCmode;
9103 }
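
/* Illustrative example (editorial): for

     int g (int a, int b) { return (a + b) == 0; }

   the equality test of (plus a b) against zero selects CC_NZmode above,
   which allows the addition and the comparison to be combined into a
   single flag-setting instruction (ADDS/CMN) rather than ADD plus CMP.  */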
9104
9105 static int
9106 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
9107
9108 int
9109 aarch64_get_condition_code (rtx x)
9110 {
9111 machine_mode mode = GET_MODE (XEXP (x, 0));
9112 enum rtx_code comp_code = GET_CODE (x);
9113
9114 if (GET_MODE_CLASS (mode) != MODE_CC)
9115 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
9116 return aarch64_get_condition_code_1 (mode, comp_code);
9117 }
9118
9119 static int
9120 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
9121 {
9122 switch (mode)
9123 {
9124 case E_CCFPmode:
9125 case E_CCFPEmode:
9126 switch (comp_code)
9127 {
9128 case GE: return AARCH64_GE;
9129 case GT: return AARCH64_GT;
9130 case LE: return AARCH64_LS;
9131 case LT: return AARCH64_MI;
9132 case NE: return AARCH64_NE;
9133 case EQ: return AARCH64_EQ;
9134 case ORDERED: return AARCH64_VC;
9135 case UNORDERED: return AARCH64_VS;
9136 case UNLT: return AARCH64_LT;
9137 case UNLE: return AARCH64_LE;
9138 case UNGT: return AARCH64_HI;
9139 case UNGE: return AARCH64_PL;
9140 default: return -1;
9141 }
9142 break;
9143
9144 case E_CCmode:
9145 switch (comp_code)
9146 {
9147 case NE: return AARCH64_NE;
9148 case EQ: return AARCH64_EQ;
9149 case GE: return AARCH64_GE;
9150 case GT: return AARCH64_GT;
9151 case LE: return AARCH64_LE;
9152 case LT: return AARCH64_LT;
9153 case GEU: return AARCH64_CS;
9154 case GTU: return AARCH64_HI;
9155 case LEU: return AARCH64_LS;
9156 case LTU: return AARCH64_CC;
9157 default: return -1;
9158 }
9159 break;
9160
9161 case E_CC_SWPmode:
9162 switch (comp_code)
9163 {
9164 case NE: return AARCH64_NE;
9165 case EQ: return AARCH64_EQ;
9166 case GE: return AARCH64_LE;
9167 case GT: return AARCH64_LT;
9168 case LE: return AARCH64_GE;
9169 case LT: return AARCH64_GT;
9170 case GEU: return AARCH64_LS;
9171 case GTU: return AARCH64_CC;
9172 case LEU: return AARCH64_CS;
9173 case LTU: return AARCH64_HI;
9174 default: return -1;
9175 }
9176 break;
9177
9178 case E_CC_NZCmode:
9179 switch (comp_code)
9180 {
9181 case NE: return AARCH64_NE; /* = any */
9182 case EQ: return AARCH64_EQ; /* = none */
9183 case GE: return AARCH64_PL; /* = nfrst */
9184 case LT: return AARCH64_MI; /* = first */
9185 case GEU: return AARCH64_CS; /* = nlast */
9186 case GTU: return AARCH64_HI; /* = pmore */
9187 case LEU: return AARCH64_LS; /* = plast */
9188 case LTU: return AARCH64_CC; /* = last */
9189 default: return -1;
9190 }
9191 break;
9192
9193 case E_CC_NZmode:
9194 switch (comp_code)
9195 {
9196 case NE: return AARCH64_NE;
9197 case EQ: return AARCH64_EQ;
9198 case GE: return AARCH64_PL;
9199 case LT: return AARCH64_MI;
9200 default: return -1;
9201 }
9202 break;
9203
9204 case E_CC_Zmode:
9205 switch (comp_code)
9206 {
9207 case NE: return AARCH64_NE;
9208 case EQ: return AARCH64_EQ;
9209 default: return -1;
9210 }
9211 break;
9212
9213 case E_CC_Cmode:
9214 switch (comp_code)
9215 {
9216 case LTU: return AARCH64_CS;
9217 case GEU: return AARCH64_CC;
9218 default: return -1;
9219 }
9220 break;
9221
9222 case E_CC_ADCmode:
9223 switch (comp_code)
9224 {
9225 case GEU: return AARCH64_CS;
9226 case LTU: return AARCH64_CC;
9227 default: return -1;
9228 }
9229 break;
9230
9231 case E_CC_Vmode:
9232 switch (comp_code)
9233 {
9234 case NE: return AARCH64_VS;
9235 case EQ: return AARCH64_VC;
9236 default: return -1;
9237 }
9238 break;
9239
9240 default:
9241 return -1;
9242 }
9243
9244 return -1;
9245 }
9246
9247 bool
9248 aarch64_const_vec_all_same_in_range_p (rtx x,
9249 HOST_WIDE_INT minval,
9250 HOST_WIDE_INT maxval)
9251 {
9252 rtx elt;
9253 return (const_vec_duplicate_p (x, &elt)
9254 && CONST_INT_P (elt)
9255 && IN_RANGE (INTVAL (elt), minval, maxval));
9256 }
9257
9258 bool
9259 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
9260 {
9261 return aarch64_const_vec_all_same_in_range_p (x, val, val);
9262 }
9263
9264 /* Return true if VEC is a constant in which every element is in the range
9265 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9266
9267 static bool
9268 aarch64_const_vec_all_in_range_p (rtx vec,
9269 HOST_WIDE_INT minval,
9270 HOST_WIDE_INT maxval)
9271 {
9272 if (GET_CODE (vec) != CONST_VECTOR
9273 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
9274 return false;
9275
9276 int nunits;
9277 if (!CONST_VECTOR_STEPPED_P (vec))
9278 nunits = const_vector_encoded_nelts (vec);
9279 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
9280 return false;
9281
9282 for (int i = 0; i < nunits; i++)
9283 {
9284 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
9285 if (!CONST_INT_P (vec_elem)
9286 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
9287 return false;
9288 }
9289 return true;
9290 }
9291
9292 /* N Z C V. */
9293 #define AARCH64_CC_V 1
9294 #define AARCH64_CC_C (1 << 1)
9295 #define AARCH64_CC_Z (1 << 2)
9296 #define AARCH64_CC_N (1 << 3)
9297
9298 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
9299 static const int aarch64_nzcv_codes[] =
9300 {
9301 0, /* EQ, Z == 1. */
9302 AARCH64_CC_Z, /* NE, Z == 0. */
9303 0, /* CS, C == 1. */
9304 AARCH64_CC_C, /* CC, C == 0. */
9305 0, /* MI, N == 1. */
9306 AARCH64_CC_N, /* PL, N == 0. */
9307 0, /* VS, V == 1. */
9308 AARCH64_CC_V, /* VC, V == 0. */
9309   0,		/* HI, C == 1 && Z == 0.  */
9310 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
9311 AARCH64_CC_V, /* GE, N == V. */
9312 0, /* LT, N != V. */
9313 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
9314 0, /* LE, !(Z == 0 && N == V). */
9315 0, /* AL, Any. */
9316 0 /* NV, Any. */
9317 };
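
/* Note (editorial, illustrative): each entry above is the NZCV setting
   that makes the corresponding condition false; e.g. the '%k' output
   modifier below prints 4 (the Z bit) for AARCH64_NE and 0 for AARCH64_EQ.
   This is the flag value a CCMP supplies when an earlier test in a
   conditional-compare chain has already failed.  */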
9318
9319 /* Print floating-point vector immediate operand X to F, negating it
9320 first if NEGATE is true. Return true on success, false if it isn't
9321 a constant we can handle. */
9322
9323 static bool
9324 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
9325 {
9326 rtx elt;
9327
9328 if (!const_vec_duplicate_p (x, &elt))
9329 return false;
9330
9331 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
9332 if (negate)
9333 r = real_value_negate (&r);
9334
9335 /* Handle the SVE single-bit immediates specially, since they have a
9336 fixed form in the assembly syntax. */
9337 if (real_equal (&r, &dconst0))
9338 asm_fprintf (f, "0.0");
9339 else if (real_equal (&r, &dconst2))
9340 asm_fprintf (f, "2.0");
9341 else if (real_equal (&r, &dconst1))
9342 asm_fprintf (f, "1.0");
9343 else if (real_equal (&r, &dconsthalf))
9344 asm_fprintf (f, "0.5");
9345 else
9346 {
9347 const int buf_size = 20;
9348 char float_buf[buf_size] = {'\0'};
9349 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
9350 1, GET_MODE (elt));
9351 asm_fprintf (f, "%s", float_buf);
9352 }
9353
9354 return true;
9355 }
9356
9357 /* Return the equivalent letter for size. */
9358 static char
9359 sizetochar (int size)
9360 {
9361 switch (size)
9362 {
9363 case 64: return 'd';
9364 case 32: return 's';
9365 case 16: return 'h';
9366 case 8 : return 'b';
9367 default: gcc_unreachable ();
9368 }
9369 }
9370
9371 /* Print operand X to file F in a target specific manner according to CODE.
9372 The acceptable formatting commands given by CODE are:
9373 'c': An integer or symbol address without a preceding #
9374 sign.
9375 'C': Take the duplicated element in a vector constant
9376 and print it in hex.
9377 'D': Take the duplicated element in a vector constant
9378 and print it as an unsigned integer, in decimal.
9379 'e': Print the sign/zero-extend size as a character 8->b,
9380 16->h, 32->w. Can also be used for masks:
9381 0xff->b, 0xffff->h, 0xffffffff->w.
9382 'I': If the operand is a duplicated vector constant,
9383 replace it with the duplicated scalar. If the
9384 operand is then a floating-point constant, replace
9385 it with the integer bit representation. Print the
9386 transformed constant as a signed decimal number.
9387 'p': Prints N such that 2^N == X (X must be power of 2 and
9388 const int).
9389 'P': Print the number of non-zero bits in X (a const_int).
9390 'H': Print the higher numbered register of a pair (TImode)
9391 of regs.
9392 'm': Print a condition (eq, ne, etc).
9393 'M': Same as 'm', but invert condition.
9394 'N': Take the duplicated element in a vector constant
9395 and print the negative of it in decimal.
9396 'b/h/s/d/q': Print a scalar FP/SIMD register name.
9397 'S/T/U/V': Print a FP/SIMD register name for a register list.
9398 The register printed is the FP/SIMD register name
9399 of X + 0/1/2/3 for S/T/U/V.
9400 'R': Print a scalar Integer/FP/SIMD register name + 1.
9401 'X': Print bottom 16 bits of integer constant in hex.
9402 'w/x': Print a general register name or the zero register
9403 (32-bit or 64-bit).
9404 '0': Print a normal operand, if it's a general register,
9405 then we assume DImode.
9406 'k': Print NZCV for conditional compare instructions.
9407 'A': Output address constant representing the first
9408 argument of X, specifying a relocation offset
9409 if appropriate.
9410 'L': Output constant address specified by X
9411 with a relocation offset if appropriate.
9412 'G': Prints address of X, specifying a PC relative
9413 relocation mode if appropriate.
9414 'y': Output address of LDP or STP - this is used for
9415 some LDP/STPs which don't use a PARALLEL in their
9416 pattern (so the mode needs to be adjusted).
9417 'z': Output address of a typical LDP or STP. */
9418
9419 static void
9420 aarch64_print_operand (FILE *f, rtx x, int code)
9421 {
9422 rtx elt;
9423 switch (code)
9424 {
9425 case 'c':
9426 switch (GET_CODE (x))
9427 {
9428 case CONST_INT:
9429 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
9430 break;
9431
9432 case SYMBOL_REF:
9433 output_addr_const (f, x);
9434 break;
9435
9436 case CONST:
9437 if (GET_CODE (XEXP (x, 0)) == PLUS
9438 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
9439 {
9440 output_addr_const (f, x);
9441 break;
9442 }
9443 /* Fall through. */
9444
9445 default:
9446 output_operand_lossage ("unsupported operand for code '%c'", code);
9447 }
9448 break;
9449
9450 case 'e':
9451 {
9452 x = unwrap_const_vec_duplicate (x);
9453 if (!CONST_INT_P (x))
9454 {
9455 output_operand_lossage ("invalid operand for '%%%c'", code);
9456 return;
9457 }
9458
9459 HOST_WIDE_INT val = INTVAL (x);
9460 if ((val & ~7) == 8 || val == 0xff)
9461 fputc ('b', f);
9462 else if ((val & ~7) == 16 || val == 0xffff)
9463 fputc ('h', f);
9464 else if ((val & ~7) == 32 || val == 0xffffffff)
9465 fputc ('w', f);
9466 else
9467 {
9468 output_operand_lossage ("invalid operand for '%%%c'", code);
9469 return;
9470 }
9471 }
9472 break;
9473
9474 case 'p':
9475 {
9476 int n;
9477
9478 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
9479 {
9480 output_operand_lossage ("invalid operand for '%%%c'", code);
9481 return;
9482 }
9483
9484 asm_fprintf (f, "%d", n);
9485 }
9486 break;
9487
9488 case 'P':
9489 if (!CONST_INT_P (x))
9490 {
9491 output_operand_lossage ("invalid operand for '%%%c'", code);
9492 return;
9493 }
9494
9495 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
9496 break;
9497
9498 case 'H':
9499 if (x == const0_rtx)
9500 {
9501 asm_fprintf (f, "xzr");
9502 break;
9503 }
9504
9505 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
9506 {
9507 output_operand_lossage ("invalid operand for '%%%c'", code);
9508 return;
9509 }
9510
9511 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
9512 break;
9513
9514 case 'I':
9515 {
9516 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
9517 if (CONST_INT_P (x))
9518 asm_fprintf (f, "%wd", INTVAL (x));
9519 else
9520 {
9521 output_operand_lossage ("invalid operand for '%%%c'", code);
9522 return;
9523 }
9524 break;
9525 }
9526
9527 case 'M':
9528 case 'm':
9529 {
9530 int cond_code;
9531 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
9532 if (x == const_true_rtx)
9533 {
9534 if (code == 'M')
9535 fputs ("nv", f);
9536 return;
9537 }
9538
9539 if (!COMPARISON_P (x))
9540 {
9541 output_operand_lossage ("invalid operand for '%%%c'", code);
9542 return;
9543 }
9544
9545 cond_code = aarch64_get_condition_code (x);
9546 gcc_assert (cond_code >= 0);
9547 if (code == 'M')
9548 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
9549 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
9550 fputs (aarch64_sve_condition_codes[cond_code], f);
9551 else
9552 fputs (aarch64_condition_codes[cond_code], f);
9553 }
9554 break;
9555
9556 case 'N':
9557 if (!const_vec_duplicate_p (x, &elt))
9558 {
9559 output_operand_lossage ("invalid vector constant");
9560 return;
9561 }
9562
9563 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9564 asm_fprintf (f, "%wd", -INTVAL (elt));
9565 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9566 && aarch64_print_vector_float_operand (f, x, true))
9567 ;
9568 else
9569 {
9570 output_operand_lossage ("invalid vector constant");
9571 return;
9572 }
9573 break;
9574
9575 case 'b':
9576 case 'h':
9577 case 's':
9578 case 'd':
9579 case 'q':
9580 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9581 {
9582 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9583 return;
9584 }
9585 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
9586 break;
9587
9588 case 'S':
9589 case 'T':
9590 case 'U':
9591 case 'V':
9592 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9593 {
9594 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9595 return;
9596 }
9597 asm_fprintf (f, "%c%d",
9598 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
9599 REGNO (x) - V0_REGNUM + (code - 'S'));
9600 break;
9601
9602 case 'R':
9603 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
9604 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
9605 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9606 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
9607 else
9608 output_operand_lossage ("incompatible register operand for '%%%c'",
9609 code);
9610 break;
9611
9612 case 'X':
9613 if (!CONST_INT_P (x))
9614 {
9615 output_operand_lossage ("invalid operand for '%%%c'", code);
9616 return;
9617 }
9618 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
9619 break;
9620
9621 case 'C':
9622 {
9623 /* Print a replicated constant in hex. */
9624 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9625 {
9626 output_operand_lossage ("invalid operand for '%%%c'", code);
9627 return;
9628 }
9629 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9630 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9631 }
9632 break;
9633
9634 case 'D':
9635 {
9636 /* Print a replicated constant in decimal, treating it as
9637 unsigned. */
9638 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9639 {
9640 output_operand_lossage ("invalid operand for '%%%c'", code);
9641 return;
9642 }
9643 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9644 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9645 }
9646 break;
9647
9648 case 'w':
9649 case 'x':
9650 if (x == const0_rtx
9651 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
9652 {
9653 asm_fprintf (f, "%czr", code);
9654 break;
9655 }
9656
9657 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9658 {
9659 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
9660 break;
9661 }
9662
9663 if (REG_P (x) && REGNO (x) == SP_REGNUM)
9664 {
9665 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
9666 break;
9667 }
9668
9669 /* Fall through */
9670
9671 case 0:
9672 if (x == NULL)
9673 {
9674 output_operand_lossage ("missing operand");
9675 return;
9676 }
9677
9678 switch (GET_CODE (x))
9679 {
9680 case REG:
9681 if (aarch64_sve_data_mode_p (GET_MODE (x)))
9682 {
9683 if (REG_NREGS (x) == 1)
9684 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
9685 else
9686 {
9687 char suffix
9688 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
9689 asm_fprintf (f, "{z%d.%c - z%d.%c}",
9690 REGNO (x) - V0_REGNUM, suffix,
9691 END_REGNO (x) - V0_REGNUM - 1, suffix);
9692 }
9693 }
9694 else
9695 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
9696 break;
9697
9698 case MEM:
9699 output_address (GET_MODE (x), XEXP (x, 0));
9700 break;
9701
9702 case LABEL_REF:
9703 case SYMBOL_REF:
9704 output_addr_const (asm_out_file, x);
9705 break;
9706
9707 case CONST_INT:
9708 asm_fprintf (f, "%wd", INTVAL (x));
9709 break;
9710
9711 case CONST:
9712 if (!VECTOR_MODE_P (GET_MODE (x)))
9713 {
9714 output_addr_const (asm_out_file, x);
9715 break;
9716 }
9717 /* fall through */
9718
9719 case CONST_VECTOR:
9720 if (!const_vec_duplicate_p (x, &elt))
9721 {
9722 output_operand_lossage ("invalid vector constant");
9723 return;
9724 }
9725
9726 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9727 asm_fprintf (f, "%wd", INTVAL (elt));
9728 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9729 && aarch64_print_vector_float_operand (f, x, false))
9730 ;
9731 else
9732 {
9733 output_operand_lossage ("invalid vector constant");
9734 return;
9735 }
9736 break;
9737
9738 case CONST_DOUBLE:
9739 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
9740 be getting CONST_DOUBLEs holding integers. */
9741 gcc_assert (GET_MODE (x) != VOIDmode);
9742 if (aarch64_float_const_zero_rtx_p (x))
9743 {
9744 fputc ('0', f);
9745 break;
9746 }
9747 else if (aarch64_float_const_representable_p (x))
9748 {
9749 #define buf_size 20
9750 char float_buf[buf_size] = {'\0'};
9751 real_to_decimal_for_mode (float_buf,
9752 CONST_DOUBLE_REAL_VALUE (x),
9753 buf_size, buf_size,
9754 1, GET_MODE (x));
9755 asm_fprintf (asm_out_file, "%s", float_buf);
9756 break;
9757 #undef buf_size
9758 }
9759 output_operand_lossage ("invalid constant");
9760 return;
9761 default:
9762 output_operand_lossage ("invalid operand");
9763 return;
9764 }
9765 break;
9766
9767 case 'A':
9768 if (GET_CODE (x) == HIGH)
9769 x = XEXP (x, 0);
9770
9771 switch (aarch64_classify_symbolic_expression (x))
9772 {
9773 case SYMBOL_SMALL_GOT_4G:
9774 asm_fprintf (asm_out_file, ":got:");
9775 break;
9776
9777 case SYMBOL_SMALL_TLSGD:
9778 asm_fprintf (asm_out_file, ":tlsgd:");
9779 break;
9780
9781 case SYMBOL_SMALL_TLSDESC:
9782 asm_fprintf (asm_out_file, ":tlsdesc:");
9783 break;
9784
9785 case SYMBOL_SMALL_TLSIE:
9786 asm_fprintf (asm_out_file, ":gottprel:");
9787 break;
9788
9789 case SYMBOL_TLSLE24:
9790 asm_fprintf (asm_out_file, ":tprel:");
9791 break;
9792
9793 case SYMBOL_TINY_GOT:
9794 gcc_unreachable ();
9795 break;
9796
9797 default:
9798 break;
9799 }
9800 output_addr_const (asm_out_file, x);
9801 break;
9802
9803 case 'L':
9804 switch (aarch64_classify_symbolic_expression (x))
9805 {
9806 case SYMBOL_SMALL_GOT_4G:
9807 asm_fprintf (asm_out_file, ":lo12:");
9808 break;
9809
9810 case SYMBOL_SMALL_TLSGD:
9811 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
9812 break;
9813
9814 case SYMBOL_SMALL_TLSDESC:
9815 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
9816 break;
9817
9818 case SYMBOL_SMALL_TLSIE:
9819 asm_fprintf (asm_out_file, ":gottprel_lo12:");
9820 break;
9821
9822 case SYMBOL_TLSLE12:
9823 asm_fprintf (asm_out_file, ":tprel_lo12:");
9824 break;
9825
9826 case SYMBOL_TLSLE24:
9827 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
9828 break;
9829
9830 case SYMBOL_TINY_GOT:
9831 asm_fprintf (asm_out_file, ":got:");
9832 break;
9833
9834 case SYMBOL_TINY_TLSIE:
9835 asm_fprintf (asm_out_file, ":gottprel:");
9836 break;
9837
9838 default:
9839 break;
9840 }
9841 output_addr_const (asm_out_file, x);
9842 break;
9843
9844 case 'G':
9845 switch (aarch64_classify_symbolic_expression (x))
9846 {
9847 case SYMBOL_TLSLE24:
9848 asm_fprintf (asm_out_file, ":tprel_hi12:");
9849 break;
9850 default:
9851 break;
9852 }
9853 output_addr_const (asm_out_file, x);
9854 break;
9855
9856 case 'k':
9857 {
9858 HOST_WIDE_INT cond_code;
9859
9860 if (!CONST_INT_P (x))
9861 {
9862 output_operand_lossage ("invalid operand for '%%%c'", code);
9863 return;
9864 }
9865
9866 cond_code = INTVAL (x);
9867 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
9868 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
9869 }
9870 break;
9871
9872 case 'y':
9873 case 'z':
9874 {
9875 machine_mode mode = GET_MODE (x);
9876
9877 if (GET_CODE (x) != MEM
9878 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
9879 {
9880 output_operand_lossage ("invalid operand for '%%%c'", code);
9881 return;
9882 }
9883
9884 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
9885 code == 'y'
9886 ? ADDR_QUERY_LDP_STP_N
9887 : ADDR_QUERY_LDP_STP))
9888 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9889 }
9890 break;
9891
9892 default:
9893 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9894 return;
9895 }
9896 }
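
/* Illustrative examples (editorial; register numbers are arbitrary):
   given (reg:SI 0) the '%w' modifier above prints "w0" and '%x' prints
   "x0"; given the CONST_INT 0x11234 the '%X' modifier prints "0x1234"
   (the low 16 bits); and '%p' prints 3 for the CONST_INT 8.  */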
9897
9898 /* Print address 'x' of a memory access with mode 'mode'.
9899    TYPE is the aarch64_addr_query_type context required by
9900    aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP operand.  */
9901 static bool
9902 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
9903 aarch64_addr_query_type type)
9904 {
9905 struct aarch64_address_info addr;
9906 unsigned int size, vec_flags;
9907
9908 /* Check all addresses are Pmode - including ILP32. */
9909 if (GET_MODE (x) != Pmode
9910 && (!CONST_INT_P (x)
9911 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
9912 {
9913 output_operand_lossage ("invalid address mode");
9914 return false;
9915 }
9916
9917 if (aarch64_classify_address (&addr, x, mode, true, type))
9918 switch (addr.type)
9919 {
9920 case ADDRESS_REG_IMM:
9921 if (known_eq (addr.const_offset, 0))
9922 {
9923 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
9924 return true;
9925 }
9926
9927 vec_flags = aarch64_classify_vector_mode (mode);
9928 if (vec_flags & VEC_ANY_SVE)
9929 {
9930 HOST_WIDE_INT vnum
9931 = exact_div (addr.const_offset,
9932 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
9933 asm_fprintf (f, "[%s, #%wd, mul vl]",
9934 reg_names[REGNO (addr.base)], vnum);
9935 return true;
9936 }
9937
9938 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
9939 INTVAL (addr.offset));
9940 return true;
9941
9942 case ADDRESS_REG_REG:
9943 if (addr.shift == 0)
9944 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
9945 reg_names [REGNO (addr.offset)]);
9946 else
9947 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
9948 reg_names [REGNO (addr.offset)], addr.shift);
9949 return true;
9950
9951 case ADDRESS_REG_UXTW:
9952 if (addr.shift == 0)
9953 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
9954 REGNO (addr.offset) - R0_REGNUM);
9955 else
9956 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
9957 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9958 return true;
9959
9960 case ADDRESS_REG_SXTW:
9961 if (addr.shift == 0)
9962 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
9963 REGNO (addr.offset) - R0_REGNUM);
9964 else
9965 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
9966 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9967 return true;
9968
9969 case ADDRESS_REG_WB:
9970 /* Writeback is only supported for fixed-width modes. */
9971 size = GET_MODE_SIZE (mode).to_constant ();
9972 switch (GET_CODE (x))
9973 {
9974 case PRE_INC:
9975 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9976 return true;
9977 case POST_INC:
9978 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9979 return true;
9980 case PRE_DEC:
9981 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9982 return true;
9983 case POST_DEC:
9984 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9985 return true;
9986 case PRE_MODIFY:
9987 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9988 INTVAL (addr.offset));
9989 return true;
9990 case POST_MODIFY:
9991 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9992 INTVAL (addr.offset));
9993 return true;
9994 default:
9995 break;
9996 }
9997 break;
9998
9999 case ADDRESS_LO_SUM:
10000 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
10001 output_addr_const (f, addr.offset);
10002 asm_fprintf (f, "]");
10003 return true;
10004
10005 case ADDRESS_SYMBOLIC:
10006 output_addr_const (f, x);
10007 return true;
10008 }
10009
10010 return false;
10011 }
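
/* Illustrative examples (editorial; register names are arbitrary):
   (plus (reg x0) (const_int 16)) prints as "[x0, 16]", (post_inc (reg x0))
   for a DImode access prints as "[x0], 8", and an SVE data-mode offset of
   one vector prints as "[x0, #1, mul vl]".  */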
10012
10013 /* Print address 'x' of a memory access with mode 'mode'. */
10014 static void
10015 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
10016 {
10017 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
10018 output_addr_const (f, x);
10019 }
10020
10021 bool
10022 aarch64_label_mentioned_p (rtx x)
10023 {
10024 const char *fmt;
10025 int i;
10026
10027 if (GET_CODE (x) == LABEL_REF)
10028 return true;
10029
10030 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
10031 referencing instruction, but they are constant offsets, not
10032 symbols. */
10033 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10034 return false;
10035
10036 fmt = GET_RTX_FORMAT (GET_CODE (x));
10037 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
10038 {
10039 if (fmt[i] == 'E')
10040 {
10041 int j;
10042
10043 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
10044 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
10045 return 1;
10046 }
10047 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
10048 return 1;
10049 }
10050
10051 return 0;
10052 }
10053
10054 /* Implement REGNO_REG_CLASS. */
10055
10056 enum reg_class
10057 aarch64_regno_regclass (unsigned regno)
10058 {
10059 if (GP_REGNUM_P (regno))
10060 return GENERAL_REGS;
10061
10062 if (regno == SP_REGNUM)
10063 return STACK_REG;
10064
10065 if (regno == FRAME_POINTER_REGNUM
10066 || regno == ARG_POINTER_REGNUM)
10067 return POINTER_REGS;
10068
10069 if (FP_REGNUM_P (regno))
10070 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10071 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
10072
10073 if (PR_REGNUM_P (regno))
10074 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10075
10076 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10077 return FFR_REGS;
10078
10079 return NO_REGS;
10080 }
10081
10082 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10083 If OFFSET is out of range, return an offset of an anchor point
10084 that is in range. Return 0 otherwise. */
10085
10086 static HOST_WIDE_INT
10087 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10088 machine_mode mode)
10089 {
10090 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10091 if (size > 16)
10092 return (offset + 0x400) & ~0x7f0;
10093
10094 /* For offsets that aren't a multiple of the access size, the limit is
10095 -256...255. */
10096 if (offset & (size - 1))
10097 {
10098 /* BLKmode typically uses LDP of X-registers. */
10099 if (mode == BLKmode)
10100 return (offset + 512) & ~0x3ff;
10101 return (offset + 0x100) & ~0x1ff;
10102 }
10103
10104 /* Small negative offsets are supported. */
10105 if (IN_RANGE (offset, -256, 0))
10106 return 0;
10107
10108 if (mode == TImode || mode == TFmode)
10109 return (offset + 0x100) & ~0x1ff;
10110
10111   /* Use a 12-bit offset scaled by the access size.  */
10112 return offset & (~0xfff * size);
10113 }
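
/* Worked example (editorial, illustrative only): for an aligned DImode
   access at offset 0x9008 the final case applies, giving an anchor of
   0x9008 & (~0xfff * 8) == 0x8000; the residual offset 0x1008 is then
   still representable as a scaled 12-bit immediate (0x1008 / 8 == 0x201).  */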
10114
10115 static rtx
10116 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
10117 {
10118 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10119 where mask is selected by alignment and size of the offset.
10120 We try to pick as large a range for the offset as possible to
10121 maximize the chance of a CSE. However, for aligned addresses
10122 we limit the range to 4k so that structures with different sized
10123 elements are likely to use the same base. We need to be careful
10124 not to split a CONST for some forms of address expression, otherwise
10125 it will generate sub-optimal code. */
10126
10127 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10128 {
10129 rtx base = XEXP (x, 0);
10130 rtx offset_rtx = XEXP (x, 1);
10131 HOST_WIDE_INT offset = INTVAL (offset_rtx);
10132
10133 if (GET_CODE (base) == PLUS)
10134 {
10135 rtx op0 = XEXP (base, 0);
10136 rtx op1 = XEXP (base, 1);
10137
10138 /* Force any scaling into a temp for CSE. */
10139 op0 = force_reg (Pmode, op0);
10140 op1 = force_reg (Pmode, op1);
10141
10142 /* Let the pointer register be in op0. */
10143 if (REG_POINTER (op1))
10144 std::swap (op0, op1);
10145
10146 /* If the pointer is virtual or frame related, then we know that
10147 virtual register instantiation or register elimination is going
10148 to apply a second constant. We want the two constants folded
10149 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10150 if (virt_or_elim_regno_p (REGNO (op0)))
10151 {
10152 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10153 NULL_RTX, true, OPTAB_DIRECT);
10154 return gen_rtx_PLUS (Pmode, base, op1);
10155 }
10156
10157 /* Otherwise, in order to encourage CSE (and thence loop strength
10158 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
10159 base = expand_binop (Pmode, add_optab, op0, op1,
10160 NULL_RTX, true, OPTAB_DIRECT);
10161 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
10162 }
10163
10164 HOST_WIDE_INT size;
10165 if (GET_MODE_SIZE (mode).is_constant (&size))
10166 {
10167 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10168 mode);
10169 if (base_offset != 0)
10170 {
10171 base = plus_constant (Pmode, base, base_offset);
10172 base = force_operand (base, NULL_RTX);
10173 return plus_constant (Pmode, base, offset - base_offset);
10174 }
10175 }
10176 }
10177
10178 return x;
10179 }
10180
10181 static reg_class_t
10182 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10183 reg_class_t rclass,
10184 machine_mode mode,
10185 secondary_reload_info *sri)
10186 {
10187 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10188 LDR and STR. See the comment at the head of aarch64-sve.md for
10189 more details about the big-endian handling. */
10190 if (reg_class_subset_p (rclass, FP_REGS)
10191 && !((REG_P (x) && HARD_REGISTER_P (x))
10192 || aarch64_simd_valid_immediate (x, NULL))
10193 && mode != VNx16QImode)
10194 {
10195 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10196 if ((vec_flags & VEC_SVE_DATA)
10197 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10198 {
10199 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10200 return NO_REGS;
10201 }
10202 }
10203
10204 /* If we have to disable direct literal pool loads and stores because the
10205 function is too big, then we need a scratch register. */
10206 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
10207 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10208 || targetm.vector_mode_supported_p (GET_MODE (x)))
10209 && !aarch64_pcrelative_literal_loads)
10210 {
10211 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
10212 return NO_REGS;
10213 }
10214
10215 /* Without the TARGET_SIMD instructions we cannot move a Q register
10216 to a Q register directly. We need a scratch. */
10217 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10218 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10219 && reg_class_subset_p (rclass, FP_REGS))
10220 {
10221 sri->icode = code_for_aarch64_reload_mov (mode);
10222 return NO_REGS;
10223 }
10224
10225   /* A TFmode or TImode memory access should be handled via FP_REGS
10226 because AArch64 has richer addressing modes for LDR/STR instructions
10227 than LDP/STP instructions. */
10228 if (TARGET_FLOAT && rclass == GENERAL_REGS
10229 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
10230 return FP_REGS;
10231
10232   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
10233 return GENERAL_REGS;
10234
10235 return NO_REGS;
10236 }
10237
10238 static bool
10239 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10240 {
10241 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10242
10243 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10244 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10245 if (frame_pointer_needed)
10246 return to == HARD_FRAME_POINTER_REGNUM;
10247 return true;
10248 }
10249
10250 poly_int64
10251 aarch64_initial_elimination_offset (unsigned from, unsigned to)
10252 {
10253 if (to == HARD_FRAME_POINTER_REGNUM)
10254 {
10255 if (from == ARG_POINTER_REGNUM)
10256 return cfun->machine->frame.hard_fp_offset;
10257
10258 if (from == FRAME_POINTER_REGNUM)
10259 return cfun->machine->frame.hard_fp_offset
10260 - cfun->machine->frame.locals_offset;
10261 }
10262
10263 if (to == STACK_POINTER_REGNUM)
10264 {
10265 if (from == FRAME_POINTER_REGNUM)
10266 return cfun->machine->frame.frame_size
10267 - cfun->machine->frame.locals_offset;
10268 }
10269
10270 return cfun->machine->frame.frame_size;
10271 }
10272
10273 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
10274 previous frame. */
10275
10276 rtx
10277 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
10278 {
10279 if (count != 0)
10280 return const0_rtx;
10281 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
10282 }
10283
10284
10285 static void
10286 aarch64_asm_trampoline_template (FILE *f)
10287 {
10288 int offset1 = 16;
10289 int offset2 = 20;
10290
10291 if (aarch64_bti_enabled ())
10292 {
10293 asm_fprintf (f, "\thint\t34 // bti c\n");
10294 offset1 -= 4;
10295 offset2 -= 4;
10296 }
10297
10298 if (TARGET_ILP32)
10299 {
10300 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
10301 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
10302 offset1);
10303 }
10304 else
10305 {
10306 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
10307 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
10308 offset2);
10309 }
10310 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
10311
10312 /* The trampoline needs an extra padding instruction. If BTI is
10313 enabled, the padding instruction is replaced by the BTI instruction at
10314 the beginning. */
10315 if (!aarch64_bti_enabled ())
10316 assemble_aligned_integer (4, const0_rtx);
10317
10318 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10319 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10320 }
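/* For reference, a rough sketch of the LP64 trampoline that the template
   above produces when BTI is not enabled (assuming, as elsewhere in this
   port, that IP1 is x17 and the static chain register is x18):

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	<4 bytes of padding>
	<8-byte slot: function address, filled in by aarch64_trampoline_init>
	<8-byte slot: static chain value, filled in by aarch64_trampoline_init>

   aarch64_trampoline_init below copies the 16 bytes of code and then stores
   the real function address and chain value into the two trailing slots.  */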
10321
10322 static void
10323 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
10324 {
10325 rtx fnaddr, mem, a_tramp;
10326 const int tramp_code_sz = 16;
10327
10328 /* We don't need to copy the trailing D-words; we fill those in below. */
10329 emit_block_move (m_tramp, assemble_trampoline_template (),
10330 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
10331 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
10332 fnaddr = XEXP (DECL_RTL (fndecl), 0);
10333 if (GET_MODE (fnaddr) != ptr_mode)
10334 fnaddr = convert_memory_address (ptr_mode, fnaddr);
10335 emit_move_insn (mem, fnaddr);
10336
10337 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
10338 emit_move_insn (mem, chain_value);
10339
10340 /* XXX We should really define a "clear_cache" pattern and use
10341 gen_clear_cache(). */
10342 a_tramp = XEXP (m_tramp, 0);
10343 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
10344 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
10345 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
10346 ptr_mode);
10347 }
10348
10349 static unsigned char
10350 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
10351 {
10352 /* ??? Logically we should only need to provide a value when
10353 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10354 can hold MODE, but at the moment we need to handle all modes.
10355 Just ignore any runtime parts for registers that can't store them. */
10356 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
10357 unsigned int nregs, vec_flags;
10358 switch (regclass)
10359 {
10360 case TAILCALL_ADDR_REGS:
10361 case POINTER_REGS:
10362 case GENERAL_REGS:
10363 case ALL_REGS:
10364 case POINTER_AND_FP_REGS:
10365 case FP_REGS:
10366 case FP_LO_REGS:
10367 case FP_LO8_REGS:
10368 vec_flags = aarch64_classify_vector_mode (mode);
10369 if ((vec_flags & VEC_SVE_DATA)
10370 && constant_multiple_p (GET_MODE_SIZE (mode),
10371 aarch64_vl_bytes (mode, vec_flags), &nregs))
10372 return nregs;
10373 return (vec_flags & VEC_ADVSIMD
10374 ? CEIL (lowest_size, UNITS_PER_VREG)
10375 : CEIL (lowest_size, UNITS_PER_WORD));
10376 case STACK_REG:
10377 case PR_REGS:
10378 case PR_LO_REGS:
10379 case PR_HI_REGS:
10380 case FFR_REGS:
10381 case PR_AND_FFR_REGS:
10382 return 1;
10383
10384 case NO_REGS:
10385 return 0;
10386
10387 default:
10388 break;
10389 }
10390 gcc_unreachable ();
10391 }
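/* A couple of illustrative cases for the function above (a sketch, assuming
   the usual 64-bit register sizes): a 16-byte Advanced SIMD mode such as
   V4SImode needs one FP register (CEIL (16, UNITS_PER_VREG) == 1), while
   TImode in GENERAL_REGS needs a register pair (CEIL (16, UNITS_PER_WORD)
   == 2).  An SVE data mode instead needs one register per vector's worth
   of bytes, so the answer is independent of the runtime vector length.  */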
10392
10393 static reg_class_t
10394 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
10395 {
10396 if (regclass == POINTER_REGS)
10397 return GENERAL_REGS;
10398
10399 if (regclass == STACK_REG)
10400 {
10401 if (REG_P (x)
10402 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
10403 return regclass;
10404
10405 return NO_REGS;
10406 }
10407
10408 /* Register elimination can result in a request for
10409 SP+constant->FP_REGS. We cannot support such operations, which
10410 use SP as the source and an FP_REG as the destination, so reject
10411 them outright. */
10412 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
10413 {
10414 rtx lhs = XEXP (x, 0);
10415
10416 /* Look through a possible SUBREG introduced by ILP32. */
10417 if (GET_CODE (lhs) == SUBREG)
10418 lhs = SUBREG_REG (lhs);
10419
10420 gcc_assert (REG_P (lhs));
10421 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
10422 POINTER_REGS));
10423 return NO_REGS;
10424 }
10425
10426 return regclass;
10427 }
10428
10429 void
10430 aarch64_asm_output_labelref (FILE* f, const char *name)
10431 {
10432 asm_fprintf (f, "%U%s", name);
10433 }
10434
10435 static void
10436 aarch64_elf_asm_constructor (rtx symbol, int priority)
10437 {
10438 if (priority == DEFAULT_INIT_PRIORITY)
10439 default_ctor_section_asm_out_constructor (symbol, priority);
10440 else
10441 {
10442 section *s;
10443 /* While priority is known to be in the range [0, 65535] (so 18 bytes
10444 would be enough), the compiler might not know that. To avoid a
10445 -Wformat-truncation false positive, use a larger size. */
10446 char buf[23];
10447 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
10448 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10449 switch_to_section (s);
10450 assemble_align (POINTER_SIZE);
10451 assemble_aligned_integer (POINTER_BYTES, symbol);
10452 }
10453 }
10454
10455 static void
10456 aarch64_elf_asm_destructor (rtx symbol, int priority)
10457 {
10458 if (priority == DEFAULT_INIT_PRIORITY)
10459 default_dtor_section_asm_out_destructor (symbol, priority);
10460 else
10461 {
10462 section *s;
10463 /* While priority is known to be in the range [0, 65535] (so 18 bytes
10464 would be enough), the compiler might not know that. To avoid a
10465 -Wformat-truncation false positive, use a larger size. */
10466 char buf[23];
10467 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
10468 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10469 switch_to_section (s);
10470 assemble_align (POINTER_SIZE);
10471 assemble_aligned_integer (POINTER_BYTES, symbol);
10472 }
10473 }
10474
10475 const char*
10476 aarch64_output_casesi (rtx *operands)
10477 {
10478 char buf[100];
10479 char label[100];
10480 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
10481 int index;
10482 static const char *const patterns[4][2] =
10483 {
10484 {
10485 "ldrb\t%w3, [%0,%w1,uxtw]",
10486 "add\t%3, %4, %w3, sxtb #2"
10487 },
10488 {
10489 "ldrh\t%w3, [%0,%w1,uxtw #1]",
10490 "add\t%3, %4, %w3, sxth #2"
10491 },
10492 {
10493 "ldr\t%w3, [%0,%w1,uxtw #2]",
10494 "add\t%3, %4, %w3, sxtw #2"
10495 },
10496 /* We assume that DImode is only generated when not optimizing and
10497 that we don't really need 64-bit address offsets. That would
10498 imply an object file with 8GB of code in a single function! */
10499 {
10500 "ldr\t%w3, [%0,%w1,uxtw #2]",
10501 "add\t%3, %4, %w3, sxtw #2"
10502 }
10503 };
10504
10505 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
10506
10507 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
10508 index = exact_log2 (GET_MODE_SIZE (mode));
10509
10510 gcc_assert (index >= 0 && index <= 3);
10511
10512 /* Need to implement table size reduction, by changing the code below. */
10513 output_asm_insn (patterns[index][0], operands);
10514 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
10515 snprintf (buf, sizeof (buf),
10516 "adr\t%%4, %s", targetm.strip_name_encoding (label));
10517 output_asm_insn (buf, operands);
10518 output_asm_insn (patterns[index][1], operands);
10519 output_asm_insn ("br\t%3", operands);
10520 assemble_label (asm_out_file, label);
10521 return "";
10522 }
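/* For illustration only: with word (SImode) table entries the sequence
   emitted above looks roughly like the following, where operand 0 is the
   table base, operand 1 the index, and operands 3/4 are scratch registers
   (the concrete register names and label are just placeholders):

	ldr	w3, [x0, w1, uxtw #2]	// load the table entry
	adr	x4, .Lrtx<N>		// anchor label emitted after the br
	add	x3, x4, w3, sxtw #2	// entry is an offset scaled by 4
	br	x3
   .Lrtx<N>:
 */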
10523
10524
10525 /* Return size in bits of an arithmetic operand which is shifted/scaled and
10526 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
10527 operator. */
10528
10529 int
10530 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
10531 {
10532 if (shift >= 0 && shift <= 3)
10533 {
10534 int size;
10535 for (size = 8; size <= 32; size *= 2)
10536 {
10537 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
10538 if (mask == bits << shift)
10539 return size;
10540 }
10541 }
10542 return 0;
10543 }
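/* A few worked examples for the function above (illustrative values only):
   aarch64_uxt_size (2, 0x3fc) returns 8, because 0xff << 2 == 0x3fc, so the
   operand is suitable for a UXTB with LSL #2; aarch64_uxt_size (0, 0xffff)
   returns 16 (UXTH); aarch64_uxt_size (1, 0xff) returns 0, because 0xff is
   not any of 0xff, 0xffff or 0xffffffff shifted left by one.  */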
10544
10545 /* Constant pools are per-function only when PC-relative
10546 literal loads are enabled or we are in the large memory
10547 model. */
10548
10549 static inline bool
10550 aarch64_can_use_per_function_literal_pools_p (void)
10551 {
10552 return (aarch64_pcrelative_literal_loads
10553 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
10554 }
10555
10556 static bool
10557 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
10558 {
10559 /* We can't use blocks for constants when we're using a per-function
10560 constant pool. */
10561 return !aarch64_can_use_per_function_literal_pools_p ();
10562 }
10563
10564 /* Select appropriate section for constants depending
10565 on where we place literal pools. */
10566
10567 static section *
10568 aarch64_select_rtx_section (machine_mode mode,
10569 rtx x,
10570 unsigned HOST_WIDE_INT align)
10571 {
10572 if (aarch64_can_use_per_function_literal_pools_p ())
10573 return function_section (current_function_decl);
10574
10575 return default_elf_select_rtx_section (mode, x, align);
10576 }
10577
10578 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
10579 void
10580 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
10581 HOST_WIDE_INT offset)
10582 {
10583 /* When using per-function literal pools, we must ensure that any code
10584 section is aligned to the minimal instruction length, lest we get
10585 errors from the assembler re "unaligned instructions". */
10586 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
10587 ASM_OUTPUT_ALIGN (f, 2);
10588 }
10589
10590 /* Costs. */
10591
10592 /* Helper function for rtx cost calculation. Strip a shift expression
10593 from X. Returns the inner operand if successful, or the original
10594 expression on failure. */
10595 static rtx
10596 aarch64_strip_shift (rtx x)
10597 {
10598 rtx op = x;
10599
10600 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
10601 we can convert both to ROR during final output. */
10602 if ((GET_CODE (op) == ASHIFT
10603 || GET_CODE (op) == ASHIFTRT
10604 || GET_CODE (op) == LSHIFTRT
10605 || GET_CODE (op) == ROTATERT
10606 || GET_CODE (op) == ROTATE)
10607 && CONST_INT_P (XEXP (op, 1)))
10608 return XEXP (op, 0);
10609
10610 if (GET_CODE (op) == MULT
10611 && CONST_INT_P (XEXP (op, 1))
10612 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
10613 return XEXP (op, 0);
10614
10615 return x;
10616 }
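/* Illustrative behaviour of aarch64_strip_shift (a sketch, not exhaustive):
   (ashift (reg x1) (const_int 3)) and (mult (reg x1) (const_int 8)) both
   strip down to (reg x1), since a multiply by 8 is the canonical form of a
   shift by 3; (mult (reg x1) (reg x2)) is returned unchanged because the
   multiplier is not a constant power of two.  */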
10617
10618 /* Helper function for rtx cost calculation. Strip an extend
10619 expression from X. Returns the inner operand if successful, or the
10620 original expression on failure. We deal with a number of possible
10621 canonicalization variations here. If STRIP_SHIFT is true, then
10622 we can strip off a shift also. */
10623 static rtx
10624 aarch64_strip_extend (rtx x, bool strip_shift)
10625 {
10626 scalar_int_mode mode;
10627 rtx op = x;
10628
10629 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
10630 return op;
10631
10632 /* Zero and sign extraction of a widened value. */
10633 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
10634 && XEXP (op, 2) == const0_rtx
10635 && GET_CODE (XEXP (op, 0)) == MULT
10636 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
10637 XEXP (op, 1)))
10638 return XEXP (XEXP (op, 0), 0);
10639
10640 /* It can also be represented (for zero-extend) as an AND with an
10641 immediate. */
10642 if (GET_CODE (op) == AND
10643 && GET_CODE (XEXP (op, 0)) == MULT
10644 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
10645 && CONST_INT_P (XEXP (op, 1))
10646 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
10647 INTVAL (XEXP (op, 1))) != 0)
10648 return XEXP (XEXP (op, 0), 0);
10649
10650 /* Now handle extended register, as this may also have an optional
10651 left shift by 1..4. */
10652 if (strip_shift
10653 && GET_CODE (op) == ASHIFT
10654 && CONST_INT_P (XEXP (op, 1))
10655 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
10656 op = XEXP (op, 0);
10657
10658 if (GET_CODE (op) == ZERO_EXTEND
10659 || GET_CODE (op) == SIGN_EXTEND)
10660 op = XEXP (op, 0);
10661
10662 if (op != x)
10663 return op;
10664
10665 return x;
10666 }
10667
10668 /* Return true iff CODE is a shift supported in combination
10669 with arithmetic instructions. */
10670
10671 static bool
10672 aarch64_shift_p (enum rtx_code code)
10673 {
10674 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
10675 }
10676
10677
10678 /* Return true iff X is a cheap shift without a sign extend. */
10679
10680 static bool
10681 aarch64_cheap_mult_shift_p (rtx x)
10682 {
10683 rtx op0, op1;
10684
10685 op0 = XEXP (x, 0);
10686 op1 = XEXP (x, 1);
10687
10688 if (!(aarch64_tune_params.extra_tuning_flags
10689 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
10690 return false;
10691
10692 if (GET_CODE (op0) == SIGN_EXTEND)
10693 return false;
10694
10695 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
10696 && UINTVAL (op1) <= 4)
10697 return true;
10698
10699 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
10700 return false;
10701
10702 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
10703
10704 if (l2 > 0 && l2 <= 4)
10705 return true;
10706
10707 return false;
10708 }
10709
10710 /* Helper function for rtx cost calculation. Calculate the cost of
10711 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
10712 Return the calculated cost of the expression, recursing manually into
10713 operands where needed. */
10714
10715 static int
10716 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
10717 {
10718 rtx op0, op1;
10719 const struct cpu_cost_table *extra_cost
10720 = aarch64_tune_params.insn_extra_cost;
10721 int cost = 0;
10722 bool compound_p = (outer == PLUS || outer == MINUS);
10723 machine_mode mode = GET_MODE (x);
10724
10725 gcc_checking_assert (code == MULT);
10726
10727 op0 = XEXP (x, 0);
10728 op1 = XEXP (x, 1);
10729
10730 if (VECTOR_MODE_P (mode))
10731 mode = GET_MODE_INNER (mode);
10732
10733 /* Integer multiply/fma. */
10734 if (GET_MODE_CLASS (mode) == MODE_INT)
10735 {
10736 /* The multiply will be canonicalized as a shift, cost it as such. */
10737 if (aarch64_shift_p (GET_CODE (x))
10738 || (CONST_INT_P (op1)
10739 && exact_log2 (INTVAL (op1)) > 0))
10740 {
10741 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
10742 || GET_CODE (op0) == SIGN_EXTEND;
10743 if (speed)
10744 {
10745 if (compound_p)
10746 {
10747 /* If the shift is considered cheap,
10748 then don't add any cost. */
10749 if (aarch64_cheap_mult_shift_p (x))
10750 ;
10751 else if (REG_P (op1))
10752 /* ARITH + shift-by-register. */
10753 cost += extra_cost->alu.arith_shift_reg;
10754 else if (is_extend)
10755 /* ARITH + extended register. We don't have a cost field
10756 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
10757 cost += extra_cost->alu.extend_arith;
10758 else
10759 /* ARITH + shift-by-immediate. */
10760 cost += extra_cost->alu.arith_shift;
10761 }
10762 else
10763 /* LSL (immediate). */
10764 cost += extra_cost->alu.shift;
10765
10766 }
10767 /* Strip extends as we will have costed them in the case above. */
10768 if (is_extend)
10769 op0 = aarch64_strip_extend (op0, true);
10770
10771 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
10772
10773 return cost;
10774 }
10775
10776 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
10777 compound and let the below cases handle it. After all, MNEG is a
10778 special-case alias of MSUB. */
10779 if (GET_CODE (op0) == NEG)
10780 {
10781 op0 = XEXP (op0, 0);
10782 compound_p = true;
10783 }
10784
10785 /* Integer multiplies or FMAs have zero/sign extending variants. */
10786 if ((GET_CODE (op0) == ZERO_EXTEND
10787 && GET_CODE (op1) == ZERO_EXTEND)
10788 || (GET_CODE (op0) == SIGN_EXTEND
10789 && GET_CODE (op1) == SIGN_EXTEND))
10790 {
10791 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
10792 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
10793
10794 if (speed)
10795 {
10796 if (compound_p)
10797 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
10798 cost += extra_cost->mult[0].extend_add;
10799 else
10800 /* MUL/SMULL/UMULL. */
10801 cost += extra_cost->mult[0].extend;
10802 }
10803
10804 return cost;
10805 }
10806
10807 /* This is either an integer multiply or a MADD. In both cases
10808 we want to recurse and cost the operands. */
10809 cost += rtx_cost (op0, mode, MULT, 0, speed);
10810 cost += rtx_cost (op1, mode, MULT, 1, speed);
10811
10812 if (speed)
10813 {
10814 if (compound_p)
10815 /* MADD/MSUB. */
10816 cost += extra_cost->mult[mode == DImode].add;
10817 else
10818 /* MUL. */
10819 cost += extra_cost->mult[mode == DImode].simple;
10820 }
10821
10822 return cost;
10823 }
10824 else
10825 {
10826 if (speed)
10827 {
10828 /* Floating-point FMA/FMUL can also support negations of the
10829 operands, unless the rounding mode is upward or downward in
10830 which case FNMUL is different from FMUL with operand negation. */
10831 bool neg0 = GET_CODE (op0) == NEG;
10832 bool neg1 = GET_CODE (op1) == NEG;
10833 if (compound_p || !flag_rounding_math || (neg0 && neg1))
10834 {
10835 if (neg0)
10836 op0 = XEXP (op0, 0);
10837 if (neg1)
10838 op1 = XEXP (op1, 0);
10839 }
10840
10841 if (compound_p)
10842 /* FMADD/FNMADD/FNMSUB/FMSUB. */
10843 cost += extra_cost->fp[mode == DFmode].fma;
10844 else
10845 /* FMUL/FNMUL. */
10846 cost += extra_cost->fp[mode == DFmode].mult;
10847 }
10848
10849 cost += rtx_cost (op0, mode, MULT, 0, speed);
10850 cost += rtx_cost (op1, mode, MULT, 1, speed);
10851 return cost;
10852 }
10853 }
10854
10855 static int
10856 aarch64_address_cost (rtx x,
10857 machine_mode mode,
10858 addr_space_t as ATTRIBUTE_UNUSED,
10859 bool speed)
10860 {
10861 enum rtx_code c = GET_CODE (x);
10862 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
10863 struct aarch64_address_info info;
10864 int cost = 0;
10865 info.shift = 0;
10866
10867 if (!aarch64_classify_address (&info, x, mode, false))
10868 {
10869 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
10870 {
10871 /* This is a CONST or SYMBOL ref which will be split
10872 in a different way depending on the code model in use.
10873 Cost it through the generic infrastructure. */
10874 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
10875 /* Divide through by the cost of one instruction to
10876 bring it to the same units as the address costs. */
10877 cost_symbol_ref /= COSTS_N_INSNS (1);
10878 /* The cost is then the cost of preparing the address,
10879 followed by an immediate (possibly 0) offset. */
10880 return cost_symbol_ref + addr_cost->imm_offset;
10881 }
10882 else
10883 {
10884 /* This is most likely a jump table from a case
10885 statement. */
10886 return addr_cost->register_offset;
10887 }
10888 }
10889
10890 switch (info.type)
10891 {
10892 case ADDRESS_LO_SUM:
10893 case ADDRESS_SYMBOLIC:
10894 case ADDRESS_REG_IMM:
10895 cost += addr_cost->imm_offset;
10896 break;
10897
10898 case ADDRESS_REG_WB:
10899 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
10900 cost += addr_cost->pre_modify;
10901 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
10902 cost += addr_cost->post_modify;
10903 else
10904 gcc_unreachable ();
10905
10906 break;
10907
10908 case ADDRESS_REG_REG:
10909 cost += addr_cost->register_offset;
10910 break;
10911
10912 case ADDRESS_REG_SXTW:
10913 cost += addr_cost->register_sextend;
10914 break;
10915
10916 case ADDRESS_REG_UXTW:
10917 cost += addr_cost->register_zextend;
10918 break;
10919
10920 default:
10921 gcc_unreachable ();
10922 }
10923
10924
10925 if (info.shift > 0)
10926 {
10927 /* For the sake of calculating the cost of the shifted register
10928 component, we can treat same sized modes in the same way. */
10929 if (known_eq (GET_MODE_BITSIZE (mode), 16))
10930 cost += addr_cost->addr_scale_costs.hi;
10931 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
10932 cost += addr_cost->addr_scale_costs.si;
10933 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
10934 cost += addr_cost->addr_scale_costs.di;
10935 else
10936 /* We can't tell, or this is a 128-bit vector. */
10937 cost += addr_cost->addr_scale_costs.ti;
10938 }
10939
10940 return cost;
10941 }
10942
10943 /* Return the cost of a branch. If SPEED_P is true then the compiler is
10944 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
10945 to be taken. */
10946
10947 int
10948 aarch64_branch_cost (bool speed_p, bool predictable_p)
10949 {
10950 /* When optimizing for speed, use the cost of unpredictable branches. */
10951 const struct cpu_branch_cost *branch_costs =
10952 aarch64_tune_params.branch_costs;
10953
10954 if (!speed_p || predictable_p)
10955 return branch_costs->predictable;
10956 else
10957 return branch_costs->unpredictable;
10958 }
10959
10960 /* Return true if the RTX X in mode MODE is a zero or sign extract
10961 usable in an ADD or SUB (extended register) instruction. */
10962 static bool
10963 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
10964 {
10965 /* Catch add with a sign extract.
10966 This is add_<optab><mode>_multp2. */
10967 if (GET_CODE (x) == SIGN_EXTRACT
10968 || GET_CODE (x) == ZERO_EXTRACT)
10969 {
10970 rtx op0 = XEXP (x, 0);
10971 rtx op1 = XEXP (x, 1);
10972 rtx op2 = XEXP (x, 2);
10973
10974 if (GET_CODE (op0) == MULT
10975 && CONST_INT_P (op1)
10976 && op2 == const0_rtx
10977 && CONST_INT_P (XEXP (op0, 1))
10978 && aarch64_is_extend_from_extract (mode,
10979 XEXP (op0, 1),
10980 op1))
10981 {
10982 return true;
10983 }
10984 }
10985 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10986 No shift. */
10987 else if (GET_CODE (x) == SIGN_EXTEND
10988 || GET_CODE (x) == ZERO_EXTEND)
10989 return REG_P (XEXP (x, 0));
10990
10991 return false;
10992 }
10993
10994 static bool
10995 aarch64_frint_unspec_p (unsigned int u)
10996 {
10997 switch (u)
10998 {
10999 case UNSPEC_FRINTZ:
11000 case UNSPEC_FRINTP:
11001 case UNSPEC_FRINTM:
11002 case UNSPEC_FRINTA:
11003 case UNSPEC_FRINTN:
11004 case UNSPEC_FRINTX:
11005 case UNSPEC_FRINTI:
11006 return true;
11007
11008 default:
11009 return false;
11010 }
11011 }
11012
11013 /* Return true iff X is an rtx that will match an extr instruction
11014 i.e. as described in the *extr<mode>5_insn family of patterns.
11015 OP0 and OP1 will be set to the operands of the shifts involved
11016 on success and will be NULL_RTX otherwise. */
11017
11018 static bool
11019 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
11020 {
11021 rtx op0, op1;
11022 scalar_int_mode mode;
11023 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
11024 return false;
11025
11026 *res_op0 = NULL_RTX;
11027 *res_op1 = NULL_RTX;
11028
11029 if (GET_CODE (x) != IOR)
11030 return false;
11031
11032 op0 = XEXP (x, 0);
11033 op1 = XEXP (x, 1);
11034
11035 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
11036 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
11037 {
11038 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
11039 if (GET_CODE (op1) == ASHIFT)
11040 std::swap (op0, op1);
11041
11042 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
11043 return false;
11044
11045 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
11046 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
11047
11048 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
11049 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
11050 {
11051 *res_op0 = XEXP (op0, 0);
11052 *res_op1 = XEXP (op1, 0);
11053 return true;
11054 }
11055 }
11056
11057 return false;
11058 }
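/* A concrete example of the shape recognised above (illustrative only):
   for DImode, (ior (ashift (reg a) (const_int 48))
		    (lshiftrt (reg b) (const_int 16)))
   satisfies 48 + 16 == 64, so *RES_OP0 is set to A, *RES_OP1 to B, and the
   whole expression can be implemented as a single EXTR with a shift amount
   of 16.  */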
11059
11060 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11061 storing it in *COST. Result is true if the total cost of the operation
11062 has now been calculated. */
11063 static bool
11064 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11065 {
11066 rtx inner;
11067 rtx comparator;
11068 enum rtx_code cmpcode;
11069 const struct cpu_cost_table *extra_cost
11070 = aarch64_tune_params.insn_extra_cost;
11071
11072 if (COMPARISON_P (op0))
11073 {
11074 inner = XEXP (op0, 0);
11075 comparator = XEXP (op0, 1);
11076 cmpcode = GET_CODE (op0);
11077 }
11078 else
11079 {
11080 inner = op0;
11081 comparator = const0_rtx;
11082 cmpcode = NE;
11083 }
11084
11085 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11086 {
11087 /* Conditional branch. */
11088 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11089 return true;
11090 else
11091 {
11092 if (cmpcode == NE || cmpcode == EQ)
11093 {
11094 if (comparator == const0_rtx)
11095 {
11096 /* TBZ/TBNZ/CBZ/CBNZ. */
11097 if (GET_CODE (inner) == ZERO_EXTRACT)
11098 /* TBZ/TBNZ. */
11099 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11100 ZERO_EXTRACT, 0, speed);
11101 else
11102 /* CBZ/CBNZ. */
11103 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
11104
11105 return true;
11106 }
11107 if (register_operand (inner, VOIDmode)
11108 && aarch64_imm24 (comparator, VOIDmode))
11109 {
11110 /* SUB and SUBS. */
11111 *cost += COSTS_N_INSNS (2);
11112 if (speed)
11113 *cost += extra_cost->alu.arith * 2;
11114 return true;
11115 }
11116 }
11117 else if (cmpcode == LT || cmpcode == GE)
11118 {
11119 /* TBZ/TBNZ. */
11120 if (comparator == const0_rtx)
11121 return true;
11122 }
11123 }
11124 }
11125 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11126 {
11127 /* CCMP. */
11128 if (GET_CODE (op1) == COMPARE)
11129 {
11130 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11131 if (XEXP (op1, 1) == const0_rtx)
11132 *cost += 1;
11133 if (speed)
11134 {
11135 machine_mode mode = GET_MODE (XEXP (op1, 0));
11136 const struct cpu_cost_table *extra_cost
11137 = aarch64_tune_params.insn_extra_cost;
11138
11139 if (GET_MODE_CLASS (mode) == MODE_INT)
11140 *cost += extra_cost->alu.arith;
11141 else
11142 *cost += extra_cost->fp[mode == DFmode].compare;
11143 }
11144 return true;
11145 }
11146
11147 /* It's a conditional operation based on the status flags,
11148 so it must be some flavor of CSEL. */
11149
11150 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11151 if (GET_CODE (op1) == NEG
11152 || GET_CODE (op1) == NOT
11153 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11154 op1 = XEXP (op1, 0);
11155 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11156 {
11157 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11158 op1 = XEXP (op1, 0);
11159 op2 = XEXP (op2, 0);
11160 }
11161
11162 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11163 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
11164 return true;
11165 }
11166
11167 /* We don't know what this is, cost all operands. */
11168 return false;
11169 }
11170
11171 /* Check whether X is a bitfield operation of the form shift + extend that
11172 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11173 operand to which the bitfield operation is applied. Otherwise return
11174 NULL_RTX. */
11175
11176 static rtx
11177 aarch64_extend_bitfield_pattern_p (rtx x)
11178 {
11179 rtx_code outer_code = GET_CODE (x);
11180 machine_mode outer_mode = GET_MODE (x);
11181
11182 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11183 && outer_mode != SImode && outer_mode != DImode)
11184 return NULL_RTX;
11185
11186 rtx inner = XEXP (x, 0);
11187 rtx_code inner_code = GET_CODE (inner);
11188 machine_mode inner_mode = GET_MODE (inner);
11189 rtx op = NULL_RTX;
11190
11191 switch (inner_code)
11192 {
11193 case ASHIFT:
11194 if (CONST_INT_P (XEXP (inner, 1))
11195 && (inner_mode == QImode || inner_mode == HImode))
11196 op = XEXP (inner, 0);
11197 break;
11198 case LSHIFTRT:
11199 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11200 && (inner_mode == QImode || inner_mode == HImode))
11201 op = XEXP (inner, 0);
11202 break;
11203 case ASHIFTRT:
11204 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11205 && (inner_mode == QImode || inner_mode == HImode))
11206 op = XEXP (inner, 0);
11207 break;
11208 default:
11209 break;
11210 }
11211
11212 return op;
11213 }
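/* For example (illustrative RTL only), the function above returns the inner
   register for (zero_extend:SI (lshiftrt:HI (reg:HI r) (const_int 3))),
   which maps to a UBFX, and for (sign_extend:DI (ashift:QI (reg:QI r)
   (const_int 2))), which maps to an SBFIZ; a plain (zero_extend:SI
   (reg:HI r)) contains no shift and so yields NULL_RTX.  */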
11214
11215 /* Return true if the mask and a shift amount from an RTX of the form
11216 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11217 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
11218
11219 bool
11220 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11221 rtx shft_amnt)
11222 {
11223 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11224 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11225 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
11226 && (INTVAL (mask)
11227 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
11228 }
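/* A worked example for the check above (illustrative values): in SImode,
   MASK == 0xff0 with SHFT_AMNT == 4 is accepted, because 0xff0 >> 4 is
   0xff (a contiguous mask of 8 bits starting at bit 0) and no mask bit
   lies below the shift amount; (x << 4) & 0xff0 can therefore become a
   UBFIZ with lsb 4 and width 8.  MASK == 0xff8 with the same shift is
   rejected, since bit 3 of the mask sits below the shift amount.  */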
11229
11230 /* Return true if the masks and a shift amount from an RTX of the form
11231 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11232 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
11233
11234 bool
11235 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11236 unsigned HOST_WIDE_INT mask1,
11237 unsigned HOST_WIDE_INT shft_amnt,
11238 unsigned HOST_WIDE_INT mask2)
11239 {
11240 unsigned HOST_WIDE_INT t;
11241
11242 /* Verify that there is no overlap in what bits are set in the two masks. */
11243 if (mask1 != ~mask2)
11244 return false;
11245
11246 /* Verify that mask2 is not all zeros or ones. */
11247 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
11248 return false;
11249
11250 /* The shift amount should always be less than the mode size. */
11251 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
11252
11253 /* Verify that the mask being shifted is contiguous and would be in the
11254 least significant bits after shifting by shft_amnt. */
11255 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
11256 return (t == (t & -t));
11257 }
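/* A worked example for the check above (illustrative values): with
   SHFT_AMNT == 8, MASK2 == 0xff00 and MASK1 == ~0xff00, the test passes
   because the masks are complementary and 0xff00 + (1 << 8) == 0x10000 is
   a power of two, i.e. MASK2 is a contiguous run of bits starting at the
   shift amount.  The expression (x & ~0xff00) | ((y << 8) & 0xff00) can
   then be emitted as a BFI that inserts the low 8 bits of y at bit 8 of x.  */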
11258
11259 /* Calculate the cost of calculating X, storing it in *COST. Result
11260 is true if the total cost of the operation has now been calculated. */
11261 static bool
11262 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
11263 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
11264 {
11265 rtx op0, op1, op2;
11266 const struct cpu_cost_table *extra_cost
11267 = aarch64_tune_params.insn_extra_cost;
11268 int code = GET_CODE (x);
11269 scalar_int_mode int_mode;
11270
11271 /* By default, assume that everything has equivalent cost to the
11272 cheapest instruction. Any additional costs are applied as a delta
11273 above this default. */
11274 *cost = COSTS_N_INSNS (1);
11275
11276 switch (code)
11277 {
11278 case SET:
11279 /* The cost depends entirely on the operands to SET. */
11280 *cost = 0;
11281 op0 = SET_DEST (x);
11282 op1 = SET_SRC (x);
11283
11284 switch (GET_CODE (op0))
11285 {
11286 case MEM:
11287 if (speed)
11288 {
11289 rtx address = XEXP (op0, 0);
11290 if (VECTOR_MODE_P (mode))
11291 *cost += extra_cost->ldst.storev;
11292 else if (GET_MODE_CLASS (mode) == MODE_INT)
11293 *cost += extra_cost->ldst.store;
11294 else if (mode == SFmode)
11295 *cost += extra_cost->ldst.storef;
11296 else if (mode == DFmode)
11297 *cost += extra_cost->ldst.stored;
11298
11299 *cost +=
11300 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11301 0, speed));
11302 }
11303
11304 *cost += rtx_cost (op1, mode, SET, 1, speed);
11305 return true;
11306
11307 case SUBREG:
11308 if (! REG_P (SUBREG_REG (op0)))
11309 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
11310
11311 /* Fall through. */
11312 case REG:
11313 /* The cost is one per vector-register copied. */
11314 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
11315 {
11316 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
11317 *cost = COSTS_N_INSNS (nregs);
11318 }
11319 /* const0_rtx is in general free, but we will use an
11320 instruction to set a register to 0. */
11321 else if (REG_P (op1) || op1 == const0_rtx)
11322 {
11323 /* The cost is 1 per register copied. */
11324 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
11325 *cost = COSTS_N_INSNS (nregs);
11326 }
11327 else
11328 /* Cost is just the cost of the RHS of the set. */
11329 *cost += rtx_cost (op1, mode, SET, 1, speed);
11330 return true;
11331
11332 case ZERO_EXTRACT:
11333 case SIGN_EXTRACT:
11334 /* Bit-field insertion. Strip any redundant widening of
11335 the RHS to meet the width of the target. */
11336 if (GET_CODE (op1) == SUBREG)
11337 op1 = SUBREG_REG (op1);
11338 if ((GET_CODE (op1) == ZERO_EXTEND
11339 || GET_CODE (op1) == SIGN_EXTEND)
11340 && CONST_INT_P (XEXP (op0, 1))
11341 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
11342 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
11343 op1 = XEXP (op1, 0);
11344
11345 if (CONST_INT_P (op1))
11346 {
11347 /* MOV immediate is assumed to always be cheap. */
11348 *cost = COSTS_N_INSNS (1);
11349 }
11350 else
11351 {
11352 /* BFM. */
11353 if (speed)
11354 *cost += extra_cost->alu.bfi;
11355 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
11356 }
11357
11358 return true;
11359
11360 default:
11361 /* We can't make sense of this, assume default cost. */
11362 *cost = COSTS_N_INSNS (1);
11363 return false;
11364 }
11365 return false;
11366
11367 case CONST_INT:
11368 /* If an instruction can incorporate a constant within the
11369 instruction, the instruction's expression avoids calling
11370 rtx_cost() on the constant. If rtx_cost() is called on a
11371 constant, then it is usually because the constant must be
11372 moved into a register by one or more instructions.
11373
11374 The exception is constant 0, which can be expressed
11375 as XZR/WZR and is therefore free. The exception to this is
11376 if we have (set (reg) (const0_rtx)) in which case we must cost
11377 the move. However, we can catch that when we cost the SET, so
11378 we don't need to consider that here. */
11379 if (x == const0_rtx)
11380 *cost = 0;
11381 else
11382 {
11383 /* To an approximation, building any other constant is
11384 proportionally expensive to the number of instructions
11385 required to build that constant. This is true whether we
11386 are compiling for SPEED or otherwise. */
11387 if (!is_a <scalar_int_mode> (mode, &int_mode))
11388 int_mode = word_mode;
11389 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
11390 (NULL_RTX, x, false, int_mode));
11391 }
11392 return true;
11393
11394 case CONST_DOUBLE:
11395
11396 /* First determine number of instructions to do the move
11397 as an integer constant. */
11398 if (!aarch64_float_const_representable_p (x)
11399 && !aarch64_can_const_movi_rtx_p (x, mode)
11400 && aarch64_float_const_rtx_p (x))
11401 {
11402 unsigned HOST_WIDE_INT ival;
11403 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
11404 gcc_assert (succeed);
11405
11406 scalar_int_mode imode = (mode == HFmode
11407 ? SImode
11408 : int_mode_for_mode (mode).require ());
11409 int ncost = aarch64_internal_mov_immediate
11410 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11411 *cost += COSTS_N_INSNS (ncost);
11412 return true;
11413 }
11414
11415 if (speed)
11416 {
11417 /* mov[df,sf]_aarch64. */
11418 if (aarch64_float_const_representable_p (x))
11419 /* FMOV (scalar immediate). */
11420 *cost += extra_cost->fp[mode == DFmode].fpconst;
11421 else if (!aarch64_float_const_zero_rtx_p (x))
11422 {
11423 /* This will be a load from memory. */
11424 if (mode == DFmode)
11425 *cost += extra_cost->ldst.loadd;
11426 else
11427 *cost += extra_cost->ldst.loadf;
11428 }
11429 else
11430 /* Otherwise this is +0.0. We get this using MOVI d0, #0
11431 or MOV v0.s[0], wzr - neither of which is modeled by the
11432 cost tables. Just use the default cost. */
11433 {
11434 }
11435 }
11436
11437 return true;
11438
11439 case MEM:
11440 if (speed)
11441 {
11442 /* For loads we want the base cost of a load, plus an
11443 approximation for the additional cost of the addressing
11444 mode. */
11445 rtx address = XEXP (x, 0);
11446 if (VECTOR_MODE_P (mode))
11447 *cost += extra_cost->ldst.loadv;
11448 else if (GET_MODE_CLASS (mode) == MODE_INT)
11449 *cost += extra_cost->ldst.load;
11450 else if (mode == SFmode)
11451 *cost += extra_cost->ldst.loadf;
11452 else if (mode == DFmode)
11453 *cost += extra_cost->ldst.loadd;
11454
11455 *cost +=
11456 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11457 0, speed));
11458 }
11459
11460 return true;
11461
11462 case NEG:
11463 op0 = XEXP (x, 0);
11464
11465 if (VECTOR_MODE_P (mode))
11466 {
11467 if (speed)
11468 {
11469 /* FNEG. */
11470 *cost += extra_cost->vect.alu;
11471 }
11472 return false;
11473 }
11474
11475 if (GET_MODE_CLASS (mode) == MODE_INT)
11476 {
11477 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11478 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11479 {
11480 /* CSETM. */
11481 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
11482 return true;
11483 }
11484
11485 /* Cost this as SUB wzr, X. */
11486 op0 = CONST0_RTX (mode);
11487 op1 = XEXP (x, 0);
11488 goto cost_minus;
11489 }
11490
11491 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11492 {
11493 /* Support (neg(fma...)) as a single instruction only if
11494 sign of zeros is unimportant. This matches the decision
11495 making in aarch64.md. */
11496 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
11497 {
11498 /* FNMADD. */
11499 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11500 return true;
11501 }
11502 if (GET_CODE (op0) == MULT)
11503 {
11504 /* FNMUL. */
11505 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11506 return true;
11507 }
11508 if (speed)
11509 /* FNEG. */
11510 *cost += extra_cost->fp[mode == DFmode].neg;
11511 return false;
11512 }
11513
11514 return false;
11515
11516 case CLRSB:
11517 case CLZ:
11518 if (speed)
11519 {
11520 if (VECTOR_MODE_P (mode))
11521 *cost += extra_cost->vect.alu;
11522 else
11523 *cost += extra_cost->alu.clz;
11524 }
11525
11526 return false;
11527
11528 case CTZ:
11529 *cost = COSTS_N_INSNS (2);
11530
11531 if (speed)
11532 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
11533 return false;
11534
11535 case COMPARE:
11536 op0 = XEXP (x, 0);
11537 op1 = XEXP (x, 1);
11538
11539 if (op1 == const0_rtx
11540 && GET_CODE (op0) == AND)
11541 {
11542 x = op0;
11543 mode = GET_MODE (op0);
11544 goto cost_logic;
11545 }
11546
11547 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
11548 {
11549 /* TODO: A write to the CC flags possibly costs extra; this
11550 needs encoding in the cost tables. */
11551
11552 mode = GET_MODE (op0);
11553 /* ANDS. */
11554 if (GET_CODE (op0) == AND)
11555 {
11556 x = op0;
11557 goto cost_logic;
11558 }
11559
11560 if (GET_CODE (op0) == PLUS)
11561 {
11562 /* ADDS (and CMN alias). */
11563 x = op0;
11564 goto cost_plus;
11565 }
11566
11567 if (GET_CODE (op0) == MINUS)
11568 {
11569 /* SUBS. */
11570 x = op0;
11571 goto cost_minus;
11572 }
11573
11574 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
11575 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
11576 && CONST_INT_P (XEXP (op0, 2)))
11577 {
11578 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
11579 Handle it here directly rather than going to cost_logic
11580 since we know the immediate generated for the TST is valid
11581 so we can avoid creating an intermediate rtx for it only
11582 for costing purposes. */
11583 if (speed)
11584 *cost += extra_cost->alu.logical;
11585
11586 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
11587 ZERO_EXTRACT, 0, speed);
11588 return true;
11589 }
11590
11591 if (GET_CODE (op1) == NEG)
11592 {
11593 /* CMN. */
11594 if (speed)
11595 *cost += extra_cost->alu.arith;
11596
11597 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
11598 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
11599 return true;
11600 }
11601
11602 /* CMP.
11603
11604 Compare can freely swap the order of operands, and
11605 canonicalization puts the more complex operation first.
11606 But the integer MINUS logic expects the shift/extend
11607 operation in op1. */
11608 if (! (REG_P (op0)
11609 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
11610 {
11611 op0 = XEXP (x, 1);
11612 op1 = XEXP (x, 0);
11613 }
11614 goto cost_minus;
11615 }
11616
11617 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
11618 {
11619 /* FCMP. */
11620 if (speed)
11621 *cost += extra_cost->fp[mode == DFmode].compare;
11622
11623 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
11624 {
11625 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
11626 /* FCMP supports constant 0.0 for no extra cost. */
11627 return true;
11628 }
11629 return false;
11630 }
11631
11632 if (VECTOR_MODE_P (mode))
11633 {
11634 /* Vector compare. */
11635 if (speed)
11636 *cost += extra_cost->vect.alu;
11637
11638 if (aarch64_float_const_zero_rtx_p (op1))
11639 {
11640 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
11641 cost. */
11642 return true;
11643 }
11644 return false;
11645 }
11646 return false;
11647
11648 case MINUS:
11649 {
11650 op0 = XEXP (x, 0);
11651 op1 = XEXP (x, 1);
11652
11653 cost_minus:
11654 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
11655
11656 /* Detect valid immediates. */
11657 if ((GET_MODE_CLASS (mode) == MODE_INT
11658 || (GET_MODE_CLASS (mode) == MODE_CC
11659 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
11660 && CONST_INT_P (op1)
11661 && aarch64_uimm12_shift (INTVAL (op1)))
11662 {
11663 if (speed)
11664 /* SUB(S) (immediate). */
11665 *cost += extra_cost->alu.arith;
11666 return true;
11667 }
11668
11669 /* Look for SUB (extended register). */
11670 if (is_a <scalar_int_mode> (mode, &int_mode)
11671 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
11672 {
11673 if (speed)
11674 *cost += extra_cost->alu.extend_arith;
11675
11676 op1 = aarch64_strip_extend (op1, true);
11677 *cost += rtx_cost (op1, VOIDmode,
11678 (enum rtx_code) GET_CODE (op1), 0, speed);
11679 return true;
11680 }
11681
11682 rtx new_op1 = aarch64_strip_extend (op1, false);
11683
11684 /* Cost this as an FMA-alike operation. */
11685 if ((GET_CODE (new_op1) == MULT
11686 || aarch64_shift_p (GET_CODE (new_op1)))
11687 && code != COMPARE)
11688 {
11689 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
11690 (enum rtx_code) code,
11691 speed);
11692 return true;
11693 }
11694
11695 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
11696
11697 if (speed)
11698 {
11699 if (VECTOR_MODE_P (mode))
11700 {
11701 /* Vector SUB. */
11702 *cost += extra_cost->vect.alu;
11703 }
11704 else if (GET_MODE_CLASS (mode) == MODE_INT)
11705 {
11706 /* SUB(S). */
11707 *cost += extra_cost->alu.arith;
11708 }
11709 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11710 {
11711 /* FSUB. */
11712 *cost += extra_cost->fp[mode == DFmode].addsub;
11713 }
11714 }
11715 return true;
11716 }
11717
11718 case PLUS:
11719 {
11720 rtx new_op0;
11721
11722 op0 = XEXP (x, 0);
11723 op1 = XEXP (x, 1);
11724
11725 cost_plus:
11726 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11727 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11728 {
11729 /* CSINC. */
11730 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
11731 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11732 return true;
11733 }
11734
11735 if (GET_MODE_CLASS (mode) == MODE_INT
11736 && (aarch64_plus_immediate (op1, mode)
11737 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
11738 {
11739 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
11740
11741 if (speed)
11742 /* ADD (immediate). */
11743 *cost += extra_cost->alu.arith;
11744 return true;
11745 }
11746
11747 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11748
11749 /* Look for ADD (extended register). */
11750 if (is_a <scalar_int_mode> (mode, &int_mode)
11751 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
11752 {
11753 if (speed)
11754 *cost += extra_cost->alu.extend_arith;
11755
11756 op0 = aarch64_strip_extend (op0, true);
11757 *cost += rtx_cost (op0, VOIDmode,
11758 (enum rtx_code) GET_CODE (op0), 0, speed);
11759 return true;
11760 }
11761
11762 /* Strip any extend, leave shifts behind as we will
11763 cost them through mult_cost. */
11764 new_op0 = aarch64_strip_extend (op0, false);
11765
11766 if (GET_CODE (new_op0) == MULT
11767 || aarch64_shift_p (GET_CODE (new_op0)))
11768 {
11769 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
11770 speed);
11771 return true;
11772 }
11773
11774 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
11775
11776 if (speed)
11777 {
11778 if (VECTOR_MODE_P (mode))
11779 {
11780 /* Vector ADD. */
11781 *cost += extra_cost->vect.alu;
11782 }
11783 else if (GET_MODE_CLASS (mode) == MODE_INT)
11784 {
11785 /* ADD. */
11786 *cost += extra_cost->alu.arith;
11787 }
11788 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11789 {
11790 /* FADD. */
11791 *cost += extra_cost->fp[mode == DFmode].addsub;
11792 }
11793 }
11794 return true;
11795 }
11796
11797 case BSWAP:
11798 *cost = COSTS_N_INSNS (1);
11799
11800 if (speed)
11801 {
11802 if (VECTOR_MODE_P (mode))
11803 *cost += extra_cost->vect.alu;
11804 else
11805 *cost += extra_cost->alu.rev;
11806 }
11807 return false;
11808
11809 case IOR:
11810 if (aarch_rev16_p (x))
11811 {
11812 *cost = COSTS_N_INSNS (1);
11813
11814 if (speed)
11815 {
11816 if (VECTOR_MODE_P (mode))
11817 *cost += extra_cost->vect.alu;
11818 else
11819 *cost += extra_cost->alu.rev;
11820 }
11821 return true;
11822 }
11823
11824 if (aarch64_extr_rtx_p (x, &op0, &op1))
11825 {
11826 *cost += rtx_cost (op0, mode, IOR, 0, speed);
11827 *cost += rtx_cost (op1, mode, IOR, 1, speed);
11828 if (speed)
11829 *cost += extra_cost->alu.shift;
11830
11831 return true;
11832 }
11833 /* Fall through. */
11834 case XOR:
11835 case AND:
11836 cost_logic:
11837 op0 = XEXP (x, 0);
11838 op1 = XEXP (x, 1);
11839
11840 if (VECTOR_MODE_P (mode))
11841 {
11842 if (speed)
11843 *cost += extra_cost->vect.alu;
11844 return true;
11845 }
11846
11847 if (code == AND
11848 && GET_CODE (op0) == MULT
11849 && CONST_INT_P (XEXP (op0, 1))
11850 && CONST_INT_P (op1)
11851 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
11852 INTVAL (op1)) != 0)
11853 {
11854 /* This is a UBFM/SBFM. */
11855 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
11856 if (speed)
11857 *cost += extra_cost->alu.bfx;
11858 return true;
11859 }
11860
11861 if (is_int_mode (mode, &int_mode))
11862 {
11863 if (CONST_INT_P (op1))
11864 {
11865 /* We have a mask + shift version of a UBFIZ
11866 i.e. the *andim_ashift<mode>_bfiz pattern. */
11867 if (GET_CODE (op0) == ASHIFT
11868 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
11869 XEXP (op0, 1)))
11870 {
11871 *cost += rtx_cost (XEXP (op0, 0), int_mode,
11872 (enum rtx_code) code, 0, speed);
11873 if (speed)
11874 *cost += extra_cost->alu.bfx;
11875
11876 return true;
11877 }
11878 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
11879 {
11880 /* We possibly get the immediate for free; this is not
11881 modelled. */
11882 *cost += rtx_cost (op0, int_mode,
11883 (enum rtx_code) code, 0, speed);
11884 if (speed)
11885 *cost += extra_cost->alu.logical;
11886
11887 return true;
11888 }
11889 }
11890 else
11891 {
11892 rtx new_op0 = op0;
11893
11894 /* Handle ORN, EON, or BIC. */
11895 if (GET_CODE (op0) == NOT)
11896 op0 = XEXP (op0, 0);
11897
11898 new_op0 = aarch64_strip_shift (op0);
11899
11900 /* If we had a shift on op0 then this is a logical-shift-
11901 by-register/immediate operation. Otherwise, this is just
11902 a logical operation. */
11903 if (speed)
11904 {
11905 if (new_op0 != op0)
11906 {
11907 /* Shift by immediate. */
11908 if (CONST_INT_P (XEXP (op0, 1)))
11909 *cost += extra_cost->alu.log_shift;
11910 else
11911 *cost += extra_cost->alu.log_shift_reg;
11912 }
11913 else
11914 *cost += extra_cost->alu.logical;
11915 }
11916
11917 /* In both cases we want to cost both operands. */
11918 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
11919 0, speed);
11920 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
11921 1, speed);
11922
11923 return true;
11924 }
11925 }
11926 return false;
11927
11928 case NOT:
11929 x = XEXP (x, 0);
11930 op0 = aarch64_strip_shift (x);
11931
11932 if (VECTOR_MODE_P (mode))
11933 {
11934 /* Vector NOT. */
11935 *cost += extra_cost->vect.alu;
11936 return false;
11937 }
11938
11939 /* MVN-shifted-reg. */
11940 if (op0 != x)
11941 {
11942 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11943
11944 if (speed)
11945 *cost += extra_cost->alu.log_shift;
11946
11947 return true;
11948 }
11949 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
11950 Handle the second form here taking care that 'a' in the above can
11951 be a shift. */
11952 else if (GET_CODE (op0) == XOR)
11953 {
11954 rtx newop0 = XEXP (op0, 0);
11955 rtx newop1 = XEXP (op0, 1);
11956 rtx op0_stripped = aarch64_strip_shift (newop0);
11957
11958 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
11959 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
11960
11961 if (speed)
11962 {
11963 if (op0_stripped != newop0)
11964 *cost += extra_cost->alu.log_shift;
11965 else
11966 *cost += extra_cost->alu.logical;
11967 }
11968
11969 return true;
11970 }
11971 /* MVN. */
11972 if (speed)
11973 *cost += extra_cost->alu.logical;
11974
11975 return false;
11976
11977 case ZERO_EXTEND:
11978
11979 op0 = XEXP (x, 0);
11980 /* If a value is written in SI mode, then zero extended to DI
11981 mode, the operation will in general be free as a write to
11982 a 'w' register implicitly zeroes the upper bits of an 'x'
11983 register. However, if this is
11984
11985 (set (reg) (zero_extend (reg)))
11986
11987 we must cost the explicit register move. */
11988 if (mode == DImode
11989 && GET_MODE (op0) == SImode
11990 && outer == SET)
11991 {
11992 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
11993
11994 /* If OP_COST is non-zero, then the cost of the zero extend
11995 is effectively the cost of the inner operation. Otherwise
11996 we have a MOV instruction and we take the cost from the MOV
11997 itself. This is true independently of whether we are
11998 optimizing for space or time. */
11999 if (op_cost)
12000 *cost = op_cost;
12001
12002 return true;
12003 }
12004 else if (MEM_P (op0))
12005 {
12006 /* All loads can zero extend to any size for free. */
12007 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
12008 return true;
12009 }
12010
12011 op0 = aarch64_extend_bitfield_pattern_p (x);
12012 if (op0)
12013 {
12014 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
12015 if (speed)
12016 *cost += extra_cost->alu.bfx;
12017 return true;
12018 }
12019
12020 if (speed)
12021 {
12022 if (VECTOR_MODE_P (mode))
12023 {
12024 /* UMOV. */
12025 *cost += extra_cost->vect.alu;
12026 }
12027 else
12028 {
12029 /* We generate an AND instead of UXTB/UXTH. */
12030 *cost += extra_cost->alu.logical;
12031 }
12032 }
12033 return false;
12034
12035 case SIGN_EXTEND:
12036 if (MEM_P (XEXP (x, 0)))
12037 {
12038 /* LDRSH. */
12039 if (speed)
12040 {
12041 rtx address = XEXP (XEXP (x, 0), 0);
12042 *cost += extra_cost->ldst.load_sign_extend;
12043
12044 *cost +=
12045 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12046 0, speed));
12047 }
12048 return true;
12049 }
12050
12051 op0 = aarch64_extend_bitfield_pattern_p (x);
12052 if (op0)
12053 {
12054 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
12055 if (speed)
12056 *cost += extra_cost->alu.bfx;
12057 return true;
12058 }
12059
12060 if (speed)
12061 {
12062 if (VECTOR_MODE_P (mode))
12063 *cost += extra_cost->vect.alu;
12064 else
12065 *cost += extra_cost->alu.extend;
12066 }
12067 return false;
12068
12069 case ASHIFT:
12070 op0 = XEXP (x, 0);
12071 op1 = XEXP (x, 1);
12072
12073 if (CONST_INT_P (op1))
12074 {
12075 if (speed)
12076 {
12077 if (VECTOR_MODE_P (mode))
12078 {
12079 /* Vector shift (immediate). */
12080 *cost += extra_cost->vect.alu;
12081 }
12082 else
12083 {
12084 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
12085 aliases. */
12086 *cost += extra_cost->alu.shift;
12087 }
12088 }
12089
12090 /* We can incorporate zero/sign extend for free. */
12091 if (GET_CODE (op0) == ZERO_EXTEND
12092 || GET_CODE (op0) == SIGN_EXTEND)
12093 op0 = XEXP (op0, 0);
12094
12095 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
12096 return true;
12097 }
12098 else
12099 {
12100 if (VECTOR_MODE_P (mode))
12101 {
12102 if (speed)
12103 /* Vector shift (register). */
12104 *cost += extra_cost->vect.alu;
12105 }
12106 else
12107 {
12108 if (speed)
12109 /* LSLV. */
12110 *cost += extra_cost->alu.shift_reg;
12111
12112 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12113 && CONST_INT_P (XEXP (op1, 1))
12114 && known_eq (INTVAL (XEXP (op1, 1)),
12115 GET_MODE_BITSIZE (mode) - 1))
12116 {
12117 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12118 /* We already demanded XEXP (op1, 0) to be REG_P, so
12119 don't recurse into it. */
12120 return true;
12121 }
12122 }
12123 return false; /* All arguments need to be in registers. */
12124 }
12125
12126 case ROTATE:
12127 case ROTATERT:
12128 case LSHIFTRT:
12129 case ASHIFTRT:
12130 op0 = XEXP (x, 0);
12131 op1 = XEXP (x, 1);
12132
12133 if (CONST_INT_P (op1))
12134 {
12135 /* ASR (immediate) and friends. */
12136 if (speed)
12137 {
12138 if (VECTOR_MODE_P (mode))
12139 *cost += extra_cost->vect.alu;
12140 else
12141 *cost += extra_cost->alu.shift;
12142 }
12143
12144 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12145 return true;
12146 }
12147 else
12148 {
12149 if (VECTOR_MODE_P (mode))
12150 {
12151 if (speed)
12152 /* Vector shift (register). */
12153 *cost += extra_cost->vect.alu;
12154 }
12155 else
12156 {
12157 if (speed)
12158 /* ASR (register) and friends. */
12159 *cost += extra_cost->alu.shift_reg;
12160
12161 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12162 && CONST_INT_P (XEXP (op1, 1))
12163 && known_eq (INTVAL (XEXP (op1, 1)),
12164 GET_MODE_BITSIZE (mode) - 1))
12165 {
12166 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12167 /* We already demanded XEXP (op1, 0) to be REG_P, so
12168 don't recurse into it. */
12169 return true;
12170 }
12171 }
12172 return false; /* All arguments need to be in registers. */
12173 }
12174
12175 case SYMBOL_REF:
12176
12177 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12178 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
12179 {
12180 /* LDR. */
12181 if (speed)
12182 *cost += extra_cost->ldst.load;
12183 }
12184 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12185 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12186 {
12187 /* ADRP, followed by ADD. */
12188 *cost += COSTS_N_INSNS (1);
12189 if (speed)
12190 *cost += 2 * extra_cost->alu.arith;
12191 }
12192 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12193 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12194 {
12195 /* ADR. */
12196 if (speed)
12197 *cost += extra_cost->alu.arith;
12198 }
12199
12200 if (flag_pic)
12201 {
12202 /* One extra load instruction, after accessing the GOT. */
12203 *cost += COSTS_N_INSNS (1);
12204 if (speed)
12205 *cost += extra_cost->ldst.load;
12206 }
12207 return true;
12208
12209 case HIGH:
12210 case LO_SUM:
12211 /* ADRP/ADD (immediate). */
12212 if (speed)
12213 *cost += extra_cost->alu.arith;
12214 return true;
12215
12216 case ZERO_EXTRACT:
12217 case SIGN_EXTRACT:
12218 /* UBFX/SBFX. */
12219 if (speed)
12220 {
12221 if (VECTOR_MODE_P (mode))
12222 *cost += extra_cost->vect.alu;
12223 else
12224 *cost += extra_cost->alu.bfx;
12225 }
12226
12227 /* We can trust that the immediates used will be correct (there
12228 are no by-register forms), so we need only cost op0. */
12229 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
12230 return true;
12231
12232 case MULT:
12233 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12234 /* aarch64_rtx_mult_cost always handles recursion to its
12235 operands. */
12236 return true;
12237
12238 case MOD:
12239 /* We can expand a signed mod by a power of 2 using a NEGS, two parallel
12240 ANDs and a CSNEG. Assume here that a CSNEG has the same cost as
12241 an unconditional negate. This case should only ever be reached through
12242 the set_smod_pow2_cheap check in expmed.c. */
12243 if (CONST_INT_P (XEXP (x, 1))
12244 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
12245 && (mode == SImode || mode == DImode))
12246 {
12247 /* We expand to 4 instructions. Reset the baseline. */
12248 *cost = COSTS_N_INSNS (4);
12249
12250 if (speed)
12251 *cost += 2 * extra_cost->alu.logical
12252 + 2 * extra_cost->alu.arith;
12253
12254 return true;
12255 }
12256
12257 /* Fall-through. */
12258 case UMOD:
12259 if (speed)
12260 {
12261 /* Slightly prefer UMOD over SMOD. */
12262 if (VECTOR_MODE_P (mode))
12263 *cost += extra_cost->vect.alu;
12264 else if (GET_MODE_CLASS (mode) == MODE_INT)
12265 *cost += (extra_cost->mult[mode == DImode].add
12266 + extra_cost->mult[mode == DImode].idiv
12267 + (code == MOD ? 1 : 0));
12268 }
12269 return false; /* All arguments need to be in registers. */
12270
12271 case DIV:
12272 case UDIV:
12273 case SQRT:
12274 if (speed)
12275 {
12276 if (VECTOR_MODE_P (mode))
12277 *cost += extra_cost->vect.alu;
12278 else if (GET_MODE_CLASS (mode) == MODE_INT)
12279 /* There is no integer SQRT, so only DIV and UDIV can get
12280 here. */
12281 *cost += (extra_cost->mult[mode == DImode].idiv
12282 /* Slightly prefer UDIV over SDIV. */
12283 + (code == DIV ? 1 : 0));
12284 else
12285 *cost += extra_cost->fp[mode == DFmode].div;
12286 }
12287 return false; /* All arguments need to be in registers. */
12288
12289 case IF_THEN_ELSE:
12290 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
12291 XEXP (x, 2), cost, speed);
12292
12293 case EQ:
12294 case NE:
12295 case GT:
12296 case GTU:
12297 case LT:
12298 case LTU:
12299 case GE:
12300 case GEU:
12301 case LE:
12302 case LEU:
12303
12304 return false; /* All arguments must be in registers. */
12305
12306 case FMA:
12307 op0 = XEXP (x, 0);
12308 op1 = XEXP (x, 1);
12309 op2 = XEXP (x, 2);
12310
12311 if (speed)
12312 {
12313 if (VECTOR_MODE_P (mode))
12314 *cost += extra_cost->vect.alu;
12315 else
12316 *cost += extra_cost->fp[mode == DFmode].fma;
12317 }
12318
12319 /* FMSUB, FNMADD, and FNMSUB are free. */
12320 if (GET_CODE (op0) == NEG)
12321 op0 = XEXP (op0, 0);
12322
12323 if (GET_CODE (op2) == NEG)
12324 op2 = XEXP (op2, 0);
12325
12326 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12327 and the by-element operand as operand 0. */
12328 if (GET_CODE (op1) == NEG)
12329 op1 = XEXP (op1, 0);
12330
12331 /* Catch vector-by-element operations. The by-element operand can
12332 either be (vec_duplicate (vec_select (x))) or just
12333 (vec_select (x)), depending on whether we are multiplying by
12334 a vector or a scalar.
12335
12336 Canonicalization is not very good in these cases: FMA4 will put the
12337 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
12338 if (GET_CODE (op0) == VEC_DUPLICATE)
12339 op0 = XEXP (op0, 0);
12340 else if (GET_CODE (op1) == VEC_DUPLICATE)
12341 op1 = XEXP (op1, 0);
12342
12343 if (GET_CODE (op0) == VEC_SELECT)
12344 op0 = XEXP (op0, 0);
12345 else if (GET_CODE (op1) == VEC_SELECT)
12346 op1 = XEXP (op1, 0);
12347
12348 /* If the remaining parameters are not registers,
12349 get the cost to put them into registers. */
12350 *cost += rtx_cost (op0, mode, FMA, 0, speed);
12351 *cost += rtx_cost (op1, mode, FMA, 1, speed);
12352 *cost += rtx_cost (op2, mode, FMA, 2, speed);
12353 return true;
12354
12355 case FLOAT:
12356 case UNSIGNED_FLOAT:
12357 if (speed)
12358 *cost += extra_cost->fp[mode == DFmode].fromint;
12359 return false;
12360
12361 case FLOAT_EXTEND:
12362 if (speed)
12363 {
12364 if (VECTOR_MODE_P (mode))
12365 {
12366 /* Vector widening conversion. */
12367 *cost += extra_cost->vect.alu;
12368 }
12369 else
12370 *cost += extra_cost->fp[mode == DFmode].widen;
12371 }
12372 return false;
12373
12374 case FLOAT_TRUNCATE:
12375 if (speed)
12376 {
12377 if (VECTOR_MODE_P (mode))
12378 {
12379 /* Vector narrowing conversion. */
12380 *cost += extra_cost->vect.alu;
12381 }
12382 else
12383 *cost += extra_cost->fp[mode == DFmode].narrow;
12384 }
12385 return false;
12386
12387 case FIX:
12388 case UNSIGNED_FIX:
12389 x = XEXP (x, 0);
12390 /* Strip the rounding part. They will all be implemented
12391 by the fcvt* family of instructions anyway. */
12392 if (GET_CODE (x) == UNSPEC)
12393 {
12394 unsigned int uns_code = XINT (x, 1);
12395
12396 if (uns_code == UNSPEC_FRINTA
12397 || uns_code == UNSPEC_FRINTM
12398 || uns_code == UNSPEC_FRINTN
12399 || uns_code == UNSPEC_FRINTP
12400 || uns_code == UNSPEC_FRINTZ)
12401 x = XVECEXP (x, 0, 0);
12402 }
12403
12404 if (speed)
12405 {
12406 if (VECTOR_MODE_P (mode))
12407 *cost += extra_cost->vect.alu;
12408 else
12409 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
12410 }
12411
12412 /* We can combine fmul by a power of 2 followed by a fcvt into a single
12413 fixed-point fcvt. */
12414 if (GET_CODE (x) == MULT
12415 && ((VECTOR_MODE_P (mode)
12416 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
12417 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
12418 {
12419 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
12420 0, speed);
12421 return true;
12422 }
12423
12424 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
12425 return true;
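 /* For example (illustrative asm, register names arbitrary), converting
 "x * 4.0f" to an integer can use the fixed-point form of the convert,
 something like "fcvtzs w0, s0, #2", which is why the multiplication
 above is not costed separately. */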
12426
12427 case ABS:
12428 if (VECTOR_MODE_P (mode))
12429 {
12430 /* ABS (vector). */
12431 if (speed)
12432 *cost += extra_cost->vect.alu;
12433 }
12434 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12435 {
12436 op0 = XEXP (x, 0);
12437
12438 /* FABD, which is analogous to FADD. */
12439 if (GET_CODE (op0) == MINUS)
12440 {
12441 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
12442 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
12443 if (speed)
12444 *cost += extra_cost->fp[mode == DFmode].addsub;
12445
12446 return true;
12447 }
12448 /* Simple FABS is analogous to FNEG. */
12449 if (speed)
12450 *cost += extra_cost->fp[mode == DFmode].neg;
12451 }
12452 else
12453 {
12454 /* Integer ABS will either be split into
12455 two arithmetic instructions, or will be an ABS
12456 (scalar), which we don't model. */
12457 *cost = COSTS_N_INSNS (2);
12458 if (speed)
12459 *cost += 2 * extra_cost->alu.arith;
12460 }
12461 return false;
12462
12463 case SMAX:
12464 case SMIN:
12465 if (speed)
12466 {
12467 if (VECTOR_MODE_P (mode))
12468 *cost += extra_cost->vect.alu;
12469 else
12470 {
12471 /* FMAXNM/FMINNM/FMAX/FMIN.
12472 TODO: This may not be accurate for all implementations, but
12473 we do not model this in the cost tables. */
12474 *cost += extra_cost->fp[mode == DFmode].addsub;
12475 }
12476 }
12477 return false;
12478
12479 case UNSPEC:
12480 /* The floating point round to integer frint* instructions. */
12481 if (aarch64_frint_unspec_p (XINT (x, 1)))
12482 {
12483 if (speed)
12484 *cost += extra_cost->fp[mode == DFmode].roundint;
12485
12486 return false;
12487 }
12488
12489 if (XINT (x, 1) == UNSPEC_RBIT)
12490 {
12491 if (speed)
12492 *cost += extra_cost->alu.rev;
12493
12494 return false;
12495 }
12496 break;
12497
12498 case TRUNCATE:
12499
12500 /* Decompose <su>muldi3_highpart. */
12501 if (/* (truncate:DI */
12502 mode == DImode
12503 /* (lshiftrt:TI */
12504 && GET_MODE (XEXP (x, 0)) == TImode
12505 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
12506 /* (mult:TI */
12507 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12508 /* (ANY_EXTEND:TI (reg:DI))
12509 (ANY_EXTEND:TI (reg:DI))) */
12510 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
12511 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
12512 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
12513 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
12514 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
12515 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
12516 /* (const_int 64) */
12517 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12518 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
12519 {
12520 /* UMULH/SMULH. */
12521 if (speed)
12522 *cost += extra_cost->mult[mode == DImode].extend;
12523 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
12524 mode, MULT, 0, speed);
12525 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
12526 mode, MULT, 1, speed);
12527 return true;
12528 }
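 /* For example, the DImode high-part multiply written in C as
 "(unsigned __int128) a * b >> 64" has this shape and is costed as a
 single UMULH (or SMULH when both operands are sign-extended). */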
12529
12530 /* Fall through. */
12531 default:
12532 break;
12533 }
12534
12535 if (dump_file
12536 && flag_aarch64_verbose_cost)
12537 fprintf (dump_file,
12538 "\nFailed to cost RTX. Assuming default cost.\n");
12539
12540 return true;
12541 }
12542
12543 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
12544 calculated for X. This cost is stored in *COST. Returns true
12545 if the total cost of X was calculated. */
12546 static bool
12547 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
12548 int param, int *cost, bool speed)
12549 {
12550 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
12551
12552 if (dump_file
12553 && flag_aarch64_verbose_cost)
12554 {
12555 print_rtl_single (dump_file, x);
12556 fprintf (dump_file, "\n%s cost: %d (%s)\n",
12557 speed ? "Hot" : "Cold",
12558 *cost, result ? "final" : "partial");
12559 }
12560
12561 return result;
12562 }
12563
12564 static int
12565 aarch64_register_move_cost (machine_mode mode,
12566 reg_class_t from_i, reg_class_t to_i)
12567 {
12568 enum reg_class from = (enum reg_class) from_i;
12569 enum reg_class to = (enum reg_class) to_i;
12570 const struct cpu_regmove_cost *regmove_cost
12571 = aarch64_tune_params.regmove_cost;
12572
12573 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
12574 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
12575 to = GENERAL_REGS;
12576
12577 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
12578 from = GENERAL_REGS;
12579
12580 /* Make RDFFR very expensive. In particular, if we know that the FFR
12581 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
12582 as a way of obtaining a PTRUE. */
12583 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
12584 && hard_reg_set_subset_p (reg_class_contents[from_i],
12585 reg_class_contents[FFR_REGS]))
12586 return 80;
12587
12588 /* Moving between GPR and stack cost is the same as GP2GP. */
12589 if ((from == GENERAL_REGS && to == STACK_REG)
12590 || (to == GENERAL_REGS && from == STACK_REG))
12591 return regmove_cost->GP2GP;
12592
12593 /* To/from the stack register, we move via the GPRs. */
12594 if (to == STACK_REG || from == STACK_REG)
12595 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
12596 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
12597
12598 if (known_eq (GET_MODE_SIZE (mode), 16))
12599 {
12600 /* 128-bit operations on general registers require 2 instructions. */
12601 if (from == GENERAL_REGS && to == GENERAL_REGS)
12602 return regmove_cost->GP2GP * 2;
12603 else if (from == GENERAL_REGS)
12604 return regmove_cost->GP2FP * 2;
12605 else if (to == GENERAL_REGS)
12606 return regmove_cost->FP2GP * 2;
12607
12608 /* When AdvSIMD instructions are disabled it is not possible to move
12609 a 128-bit value directly between Q registers. This is handled in
12610 secondary reload. A general register is used as a scratch to move
12611 the upper DI value and the lower DI value is moved directly,
12612 hence the cost is the sum of three moves. */
12613 if (! TARGET_SIMD)
12614 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
12615
12616 return regmove_cost->FP2FP;
12617 }
12618
12619 if (from == GENERAL_REGS && to == GENERAL_REGS)
12620 return regmove_cost->GP2GP;
12621 else if (from == GENERAL_REGS)
12622 return regmove_cost->GP2FP;
12623 else if (to == GENERAL_REGS)
12624 return regmove_cost->FP2GP;
12625
12626 return regmove_cost->FP2FP;
12627 }
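 /* Worked example with made-up regmove_cost values: if GP2FP = 5,
 FP2GP = 6 and FP2FP = 2, then a 16-byte FP-to-FP move without
 TARGET_SIMD is costed as 5 + 6 + 2 = 13, reflecting the GPR-scratch
 sequence described above, while the same move with TARGET_SIMD costs
 just FP2FP = 2. */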
12628
12629 static int
12630 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
12631 reg_class_t rclass ATTRIBUTE_UNUSED,
12632 bool in ATTRIBUTE_UNUSED)
12633 {
12634 return aarch64_tune_params.memmov_cost;
12635 }
12636
12637 /* Implement TARGET_INIT_BUILTINS. */
12638 static void
12639 aarch64_init_builtins ()
12640 {
12641 aarch64_general_init_builtins ();
12642 aarch64_sve::init_builtins ();
12643 }
12644
12645 /* Implement TARGET_FOLD_BUILTIN. */
12646 static tree
12647 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
12648 {
12649 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12650 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12651 tree type = TREE_TYPE (TREE_TYPE (fndecl));
12652 switch (code & AARCH64_BUILTIN_CLASS)
12653 {
12654 case AARCH64_BUILTIN_GENERAL:
12655 return aarch64_general_fold_builtin (subcode, type, nargs, args);
12656
12657 case AARCH64_BUILTIN_SVE:
12658 return NULL_TREE;
12659 }
12660 gcc_unreachable ();
12661 }
12662
12663 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
12664 static bool
12665 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
12666 {
12667 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
12668 tree fndecl = gimple_call_fndecl (stmt);
12669 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12670 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12671 gimple *new_stmt = NULL;
12672 switch (code & AARCH64_BUILTIN_CLASS)
12673 {
12674 case AARCH64_BUILTIN_GENERAL:
12675 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
12676 break;
12677
12678 case AARCH64_BUILTIN_SVE:
12679 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
12680 break;
12681 }
12682
12683 if (!new_stmt)
12684 return false;
12685
12686 gsi_replace (gsi, new_stmt, true);
12687 return true;
12688 }
12689
12690 /* Implement TARGET_EXPAND_BUILTIN. */
12691 static rtx
12692 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
12693 {
12694 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12695 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12696 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12697 switch (code & AARCH64_BUILTIN_CLASS)
12698 {
12699 case AARCH64_BUILTIN_GENERAL:
12700 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
12701
12702 case AARCH64_BUILTIN_SVE:
12703 return aarch64_sve::expand_builtin (subcode, exp, target);
12704 }
12705 gcc_unreachable ();
12706 }
12707
12708 /* Implement TARGET_BUILTIN_DECL. */
12709 static tree
12710 aarch64_builtin_decl (unsigned int code, bool initialize_p)
12711 {
12712 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12713 switch (code & AARCH64_BUILTIN_CLASS)
12714 {
12715 case AARCH64_BUILTIN_GENERAL:
12716 return aarch64_general_builtin_decl (subcode, initialize_p);
12717
12718 case AARCH64_BUILTIN_SVE:
12719 return aarch64_sve::builtin_decl (subcode, initialize_p);
12720 }
12721 gcc_unreachable ();
12722 }
12723
12724 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
12725 to optimize 1.0/sqrt. */
12726
12727 static bool
12728 use_rsqrt_p (machine_mode mode)
12729 {
12730 return (!flag_trapping_math
12731 && flag_unsafe_math_optimizations
12732 && ((aarch64_tune_params.approx_modes->recip_sqrt
12733 & AARCH64_APPROX_MODE (mode))
12734 || flag_mrecip_low_precision_sqrt));
12735 }
12736
12737 /* Function to decide when to use the approximate reciprocal square root
12738 builtin. */
12739
12740 static tree
12741 aarch64_builtin_reciprocal (tree fndecl)
12742 {
12743 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
12744
12745 if (!use_rsqrt_p (mode))
12746 return NULL_TREE;
12747 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12748 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12749 switch (code & AARCH64_BUILTIN_CLASS)
12750 {
12751 case AARCH64_BUILTIN_GENERAL:
12752 return aarch64_general_builtin_rsqrt (subcode);
12753
12754 case AARCH64_BUILTIN_SVE:
12755 return NULL_TREE;
12756 }
12757 gcc_unreachable ();
12758 }
12759
12760 /* Emit code to perform the floating-point operation:
12761
12762 DST = SRC1 * SRC2
12763
12764 where all three operands are already known to be registers.
12765 If the operation is an SVE one, PTRUE is a suitable all-true
12766 predicate. */
12767
12768 static void
12769 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
12770 {
12771 if (ptrue)
12772 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
12773 dst, ptrue, src1, src2,
12774 gen_int_mode (SVE_RELAXED_GP, SImode)));
12775 else
12776 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
12777 }
12778
12779 /* Emit instruction sequence to compute either the approximate square root
12780 or its approximate reciprocal, depending on the flag RECP, and return
12781 whether the sequence was emitted or not. */
12782
12783 bool
12784 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
12785 {
12786 machine_mode mode = GET_MODE (dst);
12787
12788 if (GET_MODE_INNER (mode) == HFmode)
12789 {
12790 gcc_assert (!recp);
12791 return false;
12792 }
12793
12794 if (!recp)
12795 {
12796 if (!(flag_mlow_precision_sqrt
12797 || (aarch64_tune_params.approx_modes->sqrt
12798 & AARCH64_APPROX_MODE (mode))))
12799 return false;
12800
12801 if (!flag_finite_math_only
12802 || flag_trapping_math
12803 || !flag_unsafe_math_optimizations
12804 || optimize_function_for_size_p (cfun))
12805 return false;
12806 }
12807 else
12808 /* Caller assumes we cannot fail. */
12809 gcc_assert (use_rsqrt_p (mode));
12810
12811 rtx pg = NULL_RTX;
12812 if (aarch64_sve_mode_p (mode))
12813 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
12814 machine_mode mmsk = (VECTOR_MODE_P (mode)
12815 ? related_int_vector_mode (mode).require ()
12816 : int_mode_for_mode (mode).require ());
12817 rtx xmsk = NULL_RTX;
12818 if (!recp)
12819 {
12820 /* When calculating the approximate square root, compare the
12821 argument with 0.0 and create a mask. */
12822 rtx zero = CONST0_RTX (mode);
12823 if (pg)
12824 {
12825 xmsk = gen_reg_rtx (GET_MODE (pg));
12826 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
12827 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
12828 xmsk, pg, hint, src, zero));
12829 }
12830 else
12831 {
12832 xmsk = gen_reg_rtx (mmsk);
12833 emit_insn (gen_rtx_SET (xmsk,
12834 gen_rtx_NEG (mmsk,
12835 gen_rtx_EQ (mmsk, src, zero))));
12836 }
12837 }
12838
12839 /* Estimate the approximate reciprocal square root. */
12840 rtx xdst = gen_reg_rtx (mode);
12841 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
12842
12843 /* Iterate over the series twice for SF and thrice for DF. */
12844 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12845
12846 /* Optionally iterate over the series once less for faster performance
12847 while sacrificing some accuracy. */
12848 if ((recp && flag_mrecip_low_precision_sqrt)
12849 || (!recp && flag_mlow_precision_sqrt))
12850 iterations--;
12851
12852 /* Iterate over the series to calculate the approximate reciprocal square
12853 root. */
12854 rtx x1 = gen_reg_rtx (mode);
12855 while (iterations--)
12856 {
12857 rtx x2 = gen_reg_rtx (mode);
12858 aarch64_emit_mult (x2, pg, xdst, xdst);
12859
12860 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
12861
12862 if (iterations > 0)
12863 aarch64_emit_mult (xdst, pg, xdst, x1);
12864 }
12865
12866 if (!recp)
12867 {
12868 if (pg)
12869 /* Multiply nonzero source values by the corresponding intermediate
12870 result elements, so that the final calculation is the approximate
12871 square root rather than its reciprocal. Select a zero result for
12872 zero source values, to avoid the Inf * 0 -> NaN that we'd get
12873 otherwise. */
12874 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
12875 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
12876 else
12877 {
12878 /* Qualify the approximate reciprocal square root when the
12879 argument is 0.0 by squashing the intermediary result to 0.0. */
12880 rtx xtmp = gen_reg_rtx (mmsk);
12881 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
12882 gen_rtx_SUBREG (mmsk, xdst, 0)));
12883 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
12884
12885 /* Calculate the approximate square root. */
12886 aarch64_emit_mult (xdst, pg, xdst, src);
12887 }
12888 }
12889
12890 /* Finalize the approximation. */
12891 aarch64_emit_mult (dst, pg, xdst, x1);
12892
12893 return true;
12894 }
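 /* A sketch of the maths implemented above: FRSQRTE produces an initial
 estimate x0 ~= 1/sqrt(d), and each FRSQRTS step applies the
 Newton-Raphson refinement

 x_{n+1} = x_n * (3 - d * x_n * x_n) / 2

 since FRSQRTS (a, b) computes (3 - a * b) / 2. For the non-reciprocal
 case the final result is additionally multiplied by the source, using
 sqrt(d) = d * (1/sqrt(d)), with zero inputs masked off to avoid the
 Inf * 0 -> NaN case. */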
12895
12896 /* Emit the instruction sequence to compute the approximation for the division
12897 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
12898
12899 bool
12900 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
12901 {
12902 machine_mode mode = GET_MODE (quo);
12903
12904 if (GET_MODE_INNER (mode) == HFmode)
12905 return false;
12906
12907 bool use_approx_division_p = (flag_mlow_precision_div
12908 || (aarch64_tune_params.approx_modes->division
12909 & AARCH64_APPROX_MODE (mode)));
12910
12911 if (!flag_finite_math_only
12912 || flag_trapping_math
12913 || !flag_unsafe_math_optimizations
12914 || optimize_function_for_size_p (cfun)
12915 || !use_approx_division_p)
12916 return false;
12917
12918 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
12919 return false;
12920
12921 rtx pg = NULL_RTX;
12922 if (aarch64_sve_mode_p (mode))
12923 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
12924
12925 /* Estimate the approximate reciprocal. */
12926 rtx xrcp = gen_reg_rtx (mode);
12927 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
12928
12929 /* Iterate over the series twice for SF and thrice for DF. */
12930 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12931
12932 /* Optionally iterate over the series less for faster performance,
12933 while sacrificing some accuracy. The default is 2 for DF and 1 for SF. */
12934 if (flag_mlow_precision_div)
12935 iterations = (GET_MODE_INNER (mode) == DFmode
12936 ? aarch64_double_recp_precision
12937 : aarch64_float_recp_precision);
12938
12939 /* Iterate over the series to calculate the approximate reciprocal. */
12940 rtx xtmp = gen_reg_rtx (mode);
12941 while (iterations--)
12942 {
12943 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
12944
12945 if (iterations > 0)
12946 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
12947 }
12948
12949 if (num != CONST1_RTX (mode))
12950 {
12951 /* As the approximate reciprocal of DEN is already calculated, only
12952 calculate the approximate division when NUM is not 1.0. */
12953 rtx xnum = force_reg (mode, num);
12954 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
12955 }
12956
12957 /* Finalize the approximation. */
12958 aarch64_emit_mult (quo, pg, xrcp, xtmp);
12959 return true;
12960 }
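 /* Similarly to the square-root case above: FRECPE produces an initial
 estimate x0 ~= 1/den, and each FRECPS step applies the Newton-Raphson
 refinement

 x_{n+1} = x_n * (2 - den * x_n)

 since FRECPS (a, b) computes 2 - a * b. The quotient is then formed
 as num * (1/den). */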
12961
12962 /* Return the number of instructions that can be issued per cycle. */
12963 static int
12964 aarch64_sched_issue_rate (void)
12965 {
12966 return aarch64_tune_params.issue_rate;
12967 }
12968
12969 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
12970 static int
12971 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
12972 {
12973 if (DEBUG_INSN_P (insn))
12974 return more;
12975
12976 rtx_code code = GET_CODE (PATTERN (insn));
12977 if (code == USE || code == CLOBBER)
12978 return more;
12979
12980 if (get_attr_type (insn) == TYPE_NO_INSN)
12981 return more;
12982
12983 return more - 1;
12984 }
12985
12986 static int
12987 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
12988 {
12989 int issue_rate = aarch64_sched_issue_rate ();
12990
12991 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
12992 }
12993
12994
12995 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
12996 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
12997 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
12998
12999 static int
13000 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
13001 int ready_index)
13002 {
13003 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
13004 }
13005
13006
13007 /* Vectorizer cost model target hooks. */
13008
13009 /* Implement targetm.vectorize.builtin_vectorization_cost. */
13010 static int
13011 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
13012 tree vectype,
13013 int misalign ATTRIBUTE_UNUSED)
13014 {
13015 unsigned elements;
13016 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
13017 bool fp = false;
13018
13019 if (vectype != NULL)
13020 fp = FLOAT_TYPE_P (vectype);
13021
13022 switch (type_of_cost)
13023 {
13024 case scalar_stmt:
13025 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
13026
13027 case scalar_load:
13028 return costs->scalar_load_cost;
13029
13030 case scalar_store:
13031 return costs->scalar_store_cost;
13032
13033 case vector_stmt:
13034 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
13035
13036 case vector_load:
13037 return costs->vec_align_load_cost;
13038
13039 case vector_store:
13040 return costs->vec_store_cost;
13041
13042 case vec_to_scalar:
13043 return costs->vec_to_scalar_cost;
13044
13045 case scalar_to_vec:
13046 return costs->scalar_to_vec_cost;
13047
13048 case unaligned_load:
13049 case vector_gather_load:
13050 return costs->vec_unalign_load_cost;
13051
13052 case unaligned_store:
13053 case vector_scatter_store:
13054 return costs->vec_unalign_store_cost;
13055
13056 case cond_branch_taken:
13057 return costs->cond_taken_branch_cost;
13058
13059 case cond_branch_not_taken:
13060 return costs->cond_not_taken_branch_cost;
13061
13062 case vec_perm:
13063 return costs->vec_permute_cost;
13064
13065 case vec_promote_demote:
13066 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
13067
13068 case vec_construct:
13069 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
13070 return elements / 2 + 1;
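 /* For example, with an estimated 4 elements per vector this evaluates
 to 4 / 2 + 1 = 3; the heuristic is roughly one cost unit per pair of
 element insertions plus one. */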
13071
13072 default:
13073 gcc_unreachable ();
13074 }
13075 }
13076
13077 /* Return true if STMT_INFO extends the result of a load. */
13078 static bool
13079 aarch64_extending_load_p (stmt_vec_info stmt_info)
13080 {
13081 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13082 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13083 return false;
13084
13085 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
13086 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13087 tree rhs_type = TREE_TYPE (rhs);
13088 if (!INTEGRAL_TYPE_P (lhs_type)
13089 || !INTEGRAL_TYPE_P (rhs_type)
13090 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
13091 return false;
13092
13093 stmt_vec_info def_stmt_info = stmt_info->vinfo->lookup_def (rhs);
13094 return (def_stmt_info
13095 && STMT_VINFO_DATA_REF (def_stmt_info)
13096 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
13097 }
13098
13099 /* Return true if STMT_INFO is an integer truncation. */
13100 static bool
13101 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
13102 {
13103 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13104 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13105 return false;
13106
13107 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13108 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
13109 return (INTEGRAL_TYPE_P (lhs_type)
13110 && INTEGRAL_TYPE_P (rhs_type)
13111 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
13112 }
13113
13114 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
13115 for STMT_INFO, which has cost kind KIND. Adjust the cost as necessary
13116 for SVE targets. */
13117 static unsigned int
13118 aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
13119 unsigned int stmt_cost)
13120 {
13121 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13122 vector register size or number of units. Integer promotions of this
13123 type therefore map to SXT[BHW] or UXT[BHW].
13124
13125 Most loads have extending forms that can do the sign or zero extension
13126 on the fly. Optimistically assume that a load followed by an extension
13127 will fold to this form during combine, and that the extension therefore
13128 comes for free. */
13129 if (kind == vector_stmt && aarch64_extending_load_p (stmt_info))
13130 stmt_cost = 0;
13131
13132 /* For similar reasons, vector_stmt integer truncations are a no-op,
13133 because we can just ignore the unused upper bits of the source. */
13134 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13135 stmt_cost = 0;
13136
13137 return stmt_cost;
13138 }
13139
13140 /* Implement targetm.vectorize.add_stmt_cost. */
13141 static unsigned
13142 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
13143 struct _stmt_vec_info *stmt_info, int misalign,
13144 enum vect_cost_model_location where)
13145 {
13146 unsigned *cost = (unsigned *) data;
13147 unsigned retval = 0;
13148
13149 if (flag_vect_cost_model)
13150 {
13151 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
13152 int stmt_cost =
13153 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13154
13155 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
13156 stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, stmt_cost);
13157
13158 /* Statements in an inner loop relative to the loop being
13159 vectorized are weighted more heavily. The value here is
13160 arbitrary and could potentially be improved with analysis. */
13161 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
13162 count *= 50; /* FIXME */
13163
13164 retval = (unsigned) (count * stmt_cost);
13165 cost[where] += retval;
13166 }
13167
13168 return retval;
13169 }
13170
13171 static void initialize_aarch64_code_model (struct gcc_options *);
13172
13173 /* Parse the TO_PARSE string and put the architecture struct that it
13174 selects into RES and the architectural features into ISA_FLAGS.
13175 Return an aarch64_parse_opt_result describing the parse result.
13176 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13177 When the TO_PARSE string contains an invalid extension,
13178 a copy of the string is created and stored in INVALID_EXTENSION. */
13179
13180 static enum aarch64_parse_opt_result
13181 aarch64_parse_arch (const char *to_parse, const struct processor **res,
13182 uint64_t *isa_flags, std::string *invalid_extension)
13183 {
13184 const char *ext;
13185 const struct processor *arch;
13186 size_t len;
13187
13188 ext = strchr (to_parse, '+');
13189
13190 if (ext != NULL)
13191 len = ext - to_parse;
13192 else
13193 len = strlen (to_parse);
13194
13195 if (len == 0)
13196 return AARCH64_PARSE_MISSING_ARG;
13197
13198
13199 /* Loop through the list of supported ARCHes to find a match. */
13200 for (arch = all_architectures; arch->name != NULL; arch++)
13201 {
13202 if (strlen (arch->name) == len
13203 && strncmp (arch->name, to_parse, len) == 0)
13204 {
13205 uint64_t isa_temp = arch->flags;
13206
13207 if (ext != NULL)
13208 {
13209 /* TO_PARSE string contains at least one extension. */
13210 enum aarch64_parse_opt_result ext_res
13211 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13212
13213 if (ext_res != AARCH64_PARSE_OK)
13214 return ext_res;
13215 }
13216 /* Extension parsing was successful. Confirm the result
13217 arch and ISA flags. */
13218 *res = arch;
13219 *isa_flags = isa_temp;
13220 return AARCH64_PARSE_OK;
13221 }
13222 }
13223
13224 /* ARCH name not found in list. */
13225 return AARCH64_PARSE_INVALID_ARG;
13226 }
13227
13228 /* Parse the TO_PARSE string and put the result tuning in RES and the
13229 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
13230 describing the parse result. If there is an error parsing, RES and
13231 ISA_FLAGS are left unchanged.
13232 When the TO_PARSE string contains an invalid extension,
13233 a copy of the string is created and stored in INVALID_EXTENSION. */
13234
13235 static enum aarch64_parse_opt_result
13236 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
13237 uint64_t *isa_flags, std::string *invalid_extension)
13238 {
13239 const char *ext;
13240 const struct processor *cpu;
13241 size_t len;
13242
13243 ext = strchr (to_parse, '+');
13244
13245 if (ext != NULL)
13246 len = ext - to_parse;
13247 else
13248 len = strlen (to_parse);
13249
13250 if (len == 0)
13251 return AARCH64_PARSE_MISSING_ARG;
13252
13253
13254 /* Loop through the list of supported CPUs to find a match. */
13255 for (cpu = all_cores; cpu->name != NULL; cpu++)
13256 {
13257 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
13258 {
13259 uint64_t isa_temp = cpu->flags;
13260
13261
13262 if (ext != NULL)
13263 {
13264 /* TO_PARSE string contains at least one extension. */
13265 enum aarch64_parse_opt_result ext_res
13266 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13267
13268 if (ext_res != AARCH64_PARSE_OK)
13269 return ext_res;
13270 }
13271 /* Extension parsing was successful. Confirm the result
13272 cpu and ISA flags. */
13273 *res = cpu;
13274 *isa_flags = isa_temp;
13275 return AARCH64_PARSE_OK;
13276 }
13277 }
13278
13279 /* CPU name not found in list. */
13280 return AARCH64_PARSE_INVALID_ARG;
13281 }
13282
13283 /* Parse the TO_PARSE string and put the cpu it selects into RES.
13284 Return an aarch64_parse_opt_result describing the parse result.
13285 If the parsing fails, RES does not change. */
13286
13287 static enum aarch64_parse_opt_result
13288 aarch64_parse_tune (const char *to_parse, const struct processor **res)
13289 {
13290 const struct processor *cpu;
13291
13292 /* Loop through the list of supported CPUs to find a match. */
13293 for (cpu = all_cores; cpu->name != NULL; cpu++)
13294 {
13295 if (strcmp (cpu->name, to_parse) == 0)
13296 {
13297 *res = cpu;
13298 return AARCH64_PARSE_OK;
13299 }
13300 }
13301
13302 /* CPU name not found in list. */
13303 return AARCH64_PARSE_INVALID_ARG;
13304 }
13305
13306 /* Parse TOKEN, which has length LENGTH, to see if it is an option
13307 described in FLAG. If it is, return the index bit for that fusion type.
13308 If not, error (printing OPTION_NAME) and return zero. */
13309
13310 static unsigned int
13311 aarch64_parse_one_option_token (const char *token,
13312 size_t length,
13313 const struct aarch64_flag_desc *flag,
13314 const char *option_name)
13315 {
13316 for (; flag->name != NULL; flag++)
13317 {
13318 if (length == strlen (flag->name)
13319 && !strncmp (flag->name, token, length))
13320 return flag->flag;
13321 }
13322
13323 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
13324 return 0;
13325 }
13326
13327 /* Parse OPTION which is a comma-separated list of flags to enable.
13328 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
13329 default state we inherit from the CPU tuning structures. OPTION_NAME
13330 gives the top-level option we are parsing in the -moverride string,
13331 for use in error messages. */
13332
13333 static unsigned int
13334 aarch64_parse_boolean_options (const char *option,
13335 const struct aarch64_flag_desc *flags,
13336 unsigned int initial_state,
13337 const char *option_name)
13338 {
13339 const char separator = '.';
13340 const char* specs = option;
13341 const char* ntoken = option;
13342 unsigned int found_flags = initial_state;
13343
13344 while ((ntoken = strchr (specs, separator)))
13345 {
13346 size_t token_length = ntoken - specs;
13347 unsigned token_ops = aarch64_parse_one_option_token (specs,
13348 token_length,
13349 flags,
13350 option_name);
13351 /* If we find "none" (or, for simplicity's sake, an error) anywhere
13352 in the token stream, reset the supported operations. So:
13353
13354 adrp+add.cmp+branch.none.adrp+add
13355
13356 would have the result of turning on only adrp+add fusion. */
13357 if (!token_ops)
13358 found_flags = 0;
13359
13360 found_flags |= token_ops;
13361 specs = ++ntoken;
13362 }
13363
13364 /* The string ended with a trailing separator, so the final token is empty. */
13365 if (!(*specs))
13366 {
13367 error ("%s string ill-formed\n", option_name);
13368 return 0;
13369 }
13370
13371 /* We still have one more token to parse. */
13372 size_t token_length = strlen (specs);
13373 unsigned token_ops = aarch64_parse_one_option_token (specs,
13374 token_length,
13375 flags,
13376 option_name);
13377 if (!token_ops)
13378 found_flags = 0;
13379
13380 found_flags |= token_ops;
13381 return found_flags;
13382 }
13383
13384 /* Support for overriding instruction fusion. */
13385
13386 static void
13387 aarch64_parse_fuse_string (const char *fuse_string,
13388 struct tune_params *tune)
13389 {
13390 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
13391 aarch64_fusible_pairs,
13392 tune->fusible_ops,
13393 "fuse=");
13394 }
13395
13396 /* Support for overriding other tuning flags. */
13397
13398 static void
13399 aarch64_parse_tune_string (const char *tune_string,
13400 struct tune_params *tune)
13401 {
13402 tune->extra_tuning_flags
13403 = aarch64_parse_boolean_options (tune_string,
13404 aarch64_tuning_flags,
13405 tune->extra_tuning_flags,
13406 "tune=");
13407 }
13408
13409 /* Parse the sve_width -moverride tuning string in TUNE_STRING.
13410 Accept the valid SVE vector widths allowed by
13411 aarch64_sve_vector_bits_enum and use it to override sve_width
13412 in TUNE. */
13413
13414 static void
13415 aarch64_parse_sve_width_string (const char *tune_string,
13416 struct tune_params *tune)
13417 {
13418 int width = -1;
13419
13420 int n = sscanf (tune_string, "%d", &width);
13421 if (n != 1)
13422 {
13423 error ("invalid format for sve_width");
13424 return;
13425 }
13426 switch (width)
13427 {
13428 case SVE_128:
13429 case SVE_256:
13430 case SVE_512:
13431 case SVE_1024:
13432 case SVE_2048:
13433 break;
13434 default:
13435 error ("invalid sve_width value: %d", width);
13436 }
13437 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
13438 }
13439
13440 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
13441 we understand. If it is, extract the option string and hand it off to
13442 the appropriate function. */
13443
13444 void
13445 aarch64_parse_one_override_token (const char* token,
13446 size_t length,
13447 struct tune_params *tune)
13448 {
13449 const struct aarch64_tuning_override_function *fn
13450 = aarch64_tuning_override_functions;
13451
13452 const char *option_part = strchr (token, '=');
13453 if (!option_part)
13454 {
13455 error ("tuning string missing in option (%s)", token);
13456 return;
13457 }
13458
13459 /* Get the length of the option name. */
13460 length = option_part - token;
13461 /* Skip the '=' to get to the option string. */
13462 option_part++;
13463
13464 for (; fn->name != NULL; fn++)
13465 {
13466 if (!strncmp (fn->name, token, length))
13467 {
13468 fn->parse_override (option_part, tune);
13469 return;
13470 }
13471 }
13472
13473 error ("unknown tuning option (%s)",token);
13474 return;
13475 }
13476
13477 /* Validate and clamp the TLS size for the selected code model. */
13478
13479 static void
13480 initialize_aarch64_tls_size (struct gcc_options *opts)
13481 {
13482 if (aarch64_tls_size == 0)
13483 aarch64_tls_size = 24;
13484
13485 switch (opts->x_aarch64_cmodel_var)
13486 {
13487 case AARCH64_CMODEL_TINY:
13488 /* Both the default and maximum TLS size allowed under tiny are 1M, which
13489 needs two instructions to address, so we clamp the size to 24. */
13490 if (aarch64_tls_size > 24)
13491 aarch64_tls_size = 24;
13492 break;
13493 case AARCH64_CMODEL_SMALL:
13494 /* The maximum TLS size allowed under small is 4G. */
13495 if (aarch64_tls_size > 32)
13496 aarch64_tls_size = 32;
13497 break;
13498 case AARCH64_CMODEL_LARGE:
13499 /* The maximum TLS size allowed under large is 16E.
13500 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
13501 if (aarch64_tls_size > 48)
13502 aarch64_tls_size = 48;
13503 break;
13504 default:
13505 gcc_unreachable ();
13506 }
13507
13508 return;
13509 }
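 /* For example, an explicit -mtls-size=32 is clamped to 24 here under the
 tiny code model, and -mtls-size=48 is left unchanged only under the
 large model (small clamps it to 32). */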
13510
13511 /* Parse STRING looking for options in the format:
13512 string :: option:string
13513 option :: name=substring
13514 name :: {a-z}
13515 substring :: defined by option. */
13516
13517 static void
13518 aarch64_parse_override_string (const char* input_string,
13519 struct tune_params* tune)
13520 {
13521 const char separator = ':';
13522 size_t string_length = strlen (input_string) + 1;
13523 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
13524 char *string = string_root;
13525 strncpy (string, input_string, string_length);
13526 string[string_length - 1] = '\0';
13527
13528 char* ntoken = string;
13529
13530 while ((ntoken = strchr (string, separator)))
13531 {
13532 size_t token_length = ntoken - string;
13533 /* NUL-terminate this substring so it can be parsed on its own. */
13534 *ntoken = '\0';
13535 aarch64_parse_one_override_token (string, token_length, tune);
13536 string = ++ntoken;
13537 }
13538
13539 /* One last option to parse. */
13540 aarch64_parse_one_override_token (string, strlen (string), tune);
13541 free (string_root);
13542 }
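 /* For example (illustrative only), a string such as

 -moverride=fuse=adrp+add.cmp+branch:sve_width=256

 is split on ':' here into "fuse=adrp+add.cmp+branch" and
 "sve_width=256", and each "name=value" token is then dispatched to the
 matching parser by aarch64_parse_one_override_token. */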
13543
13544
13545 static void
13546 aarch64_override_options_after_change_1 (struct gcc_options *opts)
13547 {
13548 if (accepted_branch_protection_string)
13549 {
13550 opts->x_aarch64_branch_protection_string
13551 = xstrdup (accepted_branch_protection_string);
13552 }
13553
13554 /* PR 70044: We have to be careful about being called multiple times for the
13555 same function. This means all changes should be repeatable. */
13556
13557 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
13558 Disable the frame pointer flag so the mid-end will not use a frame
13559 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
13560 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
13561 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
13562 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
13563 if (opts->x_flag_omit_frame_pointer == 0)
13564 opts->x_flag_omit_frame_pointer = 2;
13565
13566 /* If not optimizing for size, set the default
13567 alignment to what the target wants. */
13568 if (!opts->x_optimize_size)
13569 {
13570 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
13571 opts->x_str_align_loops = aarch64_tune_params.loop_align;
13572 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
13573 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
13574 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
13575 opts->x_str_align_functions = aarch64_tune_params.function_align;
13576 }
13577
13578 /* We default to no pc-relative literal loads. */
13579
13580 aarch64_pcrelative_literal_loads = false;
13581
13582 /* If -mpc-relative-literal-loads is set on the command line, this
13583 implies that the user asked for PC relative literal loads. */
13584 if (opts->x_pcrelative_literal_loads == 1)
13585 aarch64_pcrelative_literal_loads = true;
13586
13587 /* In the tiny memory model it makes no sense to disallow PC relative
13588 literal pool loads. */
13589 if (aarch64_cmodel == AARCH64_CMODEL_TINY
13590 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
13591 aarch64_pcrelative_literal_loads = true;
13592
13593 /* When enabling the lower precision Newton series for the square root, also
13594 enable it for the reciprocal square root, since the latter is an
13595 intermediary step for the former. */
13596 if (flag_mlow_precision_sqrt)
13597 flag_mrecip_low_precision_sqrt = true;
13598 }
13599
13600 /* 'Unpack' the internal tuning structs and update the options
13601 in OPTS. The caller must have set up selected_tune and selected_arch
13602 as all the other target-specific codegen decisions are
13603 derived from them. */
13604
13605 void
13606 aarch64_override_options_internal (struct gcc_options *opts)
13607 {
13608 aarch64_tune_flags = selected_tune->flags;
13609 aarch64_tune = selected_tune->sched_core;
13610 /* Make a copy of the tuning parameters attached to the core, which
13611 we may later overwrite. */
13612 aarch64_tune_params = *(selected_tune->tune);
13613 aarch64_architecture_version = selected_arch->architecture_version;
13614
13615 if (opts->x_aarch64_override_tune_string)
13616 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
13617 &aarch64_tune_params);
13618
13619 /* This target defaults to strict volatile bitfields. */
13620 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
13621 opts->x_flag_strict_volatile_bitfields = 1;
13622
13623 if (aarch64_stack_protector_guard == SSP_GLOBAL
13624 && opts->x_aarch64_stack_protector_guard_offset_str)
13625 {
13626 error ("incompatible options %<-mstack-protector-guard=global%> and "
13627 "%<-mstack-protector-guard-offset=%s%>",
13628 aarch64_stack_protector_guard_offset_str);
13629 }
13630
13631 if (aarch64_stack_protector_guard == SSP_SYSREG
13632 && !(opts->x_aarch64_stack_protector_guard_offset_str
13633 && opts->x_aarch64_stack_protector_guard_reg_str))
13634 {
13635 error ("both %<-mstack-protector-guard-offset%> and "
13636 "%<-mstack-protector-guard-reg%> must be used "
13637 "with %<-mstack-protector-guard=sysreg%>");
13638 }
13639
13640 if (opts->x_aarch64_stack_protector_guard_reg_str)
13641 {
13642 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
13643 error ("specify a system register with a small string length.");
13644 }
13645
13646 if (opts->x_aarch64_stack_protector_guard_offset_str)
13647 {
13648 char *end;
13649 const char *str = aarch64_stack_protector_guard_offset_str;
13650 errno = 0;
13651 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
13652 if (!*str || *end || errno)
13653 error ("%qs is not a valid offset in %qs", str,
13654 "-mstack-protector-guard-offset=");
13655 aarch64_stack_protector_guard_offset = offs;
13656 }
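 /* Illustrative usage of the checks above (the register name and offset
 are only examples):

 -mstack-protector-guard=sysreg
 -mstack-protector-guard-reg=sp_el0
 -mstack-protector-guard-offset=1032

 where sysreg requires both the register and the offset to be given. */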
13657
13658 initialize_aarch64_code_model (opts);
13659 initialize_aarch64_tls_size (opts);
13660
13661 int queue_depth = 0;
13662 switch (aarch64_tune_params.autoprefetcher_model)
13663 {
13664 case tune_params::AUTOPREFETCHER_OFF:
13665 queue_depth = -1;
13666 break;
13667 case tune_params::AUTOPREFETCHER_WEAK:
13668 queue_depth = 0;
13669 break;
13670 case tune_params::AUTOPREFETCHER_STRONG:
13671 queue_depth = max_insn_queue_index + 1;
13672 break;
13673 default:
13674 gcc_unreachable ();
13675 }
13676
13677 /* We don't mind passing in global_options_set here as we don't use
13678 the *options_set structs anyway. */
13679 SET_OPTION_IF_UNSET (opts, &global_options_set,
13680 param_sched_autopref_queue_depth, queue_depth);
13681
13682 /* Set up parameters to be used in prefetching algorithm. Do not
13683 override the defaults unless we are tuning for a core we have
13684 researched values for. */
13685 if (aarch64_tune_params.prefetch->num_slots > 0)
13686 SET_OPTION_IF_UNSET (opts, &global_options_set,
13687 param_simultaneous_prefetches,
13688 aarch64_tune_params.prefetch->num_slots);
13689 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
13690 SET_OPTION_IF_UNSET (opts, &global_options_set,
13691 param_l1_cache_size,
13692 aarch64_tune_params.prefetch->l1_cache_size);
13693 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
13694 SET_OPTION_IF_UNSET (opts, &global_options_set,
13695 param_l1_cache_line_size,
13696 aarch64_tune_params.prefetch->l1_cache_line_size);
13697 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
13698 SET_OPTION_IF_UNSET (opts, &global_options_set,
13699 param_l2_cache_size,
13700 aarch64_tune_params.prefetch->l2_cache_size);
13701 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
13702 SET_OPTION_IF_UNSET (opts, &global_options_set,
13703 param_prefetch_dynamic_strides, 0);
13704 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
13705 SET_OPTION_IF_UNSET (opts, &global_options_set,
13706 param_prefetch_minimum_stride,
13707 aarch64_tune_params.prefetch->minimum_stride);
13708
13709 /* Use the alternative scheduling-pressure algorithm by default. */
13710 SET_OPTION_IF_UNSET (opts, &global_options_set,
13711 param_sched_pressure_algorithm,
13712 SCHED_PRESSURE_MODEL);
13713
13714 /* Validate the guard size. */
13715 int guard_size = param_stack_clash_protection_guard_size;
13716
13717 if (guard_size != 12 && guard_size != 16)
13718 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
13719 "size. Given value %d (%llu KB) is out of range",
13720 guard_size, (1ULL << guard_size) / 1024ULL);
13721
13722 /* Enforce that interval is the same size as size so the mid-end does the
13723 right thing. */
13724 SET_OPTION_IF_UNSET (opts, &global_options_set,
13725 param_stack_clash_protection_probe_interval,
13726 guard_size);
13727
13728 /* The SET_OPTION_IF_UNSET calls won't update the value if the user has
13729 explicitly set one, which means we need to validate that the probing
13730 interval and guard size are equal. */
13731 int probe_interval
13732 = param_stack_clash_protection_probe_interval;
13733 if (guard_size != probe_interval)
13734 error ("stack clash guard size %<%d%> must be equal to probing interval "
13735 "%<%d%>", guard_size, probe_interval);
13736
13737 /* Enable software prefetching at the specified optimization level for
13738 CPUs that have prefetch. Lower the optimization level threshold by 1
13739 when profiling is enabled. */
13740 if (opts->x_flag_prefetch_loop_arrays < 0
13741 && !opts->x_optimize_size
13742 && aarch64_tune_params.prefetch->default_opt_level >= 0
13743 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
13744 opts->x_flag_prefetch_loop_arrays = 1;
13745
13746 if (opts->x_aarch64_arch_string == NULL)
13747 opts->x_aarch64_arch_string = selected_arch->name;
13748 if (opts->x_aarch64_cpu_string == NULL)
13749 opts->x_aarch64_cpu_string = selected_cpu->name;
13750 if (opts->x_aarch64_tune_string == NULL)
13751 opts->x_aarch64_tune_string = selected_tune->name;
13752
13753 aarch64_override_options_after_change_1 (opts);
13754 }
13755
13756 /* Print a hint with a suggestion for a core or architecture name that
13757 most closely resembles what the user passed in STR. ARCH is true if
13758 the user is asking for an architecture name. ARCH is false if the user
13759 is asking for a core name. */
13760
13761 static void
13762 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
13763 {
13764 auto_vec<const char *> candidates;
13765 const struct processor *entry = arch ? all_architectures : all_cores;
13766 for (; entry->name != NULL; entry++)
13767 candidates.safe_push (entry->name);
13768
13769 #ifdef HAVE_LOCAL_CPU_DETECT
13770 /* Add also "native" as possible value. */
13771 if (arch)
13772 candidates.safe_push ("native");
13773 #endif
13774
13775 char *s;
13776 const char *hint = candidates_list_and_hint (str, s, candidates);
13777 if (hint)
13778 inform (input_location, "valid arguments are: %s;"
13779 " did you mean %qs?", s, hint);
13780 else
13781 inform (input_location, "valid arguments are: %s", s);
13782
13783 XDELETEVEC (s);
13784 }
13785
13786 /* Print a hint with a suggestion for a core name that most closely resembles
13787 what the user passed in STR. */
13788
13789 inline static void
13790 aarch64_print_hint_for_core (const char *str)
13791 {
13792 aarch64_print_hint_for_core_or_arch (str, false);
13793 }
13794
13795 /* Print a hint with a suggestion for an architecture name that most closely
13796 resembles what the user passed in STR. */
13797
13798 inline static void
13799 aarch64_print_hint_for_arch (const char *str)
13800 {
13801 aarch64_print_hint_for_core_or_arch (str, true);
13802 }
13803
13804
13805 /* Print a hint with a suggestion for an extension name
13806 that most closely resembles what the user passed in STR. */
13807
13808 void
13809 aarch64_print_hint_for_extensions (const std::string &str)
13810 {
13811 auto_vec<const char *> candidates;
13812 aarch64_get_all_extension_candidates (&candidates);
13813 char *s;
13814 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
13815 if (hint)
13816 inform (input_location, "valid arguments are: %s;"
13817 " did you mean %qs?", s, hint);
13818 else
13819 inform (input_location, "valid arguments are: %s;", s);
13820
13821 XDELETEVEC (s);
13822 }
13823
13824 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
13825 specified in STR and throw errors if appropriate. Put the results, if
13826 they are valid, in RES and ISA_FLAGS. Return whether the option is
13827 valid. */
13828
13829 static bool
13830 aarch64_validate_mcpu (const char *str, const struct processor **res,
13831 uint64_t *isa_flags)
13832 {
13833 std::string invalid_extension;
13834 enum aarch64_parse_opt_result parse_res
13835 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
13836
13837 if (parse_res == AARCH64_PARSE_OK)
13838 return true;
13839
13840 switch (parse_res)
13841 {
13842 case AARCH64_PARSE_MISSING_ARG:
13843 error ("missing cpu name in %<-mcpu=%s%>", str);
13844 break;
13845 case AARCH64_PARSE_INVALID_ARG:
13846 error ("unknown value %qs for %<-mcpu%>", str);
13847 aarch64_print_hint_for_core (str);
13848 break;
13849 case AARCH64_PARSE_INVALID_FEATURE:
13850 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
13851 invalid_extension.c_str (), str);
13852 aarch64_print_hint_for_extensions (invalid_extension);
13853 break;
13854 default:
13855 gcc_unreachable ();
13856 }
13857
13858 return false;
13859 }
13860
13861 /* Parse CONST_STR for branch protection features specified in
13862 aarch64_branch_protect_types, and set any global variables required. Return
13863 the parsing result and assign LAST_STR to the last processed token from
13864 CONST_STR so that it can be used for error reporting. */
13865
13866 static enum
13867 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
13868 char** last_str)
13869 {
13870 char *str_root = xstrdup (const_str);
13871 char* token_save = NULL;
13872 char *str = strtok_r (str_root, "+", &token_save);
13873 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
13874 if (!str)
13875 res = AARCH64_PARSE_MISSING_ARG;
13876 else
13877 {
13878 char *next_str = strtok_r (NULL, "+", &token_save);
13879 /* Reset the branch protection features to their defaults. */
13880 aarch64_handle_no_branch_protection (NULL, NULL);
13881
13882 while (str && res == AARCH64_PARSE_OK)
13883 {
13884 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
13885 bool found = false;
13886 /* Search for this type. */
13887 while (type && type->name && !found && res == AARCH64_PARSE_OK)
13888 {
13889 if (strcmp (str, type->name) == 0)
13890 {
13891 found = true;
13892 res = type->handler (str, next_str);
13893 str = next_str;
13894 next_str = strtok_r (NULL, "+", &token_save);
13895 }
13896 else
13897 type++;
13898 }
13899 if (found && res == AARCH64_PARSE_OK)
13900 {
13901 bool found_subtype = true;
13902 /* Loop through each token until we find one that isn't a
13903 subtype. */
13904 while (found_subtype)
13905 {
13906 found_subtype = false;
13907 const aarch64_branch_protect_type *subtype = type->subtypes;
13908 /* Search for the subtype. */
13909 while (str && subtype && subtype->name && !found_subtype
13910 && res == AARCH64_PARSE_OK)
13911 {
13912 if (strcmp (str, subtype->name) == 0)
13913 {
13914 found_subtype = true;
13915 res = subtype->handler (str, next_str);
13916 str = next_str;
13917 next_str = strtok_r (NULL, "+", &token_save);
13918 }
13919 else
13920 subtype++;
13921 }
13922 }
13923 }
13924 else if (!found)
13925 res = AARCH64_PARSE_INVALID_ARG;
13926 }
13927 }
13928 /* Copy the last processed token into the argument to pass it back.
13929 Used by option and attribute validation to print the offending token. */
13930 if (last_str)
13931 {
13932 if (str) strcpy (*last_str, str);
13933 else *last_str = NULL;
13934 }
13935 if (res == AARCH64_PARSE_OK)
13936 {
13937 /* If needed, alloc the accepted string then copy in const_str.
13938 Used by override_option_after_change_1. */
13939 if (!accepted_branch_protection_string)
13940 accepted_branch_protection_string = (char *) xmalloc (
13941 BRANCH_PROTECT_STR_MAX
13942 + 1);
13943 strncpy (accepted_branch_protection_string, const_str,
13944 BRANCH_PROTECT_STR_MAX + 1);
13945 /* Forcibly null-terminate. */
13946 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
13947 }
13948 return res;
13949 }
13950
13951 static bool
13952 aarch64_validate_mbranch_protection (const char *const_str)
13953 {
13954 char *str = (char *) xmalloc (strlen (const_str) + 1);
13955 enum aarch64_parse_opt_result res =
13956 aarch64_parse_branch_protection (const_str, &str);
13957 if (res == AARCH64_PARSE_INVALID_ARG)
13958 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
13959 else if (res == AARCH64_PARSE_MISSING_ARG)
13960 error ("missing argument for %<-mbranch-protection=%>");
13961 free (str);
13962 return res == AARCH64_PARSE_OK;
13963 }
13964
13965 /* Validate a command-line -march option. Parse the arch and extensions
13966 (if any) specified in STR and throw errors if appropriate. Put the
13967 results, if they are valid, in RES and ISA_FLAGS. Return whether the
13968 option is valid. */
13969
13970 static bool
13971 aarch64_validate_march (const char *str, const struct processor **res,
13972 uint64_t *isa_flags)
13973 {
13974 std::string invalid_extension;
13975 enum aarch64_parse_opt_result parse_res
13976 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
13977
13978 if (parse_res == AARCH64_PARSE_OK)
13979 return true;
13980
13981 switch (parse_res)
13982 {
13983 case AARCH64_PARSE_MISSING_ARG:
13984 error ("missing arch name in %<-march=%s%>", str);
13985 break;
13986 case AARCH64_PARSE_INVALID_ARG:
13987 error ("unknown value %qs for %<-march%>", str);
13988 aarch64_print_hint_for_arch (str);
13989 break;
13990 case AARCH64_PARSE_INVALID_FEATURE:
13991 error ("invalid feature modifier %qs in %<-march=%s%>",
13992 invalid_extension.c_str (), str);
13993 aarch64_print_hint_for_extensions (invalid_extension);
13994 break;
13995 default:
13996 gcc_unreachable ();
13997 }
13998
13999 return false;
14000 }
14001
14002 /* Validate a command-line -mtune option. Parse the cpu
14003 specified in STR and throw errors if appropriate. Put the
14004 result, if it is valid, in RES. Return whether the option is
14005 valid. */
14006
14007 static bool
14008 aarch64_validate_mtune (const char *str, const struct processor **res)
14009 {
14010 enum aarch64_parse_opt_result parse_res
14011 = aarch64_parse_tune (str, res);
14012
14013 if (parse_res == AARCH64_PARSE_OK)
14014 return true;
14015
14016 switch (parse_res)
14017 {
14018 case AARCH64_PARSE_MISSING_ARG:
14019 error ("missing cpu name in %<-mtune=%s%>", str);
14020 break;
14021 case AARCH64_PARSE_INVALID_ARG:
14022 error ("unknown value %qs for %<-mtune%>", str);
14023 aarch64_print_hint_for_core (str);
14024 break;
14025 default:
14026 gcc_unreachable ();
14027 }
14028 return false;
14029 }
14030
14031 /* Return the CPU corresponding to the enum CPU.
14032 If it doesn't specify a cpu, return the default. */
14033
14034 static const struct processor *
14035 aarch64_get_tune_cpu (enum aarch64_processor cpu)
14036 {
14037 if (cpu != aarch64_none)
14038 return &all_cores[cpu];
14039
14040 /* The & 0x3f is to extract the bottom 6 bits that encode the
14041 default cpu as selected by the --with-cpu GCC configure option
14042 in config.gcc.
14043 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
14044 flags mechanism should be reworked to make it more sane. */
14045 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14046 }
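
/* For example, the configure-time default is packed so that
   TARGET_CPU_DEFAULT & 0x3f indexes all_cores above while
   TARGET_CPU_DEFAULT >> 6 yields the default ISA flags (see its use in
   aarch64_override_options below); the exact packing is inferred here
   from those two uses rather than restated from config.gcc. */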
14047
14048 /* Return the architecture corresponding to the enum ARCH.
14049 If it doesn't specify a valid architecture, return the default. */
14050
14051 static const struct processor *
14052 aarch64_get_arch (enum aarch64_arch arch)
14053 {
14054 if (arch != aarch64_no_arch)
14055 return &all_architectures[arch];
14056
14057 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14058
14059 return &all_architectures[cpu->arch];
14060 }
14061
14062 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
14063
14064 static poly_uint16
14065 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
14066 {
14067 /* 128-bit SVE and Advanced SIMD modes use different register layouts
14068 on big-endian targets, so we would need to forbid subregs that convert
14069 from one to the other. By default a reinterpret sequence would then
14070 involve a store to memory in one mode and a load back in the other.
14071 Even if we optimize that sequence using reverse instructions,
14072 it would still be a significant potential overhead.
14073
14074 For now, it seems better to generate length-agnostic code for that
14075 case instead. */
14076 if (value == SVE_SCALABLE
14077 || (value == SVE_128 && BYTES_BIG_ENDIAN))
14078 return poly_uint16 (2, 2);
14079 else
14080 return (int) value / 64;
14081 }
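
/* For example, -msve-vector-bits=256 yields a constant VG of
   256 / 64 == 4 (VG counts 64-bit granules), whereas
   -msve-vector-bits=scalable, and -msve-vector-bits=128 on big-endian
   targets, keep the poly_uint16 (2, 2), i.e. a length of 2 + 2 * n
   granules for some runtime value n. */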
14082
14083 /* Implement TARGET_OPTION_OVERRIDE. This is called once at the beginning
14084 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
14085 tuning structs. In particular it must set selected_tune and
14086 aarch64_isa_flags that define the available ISA features and tuning
14087 decisions. It must also set selected_arch as this will be used to
14088 output the .arch asm tags for each function. */
14089
14090 static void
14091 aarch64_override_options (void)
14092 {
14093 uint64_t cpu_isa = 0;
14094 uint64_t arch_isa = 0;
14095 aarch64_isa_flags = 0;
14096
14097 bool valid_cpu = true;
14098 bool valid_tune = true;
14099 bool valid_arch = true;
14100
14101 selected_cpu = NULL;
14102 selected_arch = NULL;
14103 selected_tune = NULL;
14104
14105 if (aarch64_branch_protection_string)
14106 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
14107
14108 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
14109 If either of -march or -mtune is given, they override their
14110 respective component of -mcpu. */
14111 if (aarch64_cpu_string)
14112 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
14113 &cpu_isa);
14114
14115 if (aarch64_arch_string)
14116 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
14117 &arch_isa);
14118
14119 if (aarch64_tune_string)
14120 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
14121
14122 #ifdef SUBTARGET_OVERRIDE_OPTIONS
14123 SUBTARGET_OVERRIDE_OPTIONS;
14124 #endif
14125
14126 /* If the user did not specify a processor, choose the default
14127 one for them. This will be the CPU set during configuration using
14128 --with-cpu, otherwise it is "generic". */
14129 if (!selected_cpu)
14130 {
14131 if (selected_arch)
14132 {
14133 selected_cpu = &all_cores[selected_arch->ident];
14134 aarch64_isa_flags = arch_isa;
14135 explicit_arch = selected_arch->arch;
14136 }
14137 else
14138 {
14139 /* Get default configure-time CPU. */
14140 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
14141 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
14142 }
14143
14144 if (selected_tune)
14145 explicit_tune_core = selected_tune->ident;
14146 }
14147 /* If both -mcpu and -march are specified, check that they are architecturally
14148 compatible, warn if they are not, and prefer the -march ISA flags. */
14149 else if (selected_arch)
14150 {
14151 if (selected_arch->arch != selected_cpu->arch)
14152 {
14153 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
14154 aarch64_cpu_string,
14155 aarch64_arch_string);
14156 }
14157 aarch64_isa_flags = arch_isa;
14158 explicit_arch = selected_arch->arch;
14159 explicit_tune_core = selected_tune ? selected_tune->ident
14160 : selected_cpu->ident;
14161 }
14162 else
14163 {
14164 /* -mcpu but no -march. */
14165 aarch64_isa_flags = cpu_isa;
14166 explicit_tune_core = selected_tune ? selected_tune->ident
14167 : selected_cpu->ident;
14168 gcc_assert (selected_cpu);
14169 selected_arch = &all_architectures[selected_cpu->arch];
14170 explicit_arch = selected_arch->arch;
14171 }
14172
14173 /* Set the arch as well, as we will need it when outputting
14174 the .arch directive in assembly. */
14175 if (!selected_arch)
14176 {
14177 gcc_assert (selected_cpu);
14178 selected_arch = &all_architectures[selected_cpu->arch];
14179 }
14180
14181 if (!selected_tune)
14182 selected_tune = selected_cpu;
14183
14184 if (aarch64_enable_bti == 2)
14185 {
14186 #ifdef TARGET_ENABLE_BTI
14187 aarch64_enable_bti = 1;
14188 #else
14189 aarch64_enable_bti = 0;
14190 #endif
14191 }
14192
14193 /* Return address signing is currently not supported for ILP32 targets. For
14194 LP64 targets use the configured option in the absence of a command-line
14195 option for -mbranch-protection. */
14196 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
14197 {
14198 #ifdef TARGET_ENABLE_PAC_RET
14199 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
14200 #else
14201 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
14202 #endif
14203 }
14204
14205 #ifndef HAVE_AS_MABI_OPTION
14206 /* The compiler may have been configured with 2.23.* binutils, which does
14207 not have support for ILP32. */
14208 if (TARGET_ILP32)
14209 error ("assembler does not support %<-mabi=ilp32%>");
14210 #endif
14211
14212 /* Convert -msve-vector-bits to a VG count. */
14213 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
14214
14215 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
14216 sorry ("return address signing is only supported for %<-mabi=lp64%>");
14217
14218 /* Make sure we properly set up the explicit options. */
14219 if ((aarch64_cpu_string && valid_cpu)
14220 || (aarch64_tune_string && valid_tune))
14221 gcc_assert (explicit_tune_core != aarch64_none);
14222
14223 if ((aarch64_cpu_string && valid_cpu)
14224 || (aarch64_arch_string && valid_arch))
14225 gcc_assert (explicit_arch != aarch64_no_arch);
14226
14227 /* The pass to insert speculation tracking runs before
14228 shrink-wrapping and the latter does not know how to update the
14229 tracking status. So disable it in this case. */
14230 if (aarch64_track_speculation)
14231 flag_shrink_wrap = 0;
14232
14233 aarch64_override_options_internal (&global_options);
14234
14235 /* Save these options as the default ones in case we push and pop them later
14236 while processing functions with potential target attributes. */
14237 target_option_default_node = target_option_current_node
14238 = build_target_option_node (&global_options);
14239 }
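
/* For example (core and extension names are illustrative only):
   "-mcpu=cortex-a72 -march=armv8-a+crc" tunes for cortex-a72 but takes
   its ISA flags from -march, warning if the two architectures differ;
   "-mcpu=cortex-a72" alone supplies both the tuning and the ISA flags;
   and "-march=armv8-a" alone selects the architecture's representative
   entry in all_cores for tuning. */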
14240
14241 /* Implement targetm.override_options_after_change. */
14242
14243 static void
14244 aarch64_override_options_after_change (void)
14245 {
14246 aarch64_override_options_after_change_1 (&global_options);
14247 }
14248
14249 static struct machine_function *
14250 aarch64_init_machine_status (void)
14251 {
14252 struct machine_function *machine;
14253 machine = ggc_cleared_alloc<machine_function> ();
14254 return machine;
14255 }
14256
14257 void
14258 aarch64_init_expanders (void)
14259 {
14260 init_machine_status = aarch64_init_machine_status;
14261 }
14262
14263 /* Initialize aarch64_cmodel from the selected code model and the PIC setting. */
14264 static void
14265 initialize_aarch64_code_model (struct gcc_options *opts)
14266 {
14267 if (opts->x_flag_pic)
14268 {
14269 switch (opts->x_aarch64_cmodel_var)
14270 {
14271 case AARCH64_CMODEL_TINY:
14272 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
14273 break;
14274 case AARCH64_CMODEL_SMALL:
14275 #ifdef HAVE_AS_SMALL_PIC_RELOCS
14276 aarch64_cmodel = (flag_pic == 2
14277 ? AARCH64_CMODEL_SMALL_PIC
14278 : AARCH64_CMODEL_SMALL_SPIC);
14279 #else
14280 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
14281 #endif
14282 break;
14283 case AARCH64_CMODEL_LARGE:
14284 sorry ("code model %qs with %<-f%s%>", "large",
14285 opts->x_flag_pic > 1 ? "PIC" : "pic");
14286 break;
14287 default:
14288 gcc_unreachable ();
14289 }
14290 }
14291 else
14292 aarch64_cmodel = opts->x_aarch64_cmodel_var;
14293 }
14294
14295 /* Implement TARGET_OPTION_SAVE. */
14296
14297 static void
14298 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
14299 {
14300 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
14301 ptr->x_aarch64_branch_protection_string
14302 = opts->x_aarch64_branch_protection_string;
14303 }
14304
14305 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
14306 using the information saved in PTR. */
14307
14308 static void
14309 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
14310 {
14311 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
14312 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14313 opts->x_explicit_arch = ptr->x_explicit_arch;
14314 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
14315 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
14316 opts->x_aarch64_branch_protection_string
14317 = ptr->x_aarch64_branch_protection_string;
14318 if (opts->x_aarch64_branch_protection_string)
14319 {
14320 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
14321 NULL);
14322 }
14323
14324 aarch64_override_options_internal (opts);
14325 }
14326
14327 /* Implement TARGET_OPTION_PRINT. */
14328
14329 static void
14330 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
14331 {
14332 const struct processor *cpu
14333 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14334 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
14335 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
14336 std::string extension
14337 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
14338
14339 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
14340 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
14341 arch->name, extension.c_str ());
14342 }
14343
14344 static GTY(()) tree aarch64_previous_fndecl;
14345
14346 void
14347 aarch64_reset_previous_fndecl (void)
14348 {
14349 aarch64_previous_fndecl = NULL;
14350 }
14351
14352 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
14353 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
14354 make sure optab availability predicates are recomputed when necessary. */
14355
14356 void
14357 aarch64_save_restore_target_globals (tree new_tree)
14358 {
14359 if (TREE_TARGET_GLOBALS (new_tree))
14360 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
14361 else if (new_tree == target_option_default_node)
14362 restore_target_globals (&default_target_globals);
14363 else
14364 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
14365 }
14366
14367 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
14368 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
14369 of the function, if such exists. This function may be called multiple
14370 times on a single function so use aarch64_previous_fndecl to avoid
14371 setting up identical state. */
14372
14373 static void
14374 aarch64_set_current_function (tree fndecl)
14375 {
14376 if (!fndecl || fndecl == aarch64_previous_fndecl)
14377 return;
14378
14379 tree old_tree = (aarch64_previous_fndecl
14380 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
14381 : NULL_TREE);
14382
14383 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14384
14385 /* If current function has no attributes but the previous one did,
14386 use the default node. */
14387 if (!new_tree && old_tree)
14388 new_tree = target_option_default_node;
14389
14390 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
14391 the default have been handled by aarch64_save_restore_target_globals from
14392 aarch64_pragma_target_parse. */
14393 if (old_tree == new_tree)
14394 return;
14395
14396 aarch64_previous_fndecl = fndecl;
14397
14398 /* First set the target options. */
14399 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
14400
14401 aarch64_save_restore_target_globals (new_tree);
14402 }
14403
14404 /* Enum describing the various ways we can handle attributes.
14405 In many cases we can reuse the generic option handling machinery. */
14406
14407 enum aarch64_attr_opt_type
14408 {
14409 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
14410 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
14411 aarch64_attr_enum, /* Attribute sets an enum variable. */
14412 aarch64_attr_custom /* Attribute requires a custom handling function. */
14413 };
14414
14415 /* All the information needed to handle a target attribute.
14416 NAME is the name of the attribute.
14417 ATTR_TYPE specifies the type of behavior of the attribute as described
14418 in the definition of enum aarch64_attr_opt_type.
14419 ALLOW_NEG is true if the attribute supports a "no-" form.
14420 HANDLER is the function that takes the attribute string as an argument.
14421 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
14422 OPT_NUM is the enum specifying the option that the attribute modifies.
14423 This is needed for attributes that mirror the behavior of a command-line
14424 option, that is, attributes with ATTR_TYPE aarch64_attr_mask,
14425 aarch64_attr_bool or aarch64_attr_enum. */
14426
14427 struct aarch64_attribute_info
14428 {
14429 const char *name;
14430 enum aarch64_attr_opt_type attr_type;
14431 bool allow_neg;
14432 bool (*handler) (const char *);
14433 enum opt_code opt_num;
14434 };
14435
14436 /* Handle the ARCH_STR argument to the arch= target attribute. */
14437
14438 static bool
14439 aarch64_handle_attr_arch (const char *str)
14440 {
14441 const struct processor *tmp_arch = NULL;
14442 std::string invalid_extension;
14443 enum aarch64_parse_opt_result parse_res
14444 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
14445
14446 if (parse_res == AARCH64_PARSE_OK)
14447 {
14448 gcc_assert (tmp_arch);
14449 selected_arch = tmp_arch;
14450 explicit_arch = selected_arch->arch;
14451 return true;
14452 }
14453
14454 switch (parse_res)
14455 {
14456 case AARCH64_PARSE_MISSING_ARG:
14457 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
14458 break;
14459 case AARCH64_PARSE_INVALID_ARG:
14460 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
14461 aarch64_print_hint_for_arch (str);
14462 break;
14463 case AARCH64_PARSE_INVALID_FEATURE:
14464 error ("invalid feature modifier %s of value (\"%s\") in "
14465 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14466 aarch64_print_hint_for_extensions (invalid_extension);
14467 break;
14468 default:
14469 gcc_unreachable ();
14470 }
14471
14472 return false;
14473 }
14474
14475 /* Handle the argument CPU_STR to the cpu= target attribute. */
14476
14477 static bool
14478 aarch64_handle_attr_cpu (const char *str)
14479 {
14480 const struct processor *tmp_cpu = NULL;
14481 std::string invalid_extension;
14482 enum aarch64_parse_opt_result parse_res
14483 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
14484
14485 if (parse_res == AARCH64_PARSE_OK)
14486 {
14487 gcc_assert (tmp_cpu);
14488 selected_tune = tmp_cpu;
14489 explicit_tune_core = selected_tune->ident;
14490
14491 selected_arch = &all_architectures[tmp_cpu->arch];
14492 explicit_arch = selected_arch->arch;
14493 return true;
14494 }
14495
14496 switch (parse_res)
14497 {
14498 case AARCH64_PARSE_MISSING_ARG:
14499 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
14500 break;
14501 case AARCH64_PARSE_INVALID_ARG:
14502 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
14503 aarch64_print_hint_for_core (str);
14504 break;
14505 case AARCH64_PARSE_INVALID_FEATURE:
14506 error ("invalid feature modifier %s of value (\"%s\") in "
14507 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14508 aarch64_print_hint_for_extensions (invalid_extension);
14509 break;
14510 default:
14511 gcc_unreachable ();
14512 }
14513
14514 return false;
14515 }
14516
14517 /* Handle the argument STR to the branch-protection= attribute. */
14518
14519 static bool
14520 aarch64_handle_attr_branch_protection (const char* str)
14521 {
14522 char *err_str = (char *) xmalloc (strlen (str) + 1);
14523 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
14524 &err_str);
14525 bool success = false;
14526 switch (res)
14527 {
14528 case AARCH64_PARSE_MISSING_ARG:
14529 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
14530 " attribute");
14531 break;
14532 case AARCH64_PARSE_INVALID_ARG:
14533 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
14534 "=\")%> pragma or attribute", err_str);
14535 break;
14536 case AARCH64_PARSE_OK:
14537 success = true;
14538 /* Fall through. */
14539 case AARCH64_PARSE_INVALID_FEATURE:
14540 break;
14541 default:
14542 gcc_unreachable ();
14543 }
14544 free (err_str);
14545 return success;
14546 }
14547
14548 /* Handle the argument STR to the tune= target attribute. */
14549
14550 static bool
14551 aarch64_handle_attr_tune (const char *str)
14552 {
14553 const struct processor *tmp_tune = NULL;
14554 enum aarch64_parse_opt_result parse_res
14555 = aarch64_parse_tune (str, &tmp_tune);
14556
14557 if (parse_res == AARCH64_PARSE_OK)
14558 {
14559 gcc_assert (tmp_tune);
14560 selected_tune = tmp_tune;
14561 explicit_tune_core = selected_tune->ident;
14562 return true;
14563 }
14564
14565 switch (parse_res)
14566 {
14567 case AARCH64_PARSE_INVALID_ARG:
14568 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
14569 aarch64_print_hint_for_core (str);
14570 break;
14571 default:
14572 gcc_unreachable ();
14573 }
14574
14575 return false;
14576 }
14577
14578 /* Parse an architecture extensions target attribute string specified in STR.
14579 For example "+fp+nosimd". Show any errors if needed. Return TRUE
14580 if successful. Update aarch64_isa_flags to reflect the ISA features
14581 modified. */
14582
14583 static bool
14584 aarch64_handle_attr_isa_flags (char *str)
14585 {
14586 enum aarch64_parse_opt_result parse_res;
14587 uint64_t isa_flags = aarch64_isa_flags;
14588
14589 /* We allow "+nothing" in the beginning to clear out all architectural
14590 features if the user wants to handpick specific features. */
14591 if (strncmp ("+nothing", str, 8) == 0)
14592 {
14593 isa_flags = 0;
14594 str += 8;
14595 }
14596
14597 std::string invalid_extension;
14598 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
14599
14600 if (parse_res == AARCH64_PARSE_OK)
14601 {
14602 aarch64_isa_flags = isa_flags;
14603 return true;
14604 }
14605
14606 switch (parse_res)
14607 {
14608 case AARCH64_PARSE_MISSING_ARG:
14609 error ("missing value in %<target()%> pragma or attribute");
14610 break;
14611
14612 case AARCH64_PARSE_INVALID_FEATURE:
14613 error ("invalid feature modifier %s of value (\"%s\") in "
14614 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14615 break;
14616
14617 default:
14618 gcc_unreachable ();
14619 }
14620
14621 return false;
14622 }
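
/* For example, a string of "+nothing+simd" first clears ISA_FLAGS and
   then lets aarch64_parse_extension enable SIMD (plus whatever features
   it implies), while "+nofp" merely removes the FP feature, and the
   features that depend on it, from the current aarch64_isa_flags. */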
14623
14624 /* The target attributes that we support. On top of these we also support just
14625 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
14626 handled explicitly in aarch64_process_one_target_attr. */
14627
14628 static const struct aarch64_attribute_info aarch64_attributes[] =
14629 {
14630 { "general-regs-only", aarch64_attr_mask, false, NULL,
14631 OPT_mgeneral_regs_only },
14632 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
14633 OPT_mfix_cortex_a53_835769 },
14634 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
14635 OPT_mfix_cortex_a53_843419 },
14636 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
14637 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
14638 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
14639 OPT_momit_leaf_frame_pointer },
14640 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
14641 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
14642 OPT_march_ },
14643 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
14644 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
14645 OPT_mtune_ },
14646 { "branch-protection", aarch64_attr_custom, false,
14647 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
14648 { "sign-return-address", aarch64_attr_enum, false, NULL,
14649 OPT_msign_return_address_ },
14650 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
14651 };
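
/* For example, __attribute__ ((target ("no-omit-leaf-frame-pointer")))
   matches the aarch64_attr_bool entry above with its "no-" prefix
   honoured through ALLOW_NEG, "cmodel=small" is routed through the
   aarch64_attr_enum machinery of OPT_mcmodel_, and "arch=armv8.2-a" is
   handed to the aarch64_handle_attr_arch custom handler ("small" and
   "armv8.2-a" are just illustrative values). */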
14652
14653 /* Parse ARG_STR which contains the definition of one target attribute.
14654 Show appropriate errors if any or return true if the attribute is valid. */
14655
14656 static bool
14657 aarch64_process_one_target_attr (char *arg_str)
14658 {
14659 bool invert = false;
14660
14661 size_t len = strlen (arg_str);
14662
14663 if (len == 0)
14664 {
14665 error ("malformed %<target()%> pragma or attribute");
14666 return false;
14667 }
14668
14669 char *str_to_check = (char *) alloca (len + 1);
14670 strcpy (str_to_check, arg_str);
14671
14672 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
14673 It is easier to detect and handle it explicitly here rather than going
14674 through the machinery for the rest of the target attributes in this
14675 function. */
14676 if (*str_to_check == '+')
14677 return aarch64_handle_attr_isa_flags (str_to_check);
14678
14679 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
14680 {
14681 invert = true;
14682 str_to_check += 3;
14683 }
14684 char *arg = strchr (str_to_check, '=');
14685
14686 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
14687 and point ARG to "foo". */
14688 if (arg)
14689 {
14690 *arg = '\0';
14691 arg++;
14692 }
14693 const struct aarch64_attribute_info *p_attr;
14694 bool found = false;
14695 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
14696 {
14697 /* If the names don't match up, or the user has given an argument
14698 to an attribute that doesn't accept one, or didn't give an argument
14699 to an attribute that expects one, fail to match. */
14700 if (strcmp (str_to_check, p_attr->name) != 0)
14701 continue;
14702
14703 found = true;
14704 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
14705 || p_attr->attr_type == aarch64_attr_enum;
14706
14707 if (attr_need_arg_p ^ (arg != NULL))
14708 {
14709 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
14710 return false;
14711 }
14712
14713 /* If the name matches but the attribute does not allow "no-" versions
14714 then we can't match. */
14715 if (invert && !p_attr->allow_neg)
14716 {
14717 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
14718 return false;
14719 }
14720
14721 switch (p_attr->attr_type)
14722 {
14723 /* Has a custom handler registered.
14724 For example, cpu=, arch=, tune=. */
14725 case aarch64_attr_custom:
14726 gcc_assert (p_attr->handler);
14727 if (!p_attr->handler (arg))
14728 return false;
14729 break;
14730
14731 /* Either set or unset a boolean option. */
14732 case aarch64_attr_bool:
14733 {
14734 struct cl_decoded_option decoded;
14735
14736 generate_option (p_attr->opt_num, NULL, !invert,
14737 CL_TARGET, &decoded);
14738 aarch64_handle_option (&global_options, &global_options_set,
14739 &decoded, input_location);
14740 break;
14741 }
14742 /* Set or unset a bit in the target_flags. aarch64_handle_option
14743 should know what mask to apply given the option number. */
14744 case aarch64_attr_mask:
14745 {
14746 struct cl_decoded_option decoded;
14747 /* We only need to specify the option number.
14748 aarch64_handle_option will know which mask to apply. */
14749 decoded.opt_index = p_attr->opt_num;
14750 decoded.value = !invert;
14751 aarch64_handle_option (&global_options, &global_options_set,
14752 &decoded, input_location);
14753 break;
14754 }
14755 /* Use the option setting machinery to set an option to an enum. */
14756 case aarch64_attr_enum:
14757 {
14758 gcc_assert (arg);
14759 bool valid;
14760 int value;
14761 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
14762 &value, CL_TARGET);
14763 if (valid)
14764 {
14765 set_option (&global_options, NULL, p_attr->opt_num, value,
14766 NULL, DK_UNSPECIFIED, input_location,
14767 global_dc);
14768 }
14769 else
14770 {
14771 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
14772 }
14773 break;
14774 }
14775 default:
14776 gcc_unreachable ();
14777 }
14778 }
14779
14780 /* If we reached here we either have found an attribute and validated
14781 it or didn't match any. If we matched an attribute but its arguments
14782 were malformed we will have returned false already. */
14783 return found;
14784 }
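
/* As a worked example of the function above: "tune=cortex-a57" is split
   at the '=' so STR_TO_CHECK becomes "tune" and ARG "cortex-a57", which
   the matching aarch64_attr_custom entry forwards to
   aarch64_handle_attr_tune; "no-strict-align" strips the "no-" prefix,
   sets INVERT and clears the corresponding target_flags bit via
   aarch64_handle_option; and a leading '+', as in "+crc", bypasses the
   table entirely through aarch64_handle_attr_isa_flags. */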
14785
14786 /* Count how many times the character C appears in
14787 NULL-terminated string STR. */
14788
14789 static unsigned int
14790 num_occurences_in_str (char c, char *str)
14791 {
14792 unsigned int res = 0;
14793 while (*str != '\0')
14794 {
14795 if (*str == c)
14796 res++;
14797
14798 str++;
14799 }
14800
14801 return res;
14802 }
14803
14804 /* Parse the tree in ARGS that contains the target attribute information
14805 and update the global target options space. */
14806
14807 bool
14808 aarch64_process_target_attr (tree args)
14809 {
14810 if (TREE_CODE (args) == TREE_LIST)
14811 {
14812 do
14813 {
14814 tree head = TREE_VALUE (args);
14815 if (head)
14816 {
14817 if (!aarch64_process_target_attr (head))
14818 return false;
14819 }
14820 args = TREE_CHAIN (args);
14821 } while (args);
14822
14823 return true;
14824 }
14825
14826 if (TREE_CODE (args) != STRING_CST)
14827 {
14828 error ("attribute %<target%> argument not a string");
14829 return false;
14830 }
14831
14832 size_t len = strlen (TREE_STRING_POINTER (args));
14833 char *str_to_check = (char *) alloca (len + 1);
14834 strcpy (str_to_check, TREE_STRING_POINTER (args));
14835
14836 if (len == 0)
14837 {
14838 error ("malformed %<target()%> pragma or attribute");
14839 return false;
14840 }
14841
14842 /* Used to catch empty strings between commas, e.g.
14843 attribute ((target ("attr1,,attr2"))). */
14844 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
14845
14846 /* Handle multiple target attributes separated by ','. */
14847 char *token = strtok_r (str_to_check, ",", &str_to_check);
14848
14849 unsigned int num_attrs = 0;
14850 while (token)
14851 {
14852 num_attrs++;
14853 if (!aarch64_process_one_target_attr (token))
14854 {
14855 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
14856 return false;
14857 }
14858
14859 token = strtok_r (NULL, ",", &str_to_check);
14860 }
14861
14862 if (num_attrs != num_commas + 1)
14863 {
14864 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
14865 return false;
14866 }
14867
14868 return true;
14869 }
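
/* For example, __attribute__ ((target ("strict-align,tune=cortex-a57")))
   is tokenised on ',' into two attributes that are processed in turn,
   whereas "strict-align,,tune=cortex-a57" produces only two tokens for
   three comma-separated fields, so the NUM_ATTRS != NUM_COMMAS + 1 check
   above rejects it as malformed. */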
14870
14871 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
14872 process attribute ((target ("..."))). */
14873
14874 static bool
14875 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
14876 {
14877 struct cl_target_option cur_target;
14878 bool ret;
14879 tree old_optimize;
14880 tree new_target, new_optimize;
14881 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14882
14883 /* If what we're processing is the current pragma string then the
14884 target option node is already stored in target_option_current_node
14885 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
14886 having to re-parse the string. This is especially useful to keep
14887 arm_neon.h compile times down since that header contains a lot
14888 of intrinsics enclosed in pragmas. */
14889 if (!existing_target && args == current_target_pragma)
14890 {
14891 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
14892 return true;
14893 }
14894 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14895
14896 old_optimize = build_optimization_node (&global_options);
14897 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14898
14899 /* If the function changed the optimization levels as well as setting
14900 target options, start with the optimizations specified. */
14901 if (func_optimize && func_optimize != old_optimize)
14902 cl_optimization_restore (&global_options,
14903 TREE_OPTIMIZATION (func_optimize));
14904
14905 /* Save the current target options to restore at the end. */
14906 cl_target_option_save (&cur_target, &global_options);
14907
14908 /* If fndecl already has some target attributes applied to it, unpack
14909 them so that we add this attribute on top of them, rather than
14910 overwriting them. */
14911 if (existing_target)
14912 {
14913 struct cl_target_option *existing_options
14914 = TREE_TARGET_OPTION (existing_target);
14915
14916 if (existing_options)
14917 cl_target_option_restore (&global_options, existing_options);
14918 }
14919 else
14920 cl_target_option_restore (&global_options,
14921 TREE_TARGET_OPTION (target_option_current_node));
14922
14923 ret = aarch64_process_target_attr (args);
14924
14925 /* Set up any additional state. */
14926 if (ret)
14927 {
14928 aarch64_override_options_internal (&global_options);
14929 /* Initialize SIMD builtins if we haven't already.
14930 Set current_target_pragma to NULL for the duration so that
14931 the builtin initialization code doesn't try to tag the functions
14932 being built with the attributes specified by any current pragma, thus
14933 going into an infinite recursion. */
14934 if (TARGET_SIMD)
14935 {
14936 tree saved_current_target_pragma = current_target_pragma;
14937 current_target_pragma = NULL;
14938 aarch64_init_simd_builtins ();
14939 current_target_pragma = saved_current_target_pragma;
14940 }
14941 new_target = build_target_option_node (&global_options);
14942 }
14943 else
14944 new_target = NULL;
14945
14946 new_optimize = build_optimization_node (&global_options);
14947
14948 if (fndecl && ret)
14949 {
14950 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
14951
14952 if (old_optimize != new_optimize)
14953 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
14954 }
14955
14956 cl_target_option_restore (&global_options, &cur_target);
14957
14958 if (old_optimize != new_optimize)
14959 cl_optimization_restore (&global_options,
14960 TREE_OPTIMIZATION (old_optimize));
14961 return ret;
14962 }
14963
14964 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
14965 tri-bool options (yes, no, don't care) and the default value is
14966 DEF, determine whether to reject inlining. */
14967
14968 static bool
14969 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
14970 int dont_care, int def)
14971 {
14972 /* If the callee doesn't care, always allow inlining. */
14973 if (callee == dont_care)
14974 return true;
14975
14976 /* If the caller doesn't care, always allow inlining. */
14977 if (caller == dont_care)
14978 return true;
14979
14980 /* Otherwise, allow inlining if either the callee and caller values
14981 agree, or if the callee is using the default value. */
14982 return (callee == caller || callee == def);
14983 }
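
/* For example, for -momit-leaf-frame-pointer one of the calls below
   passes DONT_CARE == 2 and DEF == 1: a callee with no explicit setting
   (2) can always be inlined; an explicit callee value of 1 is accepted
   because it matches the default even if the caller disagrees; but
   caller == 1 with callee == 0 rejects inlining. */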
14984
14985 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
14986 to inline CALLEE into CALLER based on target-specific info.
14987 Make sure that the caller and callee have compatible architectural
14988 features. Then go through the other possible target attributes
14989 and see if they can block inlining. Try not to reject always_inline
14990 callees unless they are incompatible architecturally. */
14991
14992 static bool
14993 aarch64_can_inline_p (tree caller, tree callee)
14994 {
14995 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
14996 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
14997
14998 struct cl_target_option *caller_opts
14999 = TREE_TARGET_OPTION (caller_tree ? caller_tree
15000 : target_option_default_node);
15001
15002 struct cl_target_option *callee_opts
15003 = TREE_TARGET_OPTION (callee_tree ? callee_tree
15004 : target_option_default_node);
15005
15006 /* Callee's ISA flags should be a subset of the caller's. */
15007 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
15008 != callee_opts->x_aarch64_isa_flags)
15009 return false;
15010
15011 /* Allow functions built without strict alignment to be inlined into
15012 strictly aligned ones, but not the other way around. */
15013 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
15014 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
15015 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
15016 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
15017 return false;
15018
15019 bool always_inline = lookup_attribute ("always_inline",
15020 DECL_ATTRIBUTES (callee));
15021
15022 /* If the architectural features match up and the callee is always_inline
15023 then the other attributes don't matter. */
15024 if (always_inline)
15025 return true;
15026
15027 if (caller_opts->x_aarch64_cmodel_var
15028 != callee_opts->x_aarch64_cmodel_var)
15029 return false;
15030
15031 if (caller_opts->x_aarch64_tls_dialect
15032 != callee_opts->x_aarch64_tls_dialect)
15033 return false;
15034
15035 /* Honour explicit requests to work around errata. */
15036 if (!aarch64_tribools_ok_for_inlining_p (
15037 caller_opts->x_aarch64_fix_a53_err835769,
15038 callee_opts->x_aarch64_fix_a53_err835769,
15039 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
15040 return false;
15041
15042 if (!aarch64_tribools_ok_for_inlining_p (
15043 caller_opts->x_aarch64_fix_a53_err843419,
15044 callee_opts->x_aarch64_fix_a53_err843419,
15045 2, TARGET_FIX_ERR_A53_843419))
15046 return false;
15047
15048 /* If the user explicitly specified -momit-leaf-frame-pointer for the
15049 caller and callee and they don't match up, reject inlining. */
15050 if (!aarch64_tribools_ok_for_inlining_p (
15051 caller_opts->x_flag_omit_leaf_frame_pointer,
15052 callee_opts->x_flag_omit_leaf_frame_pointer,
15053 2, 1))
15054 return false;
15055
15056 /* If the callee has specific tuning overrides, respect them. */
15057 if (callee_opts->x_aarch64_override_tune_string != NULL
15058 && caller_opts->x_aarch64_override_tune_string == NULL)
15059 return false;
15060
15061 /* If the user specified tuning override strings for the
15062 caller and callee and they don't match up, reject inlining.
15063 We just do a string compare here, we don't analyze the meaning
15064 of the string, as it would be too costly for little gain. */
15065 if (callee_opts->x_aarch64_override_tune_string
15066 && caller_opts->x_aarch64_override_tune_string
15067 && (strcmp (callee_opts->x_aarch64_override_tune_string,
15068 caller_opts->x_aarch64_override_tune_string) != 0))
15069 return false;
15070
15071 return true;
15072 }
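
/* For example, a callee declared with __attribute__ ((target ("+sve")))
   cannot be inlined into a caller built without SVE, since the callee's
   ISA bits would not be a subset of the caller's, while the reverse
   direction is fine; once that architectural check (and the strict-align
   check) passes, an always_inline callee skips the remaining cmodel,
   tls-dialect, errata and tuning-override comparisons. */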
15073
15074 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
15075 hasn't been initialized already. */
15076
15077 unsigned int
15078 aarch64_tlsdesc_abi_id ()
15079 {
15080 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
15081 if (!tlsdesc_abi.initialized_p ())
15082 {
15083 HARD_REG_SET full_reg_clobbers;
15084 CLEAR_HARD_REG_SET (full_reg_clobbers);
15085 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
15086 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
15087 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
15088 SET_HARD_REG_BIT (full_reg_clobbers, regno);
15089 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
15090 }
15091 return tlsdesc_abi.id ();
15092 }
15093
15094 /* Return true if SYMBOL_REF X binds locally. */
15095
15096 static bool
15097 aarch64_symbol_binds_local_p (const_rtx x)
15098 {
15099 return (SYMBOL_REF_DECL (x)
15100 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
15101 : SYMBOL_REF_LOCAL_P (x));
15102 }
15103
15104 /* Return true if SYMBOL_REF X is thread local */
15105 static bool
15106 aarch64_tls_symbol_p (rtx x)
15107 {
15108 if (! TARGET_HAVE_TLS)
15109 return false;
15110
15111 if (GET_CODE (x) != SYMBOL_REF)
15112 return false;
15113
15114 return SYMBOL_REF_TLS_MODEL (x) != 0;
15115 }
15116
15117 /* Classify a TLS symbol into one of the TLS kinds. */
15118 enum aarch64_symbol_type
15119 aarch64_classify_tls_symbol (rtx x)
15120 {
15121 enum tls_model tls_kind = tls_symbolic_operand_type (x);
15122
15123 switch (tls_kind)
15124 {
15125 case TLS_MODEL_GLOBAL_DYNAMIC:
15126 case TLS_MODEL_LOCAL_DYNAMIC:
15127 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
15128
15129 case TLS_MODEL_INITIAL_EXEC:
15130 switch (aarch64_cmodel)
15131 {
15132 case AARCH64_CMODEL_TINY:
15133 case AARCH64_CMODEL_TINY_PIC:
15134 return SYMBOL_TINY_TLSIE;
15135 default:
15136 return SYMBOL_SMALL_TLSIE;
15137 }
15138
15139 case TLS_MODEL_LOCAL_EXEC:
15140 if (aarch64_tls_size == 12)
15141 return SYMBOL_TLSLE12;
15142 else if (aarch64_tls_size == 24)
15143 return SYMBOL_TLSLE24;
15144 else if (aarch64_tls_size == 32)
15145 return SYMBOL_TLSLE32;
15146 else if (aarch64_tls_size == 48)
15147 return SYMBOL_TLSLE48;
15148 else
15149 gcc_unreachable ();
15150
15151 case TLS_MODEL_EMULATED:
15152 case TLS_MODEL_NONE:
15153 return SYMBOL_FORCE_TO_MEM;
15154
15155 default:
15156 gcc_unreachable ();
15157 }
15158 }
15159
15160 /* Return the correct method for accessing X + OFFSET, where X is either
15161 a SYMBOL_REF or LABEL_REF. */
15162
15163 enum aarch64_symbol_type
15164 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
15165 {
15166 if (GET_CODE (x) == LABEL_REF)
15167 {
15168 switch (aarch64_cmodel)
15169 {
15170 case AARCH64_CMODEL_LARGE:
15171 return SYMBOL_FORCE_TO_MEM;
15172
15173 case AARCH64_CMODEL_TINY_PIC:
15174 case AARCH64_CMODEL_TINY:
15175 return SYMBOL_TINY_ABSOLUTE;
15176
15177 case AARCH64_CMODEL_SMALL_SPIC:
15178 case AARCH64_CMODEL_SMALL_PIC:
15179 case AARCH64_CMODEL_SMALL:
15180 return SYMBOL_SMALL_ABSOLUTE;
15181
15182 default:
15183 gcc_unreachable ();
15184 }
15185 }
15186
15187 if (GET_CODE (x) == SYMBOL_REF)
15188 {
15189 if (aarch64_tls_symbol_p (x))
15190 return aarch64_classify_tls_symbol (x);
15191
15192 switch (aarch64_cmodel)
15193 {
15194 case AARCH64_CMODEL_TINY:
15195 /* When we retrieve symbol + offset address, we have to make sure
15196 the offset does not cause overflow of the final address. But
15197 we have no way of knowing the address of the symbol at compile time,
15198 so we can't accurately say if the distance between the PC and
15199 symbol + offset is outside the addressable range of +/-1MB in the
15200 TINY code model. So we limit the maximum offset to +/-64KB and
15201 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15202 If offset_within_block_p is true we allow larger offsets.
15203 Furthermore force to memory if the symbol is a weak reference to
15204 something that doesn't resolve to a symbol in this module. */
15205
15206 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15207 return SYMBOL_FORCE_TO_MEM;
15208 if (!(IN_RANGE (offset, -0x10000, 0x10000)
15209 || offset_within_block_p (x, offset)))
15210 return SYMBOL_FORCE_TO_MEM;
15211
15212 return SYMBOL_TINY_ABSOLUTE;
15213
15214 case AARCH64_CMODEL_SMALL:
15215 /* Same reasoning as the tiny code model, but the offset cap here is
15216 1MB, allowing +/-3.9GB for the offset to the symbol. */
15217
15218 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15219 return SYMBOL_FORCE_TO_MEM;
15220 if (!(IN_RANGE (offset, -0x100000, 0x100000)
15221 || offset_within_block_p (x, offset)))
15222 return SYMBOL_FORCE_TO_MEM;
15223
15224 return SYMBOL_SMALL_ABSOLUTE;
15225
15226 case AARCH64_CMODEL_TINY_PIC:
15227 if (!aarch64_symbol_binds_local_p (x))
15228 return SYMBOL_TINY_GOT;
15229 return SYMBOL_TINY_ABSOLUTE;
15230
15231 case AARCH64_CMODEL_SMALL_SPIC:
15232 case AARCH64_CMODEL_SMALL_PIC:
15233 if (!aarch64_symbol_binds_local_p (x))
15234 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
15235 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
15236 return SYMBOL_SMALL_ABSOLUTE;
15237
15238 case AARCH64_CMODEL_LARGE:
15239 /* This is alright even in PIC code as the constant
15240 pool reference is always PC relative and within
15241 the same translation unit. */
15242 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
15243 return SYMBOL_SMALL_ABSOLUTE;
15244 else
15245 return SYMBOL_FORCE_TO_MEM;
15246
15247 default:
15248 gcc_unreachable ();
15249 }
15250 }
15251
15252 /* By default push everything into the constant pool. */
15253 return SYMBOL_FORCE_TO_MEM;
15254 }
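
/* For example, in the tiny code model a reference to SYM + 0x8000 stays
   SYMBOL_TINY_ABSOLUTE because the offset is within the +/-64KB cap,
   whereas SYM + 0x200000 is forced to memory unless offset_within_block_p
   proves the address stays inside SYM's own block; the small code model
   applies the same scheme with a +/-1MB cap. */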
15255
15256 bool
15257 aarch64_constant_address_p (rtx x)
15258 {
15259 return (CONSTANT_P (x) && memory_address_p (DImode, x));
15260 }
15261
15262 bool
15263 aarch64_legitimate_pic_operand_p (rtx x)
15264 {
15265 if (GET_CODE (x) == SYMBOL_REF
15266 || (GET_CODE (x) == CONST
15267 && GET_CODE (XEXP (x, 0)) == PLUS
15268 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
15269 return false;
15270
15271 return true;
15272 }
15273
15274 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
15275 that should be rematerialized rather than spilled. */
15276
15277 static bool
15278 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
15279 {
15280 /* Support CSE and rematerialization of common constants. */
15281 if (CONST_INT_P (x)
15282 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
15283 || GET_CODE (x) == CONST_VECTOR)
15284 return true;
15285
15286 /* Do not allow vector struct mode constants for Advanced SIMD.
15287 We could support 0 and -1 easily, but they need support in
15288 aarch64-simd.md. */
15289 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15290 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15291 return false;
15292
15293 /* Only accept variable-length vector constants if they can be
15294 handled directly.
15295
15296 ??? It would be possible to handle rematerialization of other
15297 constants via secondary reloads. */
15298 if (vec_flags & VEC_ANY_SVE)
15299 return aarch64_simd_valid_immediate (x, NULL);
15300
15301 if (GET_CODE (x) == HIGH)
15302 x = XEXP (x, 0);
15303
15304 /* Accept polynomial constants that can be calculated by using the
15305 destination of a move as the sole temporary. Constants that
15306 require a second temporary cannot be rematerialized (they can't be
15307 forced to memory and also aren't legitimate constants). */
15308 poly_int64 offset;
15309 if (poly_int_rtx_p (x, &offset))
15310 return aarch64_offset_temporaries (false, offset) <= 1;
15311
15312 /* If an offset is being added to something else, we need to allow the
15313 base to be moved into the destination register, meaning that there
15314 are no free temporaries for the offset. */
15315 x = strip_offset (x, &offset);
15316 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
15317 return false;
15318
15319 /* Do not allow const (plus (anchor_symbol, const_int)). */
15320 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
15321 return false;
15322
15323 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
15324 so spilling them is better than rematerialization. */
15325 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
15326 return true;
15327
15328 /* Label references are always constant. */
15329 if (GET_CODE (x) == LABEL_REF)
15330 return true;
15331
15332 return false;
15333 }
15334
15335 rtx
15336 aarch64_load_tp (rtx target)
15337 {
15338 if (!target
15339 || GET_MODE (target) != Pmode
15340 || !register_operand (target, Pmode))
15341 target = gen_reg_rtx (Pmode);
15342
15343 /* Can return in any reg. */
15344 emit_insn (gen_aarch64_load_tp_hard (target));
15345 return target;
15346 }
15347
15348 /* On AAPCS systems, this is the "struct __va_list". */
15349 static GTY(()) tree va_list_type;
15350
15351 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
15352 Return the type to use as __builtin_va_list.
15353
15354 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
15355
15356 struct __va_list
15357 {
15358 void *__stack;
15359 void *__gr_top;
15360 void *__vr_top;
15361 int __gr_offs;
15362 int __vr_offs;
15363 }; */
15364
15365 static tree
15366 aarch64_build_builtin_va_list (void)
15367 {
15368 tree va_list_name;
15369 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15370
15371 /* Create the type. */
15372 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
15373 /* Give it the required name. */
15374 va_list_name = build_decl (BUILTINS_LOCATION,
15375 TYPE_DECL,
15376 get_identifier ("__va_list"),
15377 va_list_type);
15378 DECL_ARTIFICIAL (va_list_name) = 1;
15379 TYPE_NAME (va_list_type) = va_list_name;
15380 TYPE_STUB_DECL (va_list_type) = va_list_name;
15381
15382 /* Create the fields. */
15383 f_stack = build_decl (BUILTINS_LOCATION,
15384 FIELD_DECL, get_identifier ("__stack"),
15385 ptr_type_node);
15386 f_grtop = build_decl (BUILTINS_LOCATION,
15387 FIELD_DECL, get_identifier ("__gr_top"),
15388 ptr_type_node);
15389 f_vrtop = build_decl (BUILTINS_LOCATION,
15390 FIELD_DECL, get_identifier ("__vr_top"),
15391 ptr_type_node);
15392 f_groff = build_decl (BUILTINS_LOCATION,
15393 FIELD_DECL, get_identifier ("__gr_offs"),
15394 integer_type_node);
15395 f_vroff = build_decl (BUILTINS_LOCATION,
15396 FIELD_DECL, get_identifier ("__vr_offs"),
15397 integer_type_node);
15398
15399 /* Tell tree-stdarg pass about our internal offset fields.
15400 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
15401 purposes, to identify whether the code is updating the va_list internal
15402 offset fields in an irregular way. */
15403 va_list_gpr_counter_field = f_groff;
15404 va_list_fpr_counter_field = f_vroff;
15405
15406 DECL_ARTIFICIAL (f_stack) = 1;
15407 DECL_ARTIFICIAL (f_grtop) = 1;
15408 DECL_ARTIFICIAL (f_vrtop) = 1;
15409 DECL_ARTIFICIAL (f_groff) = 1;
15410 DECL_ARTIFICIAL (f_vroff) = 1;
15411
15412 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
15413 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
15414 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
15415 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
15416 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
15417
15418 TYPE_FIELDS (va_list_type) = f_stack;
15419 DECL_CHAIN (f_stack) = f_grtop;
15420 DECL_CHAIN (f_grtop) = f_vrtop;
15421 DECL_CHAIN (f_vrtop) = f_groff;
15422 DECL_CHAIN (f_groff) = f_vroff;
15423
15424 /* Compute its layout. */
15425 layout_type (va_list_type);
15426
15427 return va_list_type;
15428 }
15429
15430 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
15431 static void
15432 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
15433 {
15434 const CUMULATIVE_ARGS *cum;
15435 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15436 tree stack, grtop, vrtop, groff, vroff;
15437 tree t;
15438 int gr_save_area_size = cfun->va_list_gpr_size;
15439 int vr_save_area_size = cfun->va_list_fpr_size;
15440 int vr_offset;
15441
15442 cum = &crtl->args.info;
15443 if (cfun->va_list_gpr_size)
15444 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
15445 cfun->va_list_gpr_size);
15446 if (cfun->va_list_fpr_size)
15447 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
15448 * UNITS_PER_VREG, cfun->va_list_fpr_size);
15449
15450 if (!TARGET_FLOAT)
15451 {
15452 gcc_assert (cum->aapcs_nvrn == 0);
15453 vr_save_area_size = 0;
15454 }
15455
15456 f_stack = TYPE_FIELDS (va_list_type_node);
15457 f_grtop = DECL_CHAIN (f_stack);
15458 f_vrtop = DECL_CHAIN (f_grtop);
15459 f_groff = DECL_CHAIN (f_vrtop);
15460 f_vroff = DECL_CHAIN (f_groff);
15461
15462 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
15463 NULL_TREE);
15464 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
15465 NULL_TREE);
15466 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
15467 NULL_TREE);
15468 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
15469 NULL_TREE);
15470 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
15471 NULL_TREE);
15472
15473 /* Emit code to initialize STACK, which points to the next varargs stack
15474 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
15475 by named arguments. STACK is 8-byte aligned. */
15476 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
15477 if (cum->aapcs_stack_size > 0)
15478 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
15479 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
15480 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15481
15482 /* Emit code to initialize GRTOP, the top of the GR save area.
15483 virtual_incoming_args_rtx should have been 16 byte aligned. */
15484 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
15485 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
15486 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15487
15488 /* Emit code to initialize VRTOP, the top of the VR save area.
15489 This address is gr_save_area_size bytes below GRTOP, rounded
15490 down to the next 16-byte boundary. */
15491 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
15492 vr_offset = ROUND_UP (gr_save_area_size,
15493 STACK_BOUNDARY / BITS_PER_UNIT);
15494
15495 if (vr_offset)
15496 t = fold_build_pointer_plus_hwi (t, -vr_offset);
15497 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
15498 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15499
15500 /* Emit code to initialize GROFF, the offset from GRTOP of the
15501 next GPR argument. */
15502 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
15503 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
15504 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15505
15506 /* Likewise emit code to initialize VROFF, the offset from VRTOP
15507 of the next VR argument. */
15508 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
15509 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
15510 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15511 }
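
/* As a worked example (assuming the usual eight X0-X7 and eight V0-V7
   argument registers and a Q-sized UNITS_PER_VREG of 16): for
   "void f (int n, ...)" with the full save areas live, one GPR is taken
   by the named argument, so gr_save_area_size is 7 * 8 == 56 and
   vr_save_area_size is 8 * 16 == 128; __gr_offs is therefore initialised
   to -56, __vr_offs to -128, __gr_top to the incoming-argument address
   and __vr_top to that address minus ROUND_UP (56, 16) == 64. */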
15512
15513 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
15514
15515 static tree
15516 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
15517 gimple_seq *post_p ATTRIBUTE_UNUSED)
15518 {
15519 tree addr;
15520 bool indirect_p;
15521 bool is_ha; /* is HFA or HVA. */
15522 bool dw_align; /* double-word align. */
15523 machine_mode ag_mode = VOIDmode;
15524 int nregs;
15525 machine_mode mode;
15526
15527 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15528 tree stack, f_top, f_off, off, arg, roundup, on_stack;
15529 HOST_WIDE_INT size, rsize, adjust, align;
15530 tree t, u, cond1, cond2;
15531
15532 indirect_p = pass_va_arg_by_reference (type);
15533 if (indirect_p)
15534 type = build_pointer_type (type);
15535
15536 mode = TYPE_MODE (type);
15537
15538 f_stack = TYPE_FIELDS (va_list_type_node);
15539 f_grtop = DECL_CHAIN (f_stack);
15540 f_vrtop = DECL_CHAIN (f_grtop);
15541 f_groff = DECL_CHAIN (f_vrtop);
15542 f_vroff = DECL_CHAIN (f_groff);
15543
15544 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
15545 f_stack, NULL_TREE);
15546 size = int_size_in_bytes (type);
15547
15548 bool abi_break;
15549 align
15550 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
15551
15552 dw_align = false;
15553 adjust = 0;
15554 if (aarch64_vfp_is_call_or_return_candidate (mode,
15555 type,
15556 &ag_mode,
15557 &nregs,
15558 &is_ha))
15559 {
15560 /* No frontends can create types with variable-sized modes, so we
15561 shouldn't be asked to pass or return them. */
15562 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
15563
15564 /* TYPE passed in fp/simd registers. */
15565 if (!TARGET_FLOAT)
15566 aarch64_err_no_fpadvsimd (mode);
15567
15568 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
15569 unshare_expr (valist), f_vrtop, NULL_TREE);
15570 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
15571 unshare_expr (valist), f_vroff, NULL_TREE);
15572
15573 rsize = nregs * UNITS_PER_VREG;
15574
15575 if (is_ha)
15576 {
15577 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
15578 adjust = UNITS_PER_VREG - ag_size;
15579 }
15580 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15581 && size < UNITS_PER_VREG)
15582 {
15583 adjust = UNITS_PER_VREG - size;
15584 }
15585 }
15586 else
15587 {
15588 /* TYPE passed in general registers. */
15589 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
15590 unshare_expr (valist), f_grtop, NULL_TREE);
15591 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
15592 unshare_expr (valist), f_groff, NULL_TREE);
15593 rsize = ROUND_UP (size, UNITS_PER_WORD);
15594 nregs = rsize / UNITS_PER_WORD;
15595
15596 if (align > 8)
15597 {
15598 if (abi_break && warn_psabi)
15599 inform (input_location, "parameter passing for argument of type "
15600 "%qT changed in GCC 9.1", type);
15601 dw_align = true;
15602 }
15603
15604 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15605 && size < UNITS_PER_WORD)
15606 {
15607 adjust = UNITS_PER_WORD - size;
15608 }
15609 }
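
/* For example (illustrative types only): a homogeneous aggregate of two
   floats is an FP/SIMD candidate, so NREGS == 2 and RSIZE == 2 *
   UNITS_PER_VREG with the __vr fields used below, while a
   "struct { long a, b, c; }" takes the general-register path with RSIZE
   rounded up to 24 and NREGS == 3; DW_ALIGN is only set on that path,
   for arguments such as a 16-byte-aligned __int128 whose alignment
   exceeds 8 bytes. */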
15610
15611 /* Get a local temporary for the field value. */
15612 off = get_initialized_tmp_var (f_off, pre_p, NULL);
15613
15614 /* Emit code to branch if off >= 0. */
15615 t = build2 (GE_EXPR, boolean_type_node, off,
15616 build_int_cst (TREE_TYPE (off), 0));
15617 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
15618
15619 if (dw_align)
15620 {
15621 /* Emit: offs = (offs + 15) & -16. */
15622 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15623 build_int_cst (TREE_TYPE (off), 15));
15624 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
15625 build_int_cst (TREE_TYPE (off), -16));
15626 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
15627 }
15628 else
15629 roundup = NULL;
15630
15631 /* Update ap.__[g|v]r_offs */
15632 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15633 build_int_cst (TREE_TYPE (off), rsize));
15634 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
15635
15636 /* String up. */
15637 if (roundup)
15638 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15639
15640 /* [cond2] if (ap.__[g|v]r_offs > 0) */
15641 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
15642 build_int_cst (TREE_TYPE (f_off), 0));
15643 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
15644
15645 /* String up: make sure the assignment happens before the use. */
15646 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
15647 COND_EXPR_ELSE (cond1) = t;
15648
15649 /* Prepare the trees handling the argument that is passed on the stack;
15650 the top-level node will be stored in ON_STACK. */
15651 arg = get_initialized_tmp_var (stack, pre_p, NULL);
15652 if (align > 8)
15653 {
15654 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
15655 t = fold_build_pointer_plus_hwi (arg, 15);
15656 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15657 build_int_cst (TREE_TYPE (t), -16));
15658 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
15659 }
15660 else
15661 roundup = NULL;
15662 /* Advance ap.__stack */
15663 t = fold_build_pointer_plus_hwi (arg, size + 7);
15664 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15665 build_int_cst (TREE_TYPE (t), -8));
15666 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
15667 /* String up roundup and advance. */
15668 if (roundup)
15669 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15670 /* String up with arg */
15671 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
15672 /* Big-endianness related address adjustment. */
15673 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15674 && size < UNITS_PER_WORD)
15675 {
15676 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
15677 size_int (UNITS_PER_WORD - size));
15678 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
15679 }
15680
15681 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
15682 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
15683
15684 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
15685 t = off;
15686 if (adjust)
15687 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
15688 build_int_cst (TREE_TYPE (off), adjust));
15689
15690 t = fold_convert (sizetype, t);
15691 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
15692
15693 if (is_ha)
15694 {
15695 /* type ha; // treat as "struct {ftype field[n];}"
15696 ... [computing offs]
15697 for (i = 0; i < nregs; ++i, offs += 16)
15698 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
15699 return ha; */
15700 int i;
15701 tree tmp_ha, field_t, field_ptr_t;
15702
15703 /* Declare a local variable. */
15704 tmp_ha = create_tmp_var_raw (type, "ha");
15705 gimple_add_tmp_var (tmp_ha);
15706
15707 /* Establish the base type. */
15708 switch (ag_mode)
15709 {
15710 case E_SFmode:
15711 field_t = float_type_node;
15712 field_ptr_t = float_ptr_type_node;
15713 break;
15714 case E_DFmode:
15715 field_t = double_type_node;
15716 field_ptr_t = double_ptr_type_node;
15717 break;
15718 case E_TFmode:
15719 field_t = long_double_type_node;
15720 field_ptr_t = long_double_ptr_type_node;
15721 break;
15722 case E_HFmode:
15723 field_t = aarch64_fp16_type_node;
15724 field_ptr_t = aarch64_fp16_ptr_type_node;
15725 break;
15726 case E_BFmode:
15727 field_t = aarch64_bf16_type_node;
15728 field_ptr_t = aarch64_bf16_ptr_type_node;
15729 break;
15730 case E_V2SImode:
15731 case E_V4SImode:
15732 {
15733 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
15734 field_t = build_vector_type_for_mode (innertype, ag_mode);
15735 field_ptr_t = build_pointer_type (field_t);
15736 }
15737 break;
15738 default:
15739 gcc_assert (0);
15740 }
15741
15742 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
15743 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
15744 addr = t;
15745 t = fold_convert (field_ptr_t, addr);
15746 t = build2 (MODIFY_EXPR, field_t,
15747 build1 (INDIRECT_REF, field_t, tmp_ha),
15748 build1 (INDIRECT_REF, field_t, t));
15749
15750 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
15751 for (i = 1; i < nregs; ++i)
15752 {
15753 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
15754 u = fold_convert (field_ptr_t, addr);
15755 u = build2 (MODIFY_EXPR, field_t,
15756 build2 (MEM_REF, field_t, tmp_ha,
15757 build_int_cst (field_ptr_t,
15758 (i *
15759 int_size_in_bytes (field_t)))),
15760 build1 (INDIRECT_REF, field_t, u));
15761 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
15762 }
15763
15764 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
15765 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
15766 }
15767
15768 COND_EXPR_ELSE (cond2) = t;
15769 addr = fold_convert (build_pointer_type (type), cond1);
15770 addr = build_va_arg_indirect_ref (addr);
15771
15772 if (indirect_p)
15773 addr = build_va_arg_indirect_ref (addr);
15774
15775 return addr;
15776 }
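/* Illustrative sketch (editorial addition, not part of the original source):
   a loose picture of the trees built above for a hypothetical homogeneous
   aggregate read via va_arg:

     struct pt { double x, y; };
     struct pt p = va_arg (ap, struct pt);

   Conceptually this expands to something like:

     if (ap.__vr_offs >= 0)
       p = *(struct pt *) ap.__stack;           // already spilled to stack
     else
       {
         off = ap.__vr_offs;
         ap.__vr_offs = off + 2 * 16;           // two V registers consumed
         if (ap.__vr_offs > 0)
           p = *(struct pt *) ap.__stack;       // ran out of V registers
         else
           {
             ha.field[0] = *(double *) (ap.__vr_top + off);
             ha.field[1] = *(double *) (ap.__vr_top + off + 16);
             p = ha;
           }
       }

   plus the __stack advance and the big-endian/alignment adjustments coded
   above.  */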
15777
15778 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
15779
15780 static void
15781 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
15782 const function_arg_info &arg,
15783 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
15784 {
15785 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
15786 CUMULATIVE_ARGS local_cum;
15787 int gr_saved = cfun->va_list_gpr_size;
15788 int vr_saved = cfun->va_list_fpr_size;
15789
15790 /* The caller has advanced CUM up to, but not beyond, the last named
15791 argument. Advance a local copy of CUM past the last "real" named
15792 argument, to find out how many registers are left over. */
15793 local_cum = *cum;
15794 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg);
15795
15796 /* Find out how many registers we need to save,
15797 honoring the tree-stdarg analysis results. */
15798 if (cfun->va_list_gpr_size)
15799 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
15800 cfun->va_list_gpr_size / UNITS_PER_WORD);
15801 if (cfun->va_list_fpr_size)
15802 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
15803 cfun->va_list_fpr_size / UNITS_PER_VREG);
15804
15805 if (!TARGET_FLOAT)
15806 {
15807 gcc_assert (local_cum.aapcs_nvrn == 0);
15808 vr_saved = 0;
15809 }
15810
15811 if (!no_rtl)
15812 {
15813 if (gr_saved > 0)
15814 {
15815 rtx ptr, mem;
15816
15817 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
15818 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
15819 - gr_saved * UNITS_PER_WORD);
15820 mem = gen_frame_mem (BLKmode, ptr);
15821 set_mem_alias_set (mem, get_varargs_alias_set ());
15822
15823 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
15824 mem, gr_saved);
15825 }
15826 if (vr_saved > 0)
15827 {
15828 /* We can't use move_block_from_reg, because it will use
15829 the wrong mode, storing D regs only. */
15830 machine_mode mode = TImode;
15831 int off, i, vr_start;
15832
15833 /* Set OFF to the offset from virtual_incoming_args_rtx of
15834 the first vector register. The VR save area lies below
15835 the GR one, and is aligned to 16 bytes. */
15836 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
15837 STACK_BOUNDARY / BITS_PER_UNIT);
15838 off -= vr_saved * UNITS_PER_VREG;
15839
15840 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
15841 for (i = 0; i < vr_saved; ++i)
15842 {
15843 rtx ptr, mem;
15844
15845 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
15846 mem = gen_frame_mem (mode, ptr);
15847 set_mem_alias_set (mem, get_varargs_alias_set ());
15848 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
15849 off += UNITS_PER_VREG;
15850 }
15851 }
15852 }
15853
15854 /* We don't save the size into *PRETEND_SIZE because we want to avoid
15855 any complication of having crtl->args.pretend_args_size changed. */
15856 cfun->machine->frame.saved_varargs_size
15857 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
15858 STACK_BOUNDARY / BITS_PER_UNIT)
15859 + vr_saved * UNITS_PER_VREG);
15860 }
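/* Illustrative worked example (editorial addition, not part of the original
   source), assuming the usual NUM_ARG_REGS == 8, NUM_FP_ARG_REGS == 8,
   UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16.  For

     int f (int a, ...);

   the named argument consumes x0 only, so gr_saved == 7 and vr_saved == 8
   (unless the stdarg pass has shrunk cfun->va_list_{gpr,fpr}_size), giving

     saved_varargs_size = ROUND_UP (7 * 8, 16) + 8 * 16 = 64 + 128 = 192

   bytes: x1-x7 are dumped just below the incoming-argument pointer, with
   q0-q7 in a 16-byte-aligned block below them.  */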
15861
15862 static void
15863 aarch64_conditional_register_usage (void)
15864 {
15865 int i;
15866 if (!TARGET_FLOAT)
15867 {
15868 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
15869 {
15870 fixed_regs[i] = 1;
15871 call_used_regs[i] = 1;
15872 }
15873 }
15874 if (!TARGET_SVE)
15875 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
15876 {
15877 fixed_regs[i] = 1;
15878 call_used_regs[i] = 1;
15879 }
15880
15881 /* Only allow the FFR and FFRT to be accessed via special patterns. */
15882 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
15883 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
15884
15885 /* When tracking speculation, we need a couple of call-clobbered registers
15886 to track the speculation state. It would be nice to just use
15887 IP0 and IP1, but currently there are numerous places that just
15888 assume these registers are free for other uses (e.g. pointer
15889 authentication). */
15890 if (aarch64_track_speculation)
15891 {
15892 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
15893 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
15894 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15895 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15896 }
15897 }
15898
15899 /* Walk down the type tree of TYPE counting consecutive base elements.
15900 If *MODEP is VOIDmode, then set it to the first valid floating point
15901 type. If a non-floating point type is found, or if a floating point
15902 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
15903 otherwise return the count in the sub-tree. */
15904 static int
15905 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
15906 {
15907 machine_mode mode;
15908 HOST_WIDE_INT size;
15909
15910 /* SVE types (and types containing SVE types) must be handled
15911 before calling this function. */
15912 gcc_assert (!aarch64_sve::builtin_type_p (type));
15913
15914 switch (TREE_CODE (type))
15915 {
15916 case REAL_TYPE:
15917 mode = TYPE_MODE (type);
15918 if (mode != DFmode && mode != SFmode
15919 && mode != TFmode && mode != HFmode)
15920 return -1;
15921
15922 if (*modep == VOIDmode)
15923 *modep = mode;
15924
15925 if (*modep == mode)
15926 return 1;
15927
15928 break;
15929
15930 case COMPLEX_TYPE:
15931 mode = TYPE_MODE (TREE_TYPE (type));
15932 if (mode != DFmode && mode != SFmode
15933 && mode != TFmode && mode != HFmode)
15934 return -1;
15935
15936 if (*modep == VOIDmode)
15937 *modep = mode;
15938
15939 if (*modep == mode)
15940 return 2;
15941
15942 break;
15943
15944 case VECTOR_TYPE:
15945 /* Use V2SImode and V4SImode as representatives of all 64-bit
15946 and 128-bit vector types. */
15947 size = int_size_in_bytes (type);
15948 switch (size)
15949 {
15950 case 8:
15951 mode = V2SImode;
15952 break;
15953 case 16:
15954 mode = V4SImode;
15955 break;
15956 default:
15957 return -1;
15958 }
15959
15960 if (*modep == VOIDmode)
15961 *modep = mode;
15962
15963 /* Vector modes are considered to be opaque: two vectors are
15964 equivalent for the purposes of being homogeneous aggregates
15965 if they are the same size. */
15966 if (*modep == mode)
15967 return 1;
15968
15969 break;
15970
15971 case ARRAY_TYPE:
15972 {
15973 int count;
15974 tree index = TYPE_DOMAIN (type);
15975
15976 /* Can't handle incomplete types nor sizes that are not
15977 fixed. */
15978 if (!COMPLETE_TYPE_P (type)
15979 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15980 return -1;
15981
15982 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
15983 if (count == -1
15984 || !index
15985 || !TYPE_MAX_VALUE (index)
15986 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
15987 || !TYPE_MIN_VALUE (index)
15988 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
15989 || count < 0)
15990 return -1;
15991
15992 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
15993 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
15994
15995 /* There must be no padding. */
15996 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15997 count * GET_MODE_BITSIZE (*modep)))
15998 return -1;
15999
16000 return count;
16001 }
16002
16003 case RECORD_TYPE:
16004 {
16005 int count = 0;
16006 int sub_count;
16007 tree field;
16008
16009 /* Can't handle incomplete types nor sizes that are not
16010 fixed. */
16011 if (!COMPLETE_TYPE_P (type)
16012 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16013 return -1;
16014
16015 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16016 {
16017 if (TREE_CODE (field) != FIELD_DECL)
16018 continue;
16019
16020 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
16021 if (sub_count < 0)
16022 return -1;
16023 count += sub_count;
16024 }
16025
16026 /* There must be no padding. */
16027 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16028 count * GET_MODE_BITSIZE (*modep)))
16029 return -1;
16030
16031 return count;
16032 }
16033
16034 case UNION_TYPE:
16035 case QUAL_UNION_TYPE:
16036 {
16037 /* These aren't very interesting except in a degenerate case. */
16038 int count = 0;
16039 int sub_count;
16040 tree field;
16041
16042 /* Can't handle incomplete types nor sizes that are not
16043 fixed. */
16044 if (!COMPLETE_TYPE_P (type)
16045 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16046 return -1;
16047
16048 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16049 {
16050 if (TREE_CODE (field) != FIELD_DECL)
16051 continue;
16052
16053 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
16054 if (sub_count < 0)
16055 return -1;
16056 count = count > sub_count ? count : sub_count;
16057 }
16058
16059 /* There must be no padding. */
16060 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16061 count * GET_MODE_BITSIZE (*modep)))
16062 return -1;
16063
16064 return count;
16065 }
16066
16067 default:
16068 break;
16069 }
16070
16071 return -1;
16072 }
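/* Illustrative example (editorial addition, not part of the original
   source): for a hypothetical user type

     struct rgb { float r, g, b; };

   the walk above sets *MODEP to SFmode at the first field and returns 3,
   whereas

     struct mix { float r; double d; };

   returns -1 because the second field's mode no longer matches *MODEP.
   A _Complex float member counts as 2, and an array of four doubles as 4.  */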
16073
16074 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
16075 type as described in AAPCS64 \S 4.1.2.
16076
16077 See the comment above aarch64_composite_type_p for the notes on MODE. */
16078
16079 static bool
16080 aarch64_short_vector_p (const_tree type,
16081 machine_mode mode)
16082 {
16083 poly_int64 size = -1;
16084
16085 if (type && aarch64_sve::builtin_type_p (type))
16086 return false;
16087
16088 if (type && TREE_CODE (type) == VECTOR_TYPE)
16089 size = int_size_in_bytes (type);
16090 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
16091 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16092 size = GET_MODE_SIZE (mode);
16093
16094 return known_eq (size, 8) || known_eq (size, 16);
16095 }
16096
16097 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
16098 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
16099 array types. The C99 floating-point complex types are also considered
16100 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
16101 types, which are GCC extensions and out of the scope of AAPCS64, are
16102 treated as composite types here as well.
16103
16104 Note that MODE itself is not sufficient in determining whether a type
16105 is such a composite type or not. This is because
16106 stor-layout.c:compute_record_mode may have already changed the MODE
16107 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
16108 structure with only one field may have its MODE set to the mode of the
16109 field. Also an integer mode whose size matches the size of the
16110 RECORD_TYPE type may be used to substitute the original mode
16111 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
16112 solely relied on. */
16113
16114 static bool
16115 aarch64_composite_type_p (const_tree type,
16116 machine_mode mode)
16117 {
16118 if (aarch64_short_vector_p (type, mode))
16119 return false;
16120
16121 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
16122 return true;
16123
16124 if (mode == BLKmode
16125 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
16126 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
16127 return true;
16128
16129 return false;
16130 }
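/* Illustrative examples (editorial addition, not part of the original
   source), assuming the standard GNU vector_size extension:

     typedef int v4si __attribute__ ((vector_size (16)));

     struct s { int a; };        // composite (aggregate)
     _Complex double cd;         // composite (AAPCS64 7.1.1)
     v4si v;                     // not composite: 128-bit short vector

   The short-vector check runs first, which is why v4si is excluded even
   though it occupies 16 bytes.  */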
16131
16132 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
16133 shall be passed or returned in simd/fp register(s) (providing these
16134 parameter passing registers are available).
16135
16136 Upon successful return, *COUNT returns the number of needed registers,
16137 *BASE_MODE returns the mode of the individual register and when IS_HAF
16138 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
16139 floating-point aggregate or a homogeneous short-vector aggregate. */
16140
16141 static bool
16142 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
16143 const_tree type,
16144 machine_mode *base_mode,
16145 int *count,
16146 bool *is_ha)
16147 {
16148 if (is_ha != NULL) *is_ha = false;
16149
16150 if (type && aarch64_sve::builtin_type_p (type))
16151 return false;
16152
16153 machine_mode new_mode = VOIDmode;
16154 bool composite_p = aarch64_composite_type_p (type, mode);
16155
16156 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
16157 || aarch64_short_vector_p (type, mode))
16158 {
16159 *count = 1;
16160 new_mode = mode;
16161 }
16162 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
16163 {
16164 if (is_ha != NULL) *is_ha = true;
16165 *count = 2;
16166 new_mode = GET_MODE_INNER (mode);
16167 }
16168 else if (type && composite_p)
16169 {
16170 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
16171
16172 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
16173 {
16174 if (is_ha != NULL) *is_ha = true;
16175 *count = ag_count;
16176 }
16177 else
16178 return false;
16179 }
16180 else
16181 return false;
16182
16183 *base_mode = new_mode;
16184 return true;
16185 }
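/* Illustrative classifications (editorial addition, not part of the
   original source); the aggregate types are hypothetical:

     double d;                          // count 1, base mode DFmode
     _Complex float cf;                 // count 2, base SFmode, *IS_HA = true
     struct q { double a, b, c, d; };   // count 4, base DFmode, *IS_HA = true
     struct r { float f[5]; };          // rejected: exceeds HA_MAX_NUM_FLDS
     struct t { float f; int i; };      // rejected: mixed element types

   Rejected arguments fall back to the general-register/stack rules.  */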
16186
16187 /* Implement TARGET_STRUCT_VALUE_RTX. */
16188
16189 static rtx
16190 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
16191 int incoming ATTRIBUTE_UNUSED)
16192 {
16193 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
16194 }
16195
16196 /* Implements target hook vector_mode_supported_p. */
16197 static bool
16198 aarch64_vector_mode_supported_p (machine_mode mode)
16199 {
16200 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16201 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
16202 }
16203
16204 /* Return the full-width SVE vector mode for element mode MODE, if one
16205 exists. */
16206 opt_machine_mode
16207 aarch64_full_sve_mode (scalar_mode mode)
16208 {
16209 switch (mode)
16210 {
16211 case E_DFmode:
16212 return VNx2DFmode;
16213 case E_SFmode:
16214 return VNx4SFmode;
16215 case E_HFmode:
16216 return VNx8HFmode;
16217 case E_BFmode:
16218 return VNx8BFmode;
16219 case E_DImode:
16220 return VNx2DImode;
16221 case E_SImode:
16222 return VNx4SImode;
16223 case E_HImode:
16224 return VNx8HImode;
16225 case E_QImode:
16226 return VNx16QImode;
16227 default:
16228 return opt_machine_mode ();
16229 }
16230 }
16231
16232 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
16233 if it exists. */
16234 opt_machine_mode
16235 aarch64_vq_mode (scalar_mode mode)
16236 {
16237 switch (mode)
16238 {
16239 case E_DFmode:
16240 return V2DFmode;
16241 case E_SFmode:
16242 return V4SFmode;
16243 case E_HFmode:
16244 return V8HFmode;
16245 case E_BFmode:
16246 return V8BFmode;
16247 case E_SImode:
16248 return V4SImode;
16249 case E_HImode:
16250 return V8HImode;
16251 case E_QImode:
16252 return V16QImode;
16253 case E_DImode:
16254 return V2DImode;
16255 default:
16256 return opt_machine_mode ();
16257 }
16258 }
16259
16260 /* Return appropriate SIMD container
16261 for MODE within a vector of WIDTH bits. */
16262 static machine_mode
16263 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
16264 {
16265 if (TARGET_SVE
16266 && maybe_ne (width, 128)
16267 && known_eq (width, BITS_PER_SVE_VECTOR))
16268 return aarch64_full_sve_mode (mode).else_mode (word_mode);
16269
16270 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
16271 if (TARGET_SIMD)
16272 {
16273 if (known_eq (width, 128))
16274 return aarch64_vq_mode (mode).else_mode (word_mode);
16275 else
16276 switch (mode)
16277 {
16278 case E_SFmode:
16279 return V2SFmode;
16280 case E_HFmode:
16281 return V4HFmode;
16282 case E_BFmode:
16283 return V4BFmode;
16284 case E_SImode:
16285 return V2SImode;
16286 case E_HImode:
16287 return V4HImode;
16288 case E_QImode:
16289 return V8QImode;
16290 default:
16291 break;
16292 }
16293 }
16294 return word_mode;
16295 }
16296
16297 /* Return the preferred SIMD mode for MODE: a full SVE vector when SVE is available, otherwise the 128-bit Advanced SIMD container. */
16298 static machine_mode
16299 aarch64_preferred_simd_mode (scalar_mode mode)
16300 {
16301 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
16302 return aarch64_simd_container_mode (mode, bits);
16303 }
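/* Illustrative mapping (editorial addition, not part of the original
   source): with SVE enabled the vectorizer is offered the scalable
   container, otherwise the 128-bit Advanced SIMD one, e.g.

     SFmode  -> VNx4SFmode with SVE, else V4SFmode
     QImode  -> VNx16QImode with SVE, else V16QImode

   except that an SVE vector length fixed at exactly 128 bits falls back to
   the Advanced SIMD mode; element modes with no matching container fall
   back to word_mode.  */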
16304
16305 /* Return a list of possible vector sizes for the vectorizer
16306 to iterate over. */
16307 static unsigned int
16308 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
16309 {
16310 static const machine_mode sve_modes[] = {
16311 /* Try using full vectors for all element types. */
16312 VNx16QImode,
16313
16314 /* Try using 16-bit containers for 8-bit elements and full vectors
16315 for wider elements. */
16316 VNx8QImode,
16317
16318 /* Try using 32-bit containers for 8-bit and 16-bit elements and
16319 full vectors for wider elements. */
16320 VNx4QImode,
16321
16322 /* Try using 64-bit containers for all element types. */
16323 VNx2QImode
16324 };
16325
16326 static const machine_mode advsimd_modes[] = {
16327 /* Try using 128-bit vectors for all element types. */
16328 V16QImode,
16329
16330 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
16331 for wider elements. */
16332 V8QImode,
16333
16334 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
16335 for wider elements.
16336
16337 TODO: We could support a limited form of V4QImode too, so that
16338 we use 32-bit vectors for 8-bit elements. */
16339 V4HImode,
16340
16341 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
16342 for 64-bit elements.
16343
16344 TODO: We could similarly support limited forms of V2QImode and V2HImode
16345 for this case. */
16346 V2SImode
16347 };
16348
16349 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
16350 This is because:
16351
16352 - If we can't use N-byte Advanced SIMD vectors then the placement
16353 doesn't matter; we'll just continue as though the Advanced SIMD
16354 entry didn't exist.
16355
16356 - If an SVE main loop with N bytes ends up being cheaper than an
16357 Advanced SIMD main loop with N bytes then by default we'll replace
16358 the Advanced SIMD version with the SVE one.
16359
16360 - If an Advanced SIMD main loop with N bytes ends up being cheaper
16361 than an SVE main loop with N bytes then by default we'll try to
16362 use the SVE loop to vectorize the epilogue instead. */
16363 unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
16364 unsigned int advsimd_i = 0;
16365 while (advsimd_i < ARRAY_SIZE (advsimd_modes))
16366 {
16367 if (sve_i < ARRAY_SIZE (sve_modes)
16368 && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
16369 GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
16370 modes->safe_push (sve_modes[sve_i++]);
16371 else
16372 modes->safe_push (advsimd_modes[advsimd_i++]);
16373 }
16374 while (sve_i < ARRAY_SIZE (sve_modes))
16375 modes->safe_push (sve_modes[sve_i++]);
16376
16377 unsigned int flags = 0;
16378 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
16379 can compare SVE against Advanced SIMD and so that we can compare
16380 multiple SVE vectorization approaches against each other. There's
16381 not really any point doing this for Advanced SIMD only, since the
16382 first mode that works should always be the best. */
16383 if (TARGET_SVE && aarch64_sve_compare_costs)
16384 flags |= VECT_COMPARE_COSTS;
16385 return flags;
16386 }
16387
16388 /* Implement TARGET_MANGLE_TYPE. */
16389
16390 static const char *
16391 aarch64_mangle_type (const_tree type)
16392 {
16393 /* The AArch64 ABI documents say that "__va_list" has to be
16394 mangled as if it is in the "std" namespace. */
16395 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
16396 return "St9__va_list";
16397
16398 /* Half-precision floating point types. */
16399 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
16400 {
16401 if (TYPE_MODE (type) == BFmode)
16402 return "u6__bf16";
16403 else
16404 return "Dh";
16405 }
16406
16407 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
16408 builtin types. */
16409 if (TYPE_NAME (type) != NULL)
16410 {
16411 const char *res;
16412 if ((res = aarch64_general_mangle_builtin_type (type))
16413 || (res = aarch64_sve::mangle_builtin_type (type)))
16414 return res;
16415 }
16416
16417 /* Use the default mangling. */
16418 return NULL;
16419 }
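/* Illustrative manglings (editorial addition, not part of the original
   source), following from the strings returned above plus the standard
   Itanium scheme:

     void f (__builtin_va_list);   // mangled _Z1fSt9__va_list
     void g (__fp16);              // mangled _Z1gDh
     void h (__bf16);              // mangled _Z1hu6__bf16

   "St9__va_list" is the encoding the ABI reserves for std::__va_list.  */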
16420
16421 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
16422
16423 static bool
16424 aarch64_verify_type_context (location_t loc, type_context_kind context,
16425 const_tree type, bool silent_p)
16426 {
16427 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
16428 }
16429
16430 /* Find the first rtx_insn before insn that will generate an assembly
16431 instruction. */
16432
16433 static rtx_insn *
16434 aarch64_prev_real_insn (rtx_insn *insn)
16435 {
16436 if (!insn)
16437 return NULL;
16438
16439 do
16440 {
16441 insn = prev_real_insn (insn);
16442 }
16443 while (insn && recog_memoized (insn) < 0);
16444
16445 return insn;
16446 }
16447
16448 static bool
16449 is_madd_op (enum attr_type t1)
16450 {
16451 unsigned int i;
16452 /* A number of these may be AArch32 only. */
16453 enum attr_type mlatypes[] = {
16454 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
16455 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
16456 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
16457 };
16458
16459 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
16460 {
16461 if (t1 == mlatypes[i])
16462 return true;
16463 }
16464
16465 return false;
16466 }
16467
16468 /* Check if there is a register dependency between a load and the insn
16469 for which we hold recog_data. */
16470
16471 static bool
16472 dep_between_memop_and_curr (rtx memop)
16473 {
16474 rtx load_reg;
16475 int opno;
16476
16477 gcc_assert (GET_CODE (memop) == SET);
16478
16479 if (!REG_P (SET_DEST (memop)))
16480 return false;
16481
16482 load_reg = SET_DEST (memop);
16483 for (opno = 1; opno < recog_data.n_operands; opno++)
16484 {
16485 rtx operand = recog_data.operand[opno];
16486 if (REG_P (operand)
16487 && reg_overlap_mentioned_p (load_reg, operand))
16488 return true;
16489
16490 }
16491 return false;
16492 }
16493
16494
16495 /* When working around the Cortex-A53 erratum 835769,
16496 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
16497 instruction and has a preceding memory instruction such that a NOP
16498 should be inserted between them. */
16499
16500 bool
16501 aarch64_madd_needs_nop (rtx_insn* insn)
16502 {
16503 enum attr_type attr_type;
16504 rtx_insn *prev;
16505 rtx body;
16506
16507 if (!TARGET_FIX_ERR_A53_835769)
16508 return false;
16509
16510 if (!INSN_P (insn) || recog_memoized (insn) < 0)
16511 return false;
16512
16513 attr_type = get_attr_type (insn);
16514 if (!is_madd_op (attr_type))
16515 return false;
16516
16517 prev = aarch64_prev_real_insn (insn);
16518 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
16519 Restore recog state to INSN to avoid state corruption. */
16520 extract_constrain_insn_cached (insn);
16521
16522 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
16523 return false;
16524
16525 body = single_set (prev);
16526
16527 /* If the previous insn is a memory op and there is no dependency between
16528 it and the DImode madd, emit a NOP between them. If body is NULL then we
16529 have a complex memory operation, probably a load/store pair.
16530 Be conservative for now and emit a NOP. */
16531 if (GET_MODE (recog_data.operand[0]) == DImode
16532 && (!body || !dep_between_memop_and_curr (body)))
16533 return true;
16534
16535 return false;
16536
16537 }
16538
16539
16540 /* Implement FINAL_PRESCAN_INSN. */
16541
16542 void
16543 aarch64_final_prescan_insn (rtx_insn *insn)
16544 {
16545 if (aarch64_madd_needs_nop (insn))
16546 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
16547 }
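/* Illustrative example (editorial addition, not part of the original
   source): on a core affected by erratum 835769 the workaround turns, for
   instance,

     ldr  x2, [x0]
     madd x3, x4, x5, x6

   into

     ldr  x2, [x0]
     nop // between mem op and mult-accumulate
     madd x3, x4, x5, x6

   whenever the multiply-accumulate is 64-bit and no register dependency
   links it to the preceding memory access.  */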
16548
16549
16550 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
16551 instruction. */
16552
16553 bool
16554 aarch64_sve_index_immediate_p (rtx base_or_step)
16555 {
16556 return (CONST_INT_P (base_or_step)
16557 && IN_RANGE (INTVAL (base_or_step), -16, 15));
16558 }
16559
16560 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
16561 when applied to mode MODE. Negate X first if NEGATE_P is true. */
16562
16563 bool
16564 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
16565 {
16566 rtx elt = unwrap_const_vec_duplicate (x);
16567 if (!CONST_INT_P (elt))
16568 return false;
16569
16570 HOST_WIDE_INT val = INTVAL (elt);
16571 if (negate_p)
16572 val = -val;
16573 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
16574
16575 if (val & 0xff)
16576 return IN_RANGE (val, 0, 0xff);
16577 return IN_RANGE (val, 0, 0xff00);
16578 }
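/* Illustrative values (editorial addition, not part of the original
   source): after masking to the element width, the accepted immediates are
   an unsigned byte, optionally shifted left by 8:

     ADD z0.s, z0.s, #255      // 0xff  -> valid
     ADD z0.s, z0.s, #256      // 0x100 -> valid (byte 1, LSL #8)
     ADD z0.s, z0.s, #257      // 0x101 -> rejected here

   i.e. 0-255, or a multiple of 256 up to 65280.  */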
16579
16580 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
16581 instructions when applied to mode MODE. Negate X first if NEGATE_P
16582 is true. */
16583
16584 bool
16585 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
16586 {
16587 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
16588 return false;
16589
16590 /* After the optional negation, the immediate must be nonnegative.
16591 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
16592 instead of SQADD Zn.B, Zn.B, #129. */
16593 rtx elt = unwrap_const_vec_duplicate (x);
16594 return negate_p == (INTVAL (elt) < 0);
16595 }
16596
16597 /* Return true if X is a valid immediate operand for an SVE logical
16598 instruction such as AND. */
16599
16600 bool
16601 aarch64_sve_bitmask_immediate_p (rtx x)
16602 {
16603 rtx elt;
16604
16605 return (const_vec_duplicate_p (x, &elt)
16606 && CONST_INT_P (elt)
16607 && aarch64_bitmask_imm (INTVAL (elt),
16608 GET_MODE_INNER (GET_MODE (x))));
16609 }
16610
16611 /* Return true if X is a valid immediate for the SVE DUP and CPY
16612 instructions. */
16613
16614 bool
16615 aarch64_sve_dup_immediate_p (rtx x)
16616 {
16617 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
16618 if (!CONST_INT_P (x))
16619 return false;
16620
16621 HOST_WIDE_INT val = INTVAL (x);
16622 if (val & 0xff)
16623 return IN_RANGE (val, -0x80, 0x7f);
16624 return IN_RANGE (val, -0x8000, 0x7f00);
16625 }
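/* Illustrative values (editorial addition, not part of the original
   source): DUP/CPY take a signed byte, optionally shifted left by 8, so
   after taking the bit representation the accepted values are:

     #127     -> valid (byte)
     #-129    -> rejected (low byte nonzero, outside -128..127)
     #-256    -> valid (byte -1, LSL #8)
     #32512   -> valid (0x7f00)

   i.e. -128..127, or a multiple of 256 in -32768..32512.  */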
16626
16627 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
16628 SIGNED_P says whether the operand is signed rather than unsigned. */
16629
16630 bool
16631 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
16632 {
16633 x = unwrap_const_vec_duplicate (x);
16634 return (CONST_INT_P (x)
16635 && (signed_p
16636 ? IN_RANGE (INTVAL (x), -16, 15)
16637 : IN_RANGE (INTVAL (x), 0, 127)));
16638 }
16639
16640 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
16641 instruction. Negate X first if NEGATE_P is true. */
16642
16643 bool
16644 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
16645 {
16646 rtx elt;
16647 REAL_VALUE_TYPE r;
16648
16649 if (!const_vec_duplicate_p (x, &elt)
16650 || GET_CODE (elt) != CONST_DOUBLE)
16651 return false;
16652
16653 r = *CONST_DOUBLE_REAL_VALUE (elt);
16654
16655 if (negate_p)
16656 r = real_value_negate (&r);
16657
16658 if (real_equal (&r, &dconst1))
16659 return true;
16660 if (real_equal (&r, &dconsthalf))
16661 return true;
16662 return false;
16663 }
16664
16665 /* Return true if X is a valid immediate operand for an SVE FMUL
16666 instruction. */
16667
16668 bool
16669 aarch64_sve_float_mul_immediate_p (rtx x)
16670 {
16671 rtx elt;
16672
16673 return (const_vec_duplicate_p (x, &elt)
16674 && GET_CODE (elt) == CONST_DOUBLE
16675 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
16676 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
16677 }
16678
16679 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
16680 for the Advanced SIMD operation described by WHICH and INSN. If INFO
16681 is nonnull, use it to describe valid immediates. */
16682 static bool
16683 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
16684 simd_immediate_info *info,
16685 enum simd_immediate_check which,
16686 simd_immediate_info::insn_type insn)
16687 {
16688 /* Try a 4-byte immediate with LSL. */
16689 for (unsigned int shift = 0; shift < 32; shift += 8)
16690 if ((val32 & (0xff << shift)) == val32)
16691 {
16692 if (info)
16693 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16694 simd_immediate_info::LSL, shift);
16695 return true;
16696 }
16697
16698 /* Try a 2-byte immediate with LSL. */
16699 unsigned int imm16 = val32 & 0xffff;
16700 if (imm16 == (val32 >> 16))
16701 for (unsigned int shift = 0; shift < 16; shift += 8)
16702 if ((imm16 & (0xff << shift)) == imm16)
16703 {
16704 if (info)
16705 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
16706 simd_immediate_info::LSL, shift);
16707 return true;
16708 }
16709
16710 /* Try a 4-byte immediate with MSL, except for cases that MVN
16711 can handle. */
16712 if (which == AARCH64_CHECK_MOV)
16713 for (unsigned int shift = 8; shift < 24; shift += 8)
16714 {
16715 unsigned int low = (1 << shift) - 1;
16716 if (((val32 & (0xff << shift)) | low) == val32)
16717 {
16718 if (info)
16719 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16720 simd_immediate_info::MSL, shift);
16721 return true;
16722 }
16723 }
16724
16725 return false;
16726 }
16727
16728 /* Return true if replicating VAL64 is a valid immediate for the
16729 Advanced SIMD operation described by WHICH. If INFO is nonnull,
16730 use it to describe valid immediates. */
16731 static bool
16732 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
16733 simd_immediate_info *info,
16734 enum simd_immediate_check which)
16735 {
16736 unsigned int val32 = val64 & 0xffffffff;
16737 unsigned int val16 = val64 & 0xffff;
16738 unsigned int val8 = val64 & 0xff;
16739
16740 if (val32 == (val64 >> 32))
16741 {
16742 if ((which & AARCH64_CHECK_ORR) != 0
16743 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
16744 simd_immediate_info::MOV))
16745 return true;
16746
16747 if ((which & AARCH64_CHECK_BIC) != 0
16748 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
16749 simd_immediate_info::MVN))
16750 return true;
16751
16752 /* Try using a replicated byte. */
16753 if (which == AARCH64_CHECK_MOV
16754 && val16 == (val32 >> 16)
16755 && val8 == (val16 >> 8))
16756 {
16757 if (info)
16758 *info = simd_immediate_info (QImode, val8);
16759 return true;
16760 }
16761 }
16762
16763 /* Try using a bit-to-bytemask. */
16764 if (which == AARCH64_CHECK_MOV)
16765 {
16766 unsigned int i;
16767 for (i = 0; i < 64; i += 8)
16768 {
16769 unsigned char byte = (val64 >> i) & 0xff;
16770 if (byte != 0 && byte != 0xff)
16771 break;
16772 }
16773 if (i == 64)
16774 {
16775 if (info)
16776 *info = simd_immediate_info (DImode, val64);
16777 return true;
16778 }
16779 }
16780 return false;
16781 }
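/* Illustrative 64-bit replication patterns (editorial addition, not part of
   the original source), classified for a plain MOV-style query
   (AARCH64_CHECK_MOV):

     0x2a2a2a2a2a2a2a2a   // replicated byte            -> 8-bit MOVI
     0x004d004d004d004d   // replicated halfword 0x4d   -> 16-bit MOVI, LSL #0
     0x0000ffff00000000   // every byte 0x00 or 0xff    -> 64-bit bytemask MOVI
     0x0123456789abcdef   // no usable repetition       -> rejected

   The ORR/BIC-only queries accept only the shifted-byte forms handled by
   aarch64_advsimd_valid_immediate_hs.  */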
16782
16783 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
16784 instruction. If INFO is nonnull, use it to describe valid immediates. */
16785
16786 static bool
16787 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
16788 simd_immediate_info *info)
16789 {
16790 scalar_int_mode mode = DImode;
16791 unsigned int val32 = val64 & 0xffffffff;
16792 if (val32 == (val64 >> 32))
16793 {
16794 mode = SImode;
16795 unsigned int val16 = val32 & 0xffff;
16796 if (val16 == (val32 >> 16))
16797 {
16798 mode = HImode;
16799 unsigned int val8 = val16 & 0xff;
16800 if (val8 == (val16 >> 8))
16801 mode = QImode;
16802 }
16803 }
16804 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
16805 if (IN_RANGE (val, -0x80, 0x7f))
16806 {
16807 /* DUP with no shift. */
16808 if (info)
16809 *info = simd_immediate_info (mode, val);
16810 return true;
16811 }
16812 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
16813 {
16814 /* DUP with LSL #8. */
16815 if (info)
16816 *info = simd_immediate_info (mode, val);
16817 return true;
16818 }
16819 if (aarch64_bitmask_imm (val64, mode))
16820 {
16821 /* DUPM. */
16822 if (info)
16823 *info = simd_immediate_info (mode, val);
16824 return true;
16825 }
16826 return false;
16827 }
16828
16829 /* Return true if X is an UNSPEC_PTRUE constant of the form:
16830
16831 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
16832
16833 where PATTERN is the svpattern as a CONST_INT and where ZERO
16834 is a zero constant of the required PTRUE mode (which can have
16835 fewer elements than X's mode, if zero bits are significant).
16836
16837 If so, and if INFO is nonnull, describe the immediate in INFO. */
16838 bool
16839 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
16840 {
16841 if (GET_CODE (x) != CONST)
16842 return false;
16843
16844 x = XEXP (x, 0);
16845 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
16846 return false;
16847
16848 if (info)
16849 {
16850 aarch64_svpattern pattern
16851 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
16852 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
16853 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
16854 *info = simd_immediate_info (int_mode, pattern);
16855 }
16856 return true;
16857 }
16858
16859 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
16860 it to describe valid immediates. */
16861
16862 static bool
16863 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
16864 {
16865 if (aarch64_sve_ptrue_svpattern_p (x, info))
16866 return true;
16867
16868 if (x == CONST0_RTX (GET_MODE (x)))
16869 {
16870 if (info)
16871 *info = simd_immediate_info (DImode, 0);
16872 return true;
16873 }
16874
16875 /* Analyze the value as a VNx16BImode. This should be relatively
16876 efficient, since rtx_vector_builder has enough built-in capacity
16877 to store all VLA predicate constants without needing the heap. */
16878 rtx_vector_builder builder;
16879 if (!aarch64_get_sve_pred_bits (builder, x))
16880 return false;
16881
16882 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
16883 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
16884 {
16885 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
16886 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
16887 if (pattern != AARCH64_NUM_SVPATTERNS)
16888 {
16889 if (info)
16890 {
16891 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
16892 *info = simd_immediate_info (int_mode, pattern);
16893 }
16894 return true;
16895 }
16896 }
16897 return false;
16898 }
16899
16900 /* Return true if OP is a valid SIMD immediate for the operation
16901 described by WHICH. If INFO is nonnull, use it to describe valid
16902 immediates. */
16903 bool
16904 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
16905 enum simd_immediate_check which)
16906 {
16907 machine_mode mode = GET_MODE (op);
16908 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16909 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
16910 return false;
16911
16912 if (vec_flags & VEC_SVE_PRED)
16913 return aarch64_sve_pred_valid_immediate (op, info);
16914
16915 scalar_mode elt_mode = GET_MODE_INNER (mode);
16916 rtx base, step;
16917 unsigned int n_elts;
16918 if (GET_CODE (op) == CONST_VECTOR
16919 && CONST_VECTOR_DUPLICATE_P (op))
16920 n_elts = CONST_VECTOR_NPATTERNS (op);
16921 else if ((vec_flags & VEC_SVE_DATA)
16922 && const_vec_series_p (op, &base, &step))
16923 {
16924 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
16925 if (!aarch64_sve_index_immediate_p (base)
16926 || !aarch64_sve_index_immediate_p (step))
16927 return false;
16928
16929 if (info)
16930 {
16931 /* Get the corresponding container mode. E.g. an INDEX on V2SI
16932 should yield two integer values per 128-bit block, meaning
16933 that we need to treat it in the same way as V2DI and then
16934 ignore the upper 32 bits of each element. */
16935 elt_mode = aarch64_sve_container_int_mode (mode);
16936 *info = simd_immediate_info (elt_mode, base, step);
16937 }
16938 return true;
16939 }
16940 else if (GET_CODE (op) == CONST_VECTOR
16941 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
16942 /* N_ELTS set above. */;
16943 else
16944 return false;
16945
16946 scalar_float_mode elt_float_mode;
16947 if (n_elts == 1
16948 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
16949 {
16950 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
16951 if (aarch64_float_const_zero_rtx_p (elt)
16952 || aarch64_float_const_representable_p (elt))
16953 {
16954 if (info)
16955 *info = simd_immediate_info (elt_float_mode, elt);
16956 return true;
16957 }
16958 }
16959
16960 /* If all elements in an SVE vector have the same value, we have a free
16961 choice between using the element mode and using the container mode.
16962 Using the element mode means that unused parts of the vector are
16963 duplicates of the used elements, while using the container mode means
16964 that the unused parts are an extension of the used elements. Using the
16965 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
16966 for its container mode VNx4SI while 0x00000101 isn't.
16967
16968 If not all elements in an SVE vector have the same value, we need the
16969 transition from one element to the next to occur at container boundaries.
16970 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
16971 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
16972 scalar_int_mode elt_int_mode;
16973 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
16974 elt_int_mode = aarch64_sve_container_int_mode (mode);
16975 else
16976 elt_int_mode = int_mode_for_mode (elt_mode).require ();
16977
16978 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
16979 if (elt_size > 8)
16980 return false;
16981
16982 /* Expand the vector constant out into a byte vector, with the least
16983 significant byte of the register first. */
16984 auto_vec<unsigned char, 16> bytes;
16985 bytes.reserve (n_elts * elt_size);
16986 for (unsigned int i = 0; i < n_elts; i++)
16987 {
16988 /* The vector is provided in gcc endian-neutral fashion.
16989 For aarch64_be Advanced SIMD, it must be laid out in the vector
16990 register in reverse order. */
16991 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
16992 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
16993
16994 if (elt_mode != elt_int_mode)
16995 elt = gen_lowpart (elt_int_mode, elt);
16996
16997 if (!CONST_INT_P (elt))
16998 return false;
16999
17000 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
17001 for (unsigned int byte = 0; byte < elt_size; byte++)
17002 {
17003 bytes.quick_push (elt_val & 0xff);
17004 elt_val >>= BITS_PER_UNIT;
17005 }
17006 }
17007
17008 /* The immediate must repeat every eight bytes. */
17009 unsigned int nbytes = bytes.length ();
17010 for (unsigned i = 8; i < nbytes; ++i)
17011 if (bytes[i] != bytes[i - 8])
17012 return false;
17013
17014 /* Get the repeating 8-byte value as an integer. No endian correction
17015 is needed here because bytes is already in lsb-first order. */
17016 unsigned HOST_WIDE_INT val64 = 0;
17017 for (unsigned int i = 0; i < 8; i++)
17018 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
17019 << (i * BITS_PER_UNIT));
17020
17021 if (vec_flags & VEC_SVE_DATA)
17022 return aarch64_sve_valid_immediate (val64, info);
17023 else
17024 return aarch64_advsimd_valid_immediate (val64, info, which);
17025 }
17026
17027 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
17028 has a step in the range of INDEX. Return the index expression if so,
17029 otherwise return null. */
17030 rtx
17031 aarch64_check_zero_based_sve_index_immediate (rtx x)
17032 {
17033 rtx base, step;
17034 if (const_vec_series_p (x, &base, &step)
17035 && base == const0_rtx
17036 && aarch64_sve_index_immediate_p (step))
17037 return step;
17038 return NULL_RTX;
17039 }
17040
17041 /* Check that immediate shift constants are within range. */
17042 bool
17043 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
17044 {
17045 x = unwrap_const_vec_duplicate (x);
17046 if (!CONST_INT_P (x))
17047 return false;
17048 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
17049 if (left)
17050 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
17051 else
17052 return IN_RANGE (INTVAL (x), 1, bit_width);
17053 }
17054
17055 /* Return the bitmask CONST_INT to select the bits required by a zero extract
17056 operation of width WIDTH at bit position POS. */
17057
17058 rtx
17059 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
17060 {
17061 gcc_assert (CONST_INT_P (width));
17062 gcc_assert (CONST_INT_P (pos));
17063
17064 unsigned HOST_WIDE_INT mask
17065 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
17066 return GEN_INT (mask << UINTVAL (pos));
17067 }
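/* Illustrative example (editorial addition, not part of the original
   source): for a zero extract of width 8 at bit position 16 this returns

     ((1 << 8) - 1) << 16 == 0x00ff0000

   i.e. the mask an AND-based pattern needs to isolate that bit-field.  */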
17068
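/* Return true if X is a suitable "mov" source operand for mode MODE: a HIGH
   of a valid symbolic reference, a CONST_INT, a valid vector immediate
   (with predicate constants canonicalized to VNx16BI before RA), a constant
   DImode symbol address, an SVE CNT[BHWD]-style immediate, or a
   tiny-absolute symbolic expression.  */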
17069 bool
17070 aarch64_mov_operand_p (rtx x, machine_mode mode)
17071 {
17072 if (GET_CODE (x) == HIGH
17073 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
17074 return true;
17075
17076 if (CONST_INT_P (x))
17077 return true;
17078
17079 if (VECTOR_MODE_P (GET_MODE (x)))
17080 {
17081 /* Require predicate constants to be VNx16BI before RA, so that we
17082 force everything to have a canonical form. */
17083 if (!lra_in_progress
17084 && !reload_completed
17085 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
17086 && GET_MODE (x) != VNx16BImode)
17087 return false;
17088
17089 return aarch64_simd_valid_immediate (x, NULL);
17090 }
17091
17092 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
17093 return true;
17094
17095 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
17096 return true;
17097
17098 return aarch64_classify_symbolic_expression (x)
17099 == SYMBOL_TINY_ABSOLUTE;
17100 }
17101
17102 /* Return a const_int vector of VAL. */
17103 rtx
17104 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
17105 {
17106 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
17107 return gen_const_vec_duplicate (mode, c);
17108 }
17109
17110 /* Check OP is a legal scalar immediate for the MOVI instruction. */
17111
17112 bool
17113 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
17114 {
17115 machine_mode vmode;
17116
17117 vmode = aarch64_simd_container_mode (mode, 64);
17118 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
17119 return aarch64_simd_valid_immediate (op_v, NULL);
17120 }
17121
17122 /* Construct and return a PARALLEL RTX vector with elements numbering the
17123 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
17124 the vector - from the perspective of the architecture. This does not
17125 line up with GCC's perspective on lane numbers, so we end up with
17126 different masks depending on our target endian-ness. The diagram
17127 below may help. We must draw the distinction when building masks
17128 which select one half of the vector. An instruction selecting
17129 architectural low-lanes for a big-endian target, must be described using
17130 a mask selecting GCC high-lanes.
17131
17132 Big-Endian Little-Endian
17133
17134 GCC 0 1 2 3 3 2 1 0
17135 | x | x | x | x | | x | x | x | x |
17136 Architecture 3 2 1 0 3 2 1 0
17137
17138 Low Mask: { 2, 3 } { 0, 1 }
17139 High Mask: { 0, 1 } { 2, 3 }
17140
17141 MODE is the mode of the vector and NUNITS is the number of units in it. */
17142
17143 rtx
17144 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
17145 {
17146 rtvec v = rtvec_alloc (nunits / 2);
17147 int high_base = nunits / 2;
17148 int low_base = 0;
17149 int base;
17150 rtx t1;
17151 int i;
17152
17153 if (BYTES_BIG_ENDIAN)
17154 base = high ? low_base : high_base;
17155 else
17156 base = high ? high_base : low_base;
17157
17158 for (i = 0; i < nunits / 2; i++)
17159 RTVEC_ELT (v, i) = GEN_INT (base + i);
17160
17161 t1 = gen_rtx_PARALLEL (mode, v);
17162 return t1;
17163 }
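/* Illustrative example (editorial addition, not part of the original
   source): for V4SImode (NUNITS == 4) the returned PARALLELs are

     little-endian:  low half -> (parallel [0 1]),  high half -> (parallel [2 3])
     big-endian:     low half -> (parallel [2 3]),  high half -> (parallel [0 1])

   matching the Low/High mask table in the comment above.  */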
17164
17165 /* Check OP for validity as a PARALLEL RTX vector with elements
17166 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
17167 from the perspective of the architecture. See the diagram above
17168 aarch64_simd_vect_par_cnst_half for more details. */
17169
17170 bool
17171 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
17172 bool high)
17173 {
17174 int nelts;
17175 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
17176 return false;
17177
17178 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
17179 HOST_WIDE_INT count_op = XVECLEN (op, 0);
17180 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
17181 int i = 0;
17182
17183 if (count_op != count_ideal)
17184 return false;
17185
17186 for (i = 0; i < count_ideal; i++)
17187 {
17188 rtx elt_op = XVECEXP (op, 0, i);
17189 rtx elt_ideal = XVECEXP (ideal, 0, i);
17190
17191 if (!CONST_INT_P (elt_op)
17192 || INTVAL (elt_ideal) != INTVAL (elt_op))
17193 return false;
17194 }
17195 return true;
17196 }
17197
17198 /* Return a PARALLEL containing NELTS elements, with element I equal
17199 to BASE + I * STEP. */
17200
17201 rtx
17202 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
17203 {
17204 rtvec vec = rtvec_alloc (nelts);
17205 for (unsigned int i = 0; i < nelts; ++i)
17206 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
17207 return gen_rtx_PARALLEL (VOIDmode, vec);
17208 }
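/* Illustrative example (editorial addition, not part of the original
   source):

     aarch64_gen_stepped_int_parallel (4, 1, 2)

   builds (parallel [(const_int 1) (const_int 3) (const_int 5) (const_int 7)]),
   and aarch64_stepped_int_parallel_p accepts that RTX for STEP == 2.  */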
17209
17210 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
17211 series with step STEP. */
17212
17213 bool
17214 aarch64_stepped_int_parallel_p (rtx op, int step)
17215 {
17216 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
17217 return false;
17218
17219 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
17220 for (int i = 1; i < XVECLEN (op, 0); ++i)
17221 if (!CONST_INT_P (XVECEXP (op, 0, i))
17222 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
17223 return false;
17224
17225 return true;
17226 }
17227
17228 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
17229 HIGH (exclusive). */
17230 void
17231 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
17232 const_tree exp)
17233 {
17234 HOST_WIDE_INT lane;
17235 gcc_assert (CONST_INT_P (operand));
17236 lane = INTVAL (operand);
17237
17238 if (lane < low || lane >= high)
17239 {
17240 if (exp)
17241 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
17242 else
17243 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
17244 }
17245 }
17246
17247 /* Perform endian correction on lane number N, which indexes a vector
17248 of mode MODE, and return the result as an SImode rtx. */
17249
17250 rtx
17251 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
17252 {
17253 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
17254 }
17255
17256 /* Return TRUE if OP is a valid vector addressing mode. */
17257
17258 bool
17259 aarch64_simd_mem_operand_p (rtx op)
17260 {
17261 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
17262 || REG_P (XEXP (op, 0)));
17263 }
17264
17265 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
17266
17267 bool
17268 aarch64_sve_ld1r_operand_p (rtx op)
17269 {
17270 struct aarch64_address_info addr;
17271 scalar_mode mode;
17272
17273 return (MEM_P (op)
17274 && is_a <scalar_mode> (GET_MODE (op), &mode)
17275 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
17276 && addr.type == ADDRESS_REG_IMM
17277 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
17278 }
17279
17280 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
17281 where the size of the read data is specified by `mode` and the size of the
17282 vector elements is specified by `elem_mode`. */
17283 bool
17284 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
17285 scalar_mode elem_mode)
17286 {
17287 struct aarch64_address_info addr;
17288 if (!MEM_P (op)
17289 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
17290 return false;
17291
17292 if (addr.type == ADDRESS_REG_IMM)
17293 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
17294
17295 if (addr.type == ADDRESS_REG_REG)
17296 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
17297
17298 return false;
17299 }
17300
17301 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
17302 bool
17303 aarch64_sve_ld1rq_operand_p (rtx op)
17304 {
17305 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
17306 GET_MODE_INNER (GET_MODE (op)));
17307 }
17308
17309 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
17310 accessing a vector where the element size is specified by `elem_mode`. */
17311 bool
17312 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
17313 {
17314 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
17315 }
17316
17317 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
17318 bool
17319 aarch64_sve_ldff1_operand_p (rtx op)
17320 {
17321 if (!MEM_P (op))
17322 return false;
17323
17324 struct aarch64_address_info addr;
17325 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
17326 return false;
17327
17328 if (addr.type == ADDRESS_REG_IMM)
17329 return known_eq (addr.const_offset, 0);
17330
17331 return addr.type == ADDRESS_REG_REG;
17332 }
17333
17334 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
17335 bool
17336 aarch64_sve_ldnf1_operand_p (rtx op)
17337 {
17338 struct aarch64_address_info addr;
17339
17340 return (MEM_P (op)
17341 && aarch64_classify_address (&addr, XEXP (op, 0),
17342 GET_MODE (op), false)
17343 && addr.type == ADDRESS_REG_IMM);
17344 }
17345
17346 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
17347 The conditions for STR are the same. */
17348 bool
17349 aarch64_sve_ldr_operand_p (rtx op)
17350 {
17351 struct aarch64_address_info addr;
17352
17353 return (MEM_P (op)
17354 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
17355 false, ADDR_QUERY_ANY)
17356 && addr.type == ADDRESS_REG_IMM);
17357 }
17358
17359 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
17360 addressing memory of mode MODE. */
17361 bool
17362 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
17363 {
17364 struct aarch64_address_info addr;
17365 if (!aarch64_classify_address (&addr, op, mode, false))
17366 return false;
17367
17368 if (addr.type == ADDRESS_REG_IMM)
17369 return known_eq (addr.const_offset, 0);
17370
17371 return addr.type == ADDRESS_REG_REG;
17372 }
17373
17374 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
17375 We need to be able to access the individual pieces, so the range
17376 is different from LD[234] and ST[234]. */
17377 bool
17378 aarch64_sve_struct_memory_operand_p (rtx op)
17379 {
17380 if (!MEM_P (op))
17381 return false;
17382
17383 machine_mode mode = GET_MODE (op);
17384 struct aarch64_address_info addr;
17385 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
17386 ADDR_QUERY_ANY)
17387 || addr.type != ADDRESS_REG_IMM)
17388 return false;
17389
17390 poly_int64 first = addr.const_offset;
17391 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
17392 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
17393 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
17394 }
17395
17396 /* Emit a register copy from operand to operand, taking care not to
17397 early-clobber source registers in the process.
17398
17399 COUNT is the number of components into which the copy needs to be
17400 decomposed. */
17401 void
17402 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
17403 unsigned int count)
17404 {
17405 unsigned int i;
17406 int rdest = REGNO (operands[0]);
17407 int rsrc = REGNO (operands[1]);
17408
17409 if (!reg_overlap_mentioned_p (operands[0], operands[1])
17410 || rdest < rsrc)
17411 for (i = 0; i < count; i++)
17412 emit_move_insn (gen_rtx_REG (mode, rdest + i),
17413 gen_rtx_REG (mode, rsrc + i));
17414 else
17415 for (i = 0; i < count; i++)
17416 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
17417 gen_rtx_REG (mode, rsrc + count - i - 1));
17418 }
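/* Illustrative example (editorial addition, not part of the original
   source): when copying a two-register value from (v0,v1) to (v1,v2), a
   forward copy would clobber v1 before it is read, so the loop above copies
   backwards, emitting something like

     mov v2.16b, v1.16b
     mov v1.16b, v0.16b

   Non-overlapping copies, or overlapping ones with RDEST < RSRC, go
   forwards instead.  */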
17419
17420 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
17421 one of VSTRUCT modes: OI, CI, or XI. */
17422 int
17423 aarch64_simd_attr_length_rglist (machine_mode mode)
17424 {
17425 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
17426 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
17427 }
17428
17429 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
17430 alignment of a vector to 128 bits. SVE predicates have an alignment of
17431 16 bits. */
17432 static HOST_WIDE_INT
17433 aarch64_simd_vector_alignment (const_tree type)
17434 {
17435 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
17436 be set for non-predicate vectors of booleans. Modes are the most
17437 direct way we have of identifying real SVE predicate types. */
17438 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
17439 return 16;
17440 widest_int min_size
17441 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
17442 return wi::umin (min_size, 128).to_uhwi ();
17443 }
17444
17445 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
17446 static poly_uint64
17447 aarch64_vectorize_preferred_vector_alignment (const_tree type)
17448 {
17449 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
17450 {
17451 /* If the length of the vector is fixed, try to align to that length,
17452 otherwise don't try to align at all. */
17453 HOST_WIDE_INT result;
17454 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
17455 result = TYPE_ALIGN (TREE_TYPE (type));
17456 return result;
17457 }
17458 return TYPE_ALIGN (type);
17459 }
17460
17461 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
17462 static bool
17463 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
17464 {
17465 if (is_packed)
17466 return false;
17467
17468 /* For fixed-length vectors, check that the vectorizer will aim for
17469 full-vector alignment. This isn't true for generic GCC vectors
17470 that are wider than the ABI maximum of 128 bits. */
17471 poly_uint64 preferred_alignment =
17472 aarch64_vectorize_preferred_vector_alignment (type);
17473 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
17474 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
17475 preferred_alignment))
17476 return false;
17477
17478 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
17479 return true;
17480 }
17481
17482 /* Return true if the vector misalignment factor is supported by the
17483 target. */
17484 static bool
17485 aarch64_builtin_support_vector_misalignment (machine_mode mode,
17486 const_tree type, int misalignment,
17487 bool is_packed)
17488 {
17489 if (TARGET_SIMD && STRICT_ALIGNMENT)
17490 {
17491 /* Return if movmisalign pattern is not supported for this mode. */
17492 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
17493 return false;
17494
17495 /* Misalignment factor is unknown at compile time. */
17496 if (misalignment == -1)
17497 return false;
17498 }
17499 return default_builtin_support_vector_misalignment (mode, type, misalignment,
17500 is_packed);
17501 }
17502
17503 /* If VALS is a vector constant that can be loaded into a register
17504 using DUP, generate instructions to do so and return an RTX to
17505 assign to the register. Otherwise return NULL_RTX. */
17506 static rtx
17507 aarch64_simd_dup_constant (rtx vals)
17508 {
17509 machine_mode mode = GET_MODE (vals);
17510 machine_mode inner_mode = GET_MODE_INNER (mode);
17511 rtx x;
17512
17513 if (!const_vec_duplicate_p (vals, &x))
17514 return NULL_RTX;
17515
17516 /* We can load this constant by using DUP and a constant in a
17517 single ARM register. This will be cheaper than a vector
17518 load. */
17519 x = copy_to_mode_reg (inner_mode, x);
17520 return gen_vec_duplicate (mode, x);
17521 }
17522
17523
17524 /* Generate code to load VALS, which is a PARALLEL containing only
17525 constants (for vec_init) or CONST_VECTOR, efficiently into a
17526 register. Returns an RTX to copy into the register, or NULL_RTX
17527 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
17528 static rtx
17529 aarch64_simd_make_constant (rtx vals)
17530 {
17531 machine_mode mode = GET_MODE (vals);
17532 rtx const_dup;
17533 rtx const_vec = NULL_RTX;
17534 int n_const = 0;
17535 int i;
17536
17537 if (GET_CODE (vals) == CONST_VECTOR)
17538 const_vec = vals;
17539 else if (GET_CODE (vals) == PARALLEL)
17540 {
17541 /* A CONST_VECTOR must contain only CONST_INTs and
17542 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
17543 Only store valid constants in a CONST_VECTOR. */
17544 int n_elts = XVECLEN (vals, 0);
17545 for (i = 0; i < n_elts; ++i)
17546 {
17547 rtx x = XVECEXP (vals, 0, i);
17548 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17549 n_const++;
17550 }
17551 if (n_const == n_elts)
17552 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
17553 }
17554 else
17555 gcc_unreachable ();
17556
17557 if (const_vec != NULL_RTX
17558 && aarch64_simd_valid_immediate (const_vec, NULL))
17559 /* Load using MOVI/MVNI. */
17560 return const_vec;
17561 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
17562 /* Loaded using DUP. */
17563 return const_dup;
17564 else if (const_vec != NULL_RTX)
17565 /* Load from constant pool. We cannot take advantage of single-cycle
17566 LD1 because we need a PC-relative addressing mode. */
17567 return const_vec;
17568 else
17569 /* A PARALLEL containing something not valid inside CONST_VECTOR.
17570 We cannot construct an initializer. */
17571 return NULL_RTX;
17572 }
17573
17574 /* Expand a vector initialisation sequence, such that TARGET is
17575 initialised to contain VALS. */
17576
17577 void
17578 aarch64_expand_vector_init (rtx target, rtx vals)
17579 {
17580 machine_mode mode = GET_MODE (target);
17581 scalar_mode inner_mode = GET_MODE_INNER (mode);
17582 /* The number of vector elements. */
17583 int n_elts = XVECLEN (vals, 0);
17584 /* The number of vector elements which are not constant. */
17585 int n_var = 0;
17586 rtx any_const = NULL_RTX;
17587 /* The first element of vals. */
17588 rtx v0 = XVECEXP (vals, 0, 0);
17589 bool all_same = true;
17590
17591 /* This is a special vec_init<M><N> where N is not an element mode but a
17592 vector mode with half the elements of M. We expect to find two entries
17593 of mode N in VALS and we must put their concatenation into TARGET. */
17594 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
17595 {
17596 gcc_assert (known_eq (GET_MODE_SIZE (mode),
17597 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
17598 rtx lo = XVECEXP (vals, 0, 0);
17599 rtx hi = XVECEXP (vals, 0, 1);
17600 machine_mode narrow_mode = GET_MODE (lo);
17601 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
17602 gcc_assert (narrow_mode == GET_MODE (hi));
17603
17604 /* When we want to concatenate a half-width vector with zeroes we can
17605 use the aarch64_combinez[_be] patterns. Just make sure that the
17606 zeroes are in the right half. */
17607 if (BYTES_BIG_ENDIAN
17608 && aarch64_simd_imm_zero (lo, narrow_mode)
17609 && general_operand (hi, narrow_mode))
17610 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
17611 else if (!BYTES_BIG_ENDIAN
17612 && aarch64_simd_imm_zero (hi, narrow_mode)
17613 && general_operand (lo, narrow_mode))
17614 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
17615 else
17616 {
17617 /* Else create the two half-width registers and combine them. */
17618 if (!REG_P (lo))
17619 lo = force_reg (GET_MODE (lo), lo);
17620 if (!REG_P (hi))
17621 hi = force_reg (GET_MODE (hi), hi);
17622
17623 if (BYTES_BIG_ENDIAN)
17624 std::swap (lo, hi);
17625 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
17626 }
17627 return;
17628 }
17629
17630 /* Count the number of variable elements to initialise. */
17631 for (int i = 0; i < n_elts; ++i)
17632 {
17633 rtx x = XVECEXP (vals, 0, i);
17634 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
17635 ++n_var;
17636 else
17637 any_const = x;
17638
17639 all_same &= rtx_equal_p (x, v0);
17640 }
17641
17642 /* No variable elements, hand off to aarch64_simd_make_constant which knows
17643 how best to handle this. */
17644 if (n_var == 0)
17645 {
17646 rtx constant = aarch64_simd_make_constant (vals);
17647 if (constant != NULL_RTX)
17648 {
17649 emit_move_insn (target, constant);
17650 return;
17651 }
17652 }
17653
17654 /* Splat a single non-constant element if we can. */
17655 if (all_same)
17656 {
17657 rtx x = copy_to_mode_reg (inner_mode, v0);
17658 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17659 return;
17660 }
17661
17662 enum insn_code icode = optab_handler (vec_set_optab, mode);
17663 gcc_assert (icode != CODE_FOR_nothing);
17664
17665 /* If there are only variable elements, try to optimize
17666 the insertion using dup for the most common element
17667 followed by insertions. */
17668
17669 /* The algorithm will fill matches[*][0] with the earliest matching element,
17670 and matches[X][1] with the count of duplicate elements (if X is the
17671 earliest element which has duplicates). */
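/* For example, for VALS == {x, y, x, x} (a purely illustrative case) the
   loops below produce matches[0] == {0, 3}, matches[1] == {1, 1},
   matches[2] == {0, 0} and matches[3] == {0, 0}, so x is the most common
   element: it is broadcast first and y is then inserted into lane 1.  */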
17672
17673 if (n_var == n_elts && n_elts <= 16)
17674 {
17675 int matches[16][2] = {0};
17676 for (int i = 0; i < n_elts; i++)
17677 {
17678 for (int j = 0; j <= i; j++)
17679 {
17680 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
17681 {
17682 matches[i][0] = j;
17683 matches[j][1]++;
17684 break;
17685 }
17686 }
17687 }
17688 int maxelement = 0;
17689 int maxv = 0;
17690 for (int i = 0; i < n_elts; i++)
17691 if (matches[i][1] > maxv)
17692 {
17693 maxelement = i;
17694 maxv = matches[i][1];
17695 }
17696
17697 /* Create a duplicate of the most common element, unless all elements
17698 are equally useless to us, in which case just immediately set the
17699 vector register using the first element. */
17700
17701 if (maxv == 1)
17702 {
17703 /* For vectors of two 64-bit elements, we can do even better. */
17704 if (n_elts == 2
17705 && (inner_mode == E_DImode
17706 || inner_mode == E_DFmode))
17707
17708 {
17709 rtx x0 = XVECEXP (vals, 0, 0);
17710 rtx x1 = XVECEXP (vals, 0, 1);
17711 /* Combine can pick up this case, but handling it directly
17712 here leaves clearer RTL.
17713
17714 This is load_pair_lanes<mode>, and also gives us a clean-up
17715 for store_pair_lanes<mode>. */
17716 if (memory_operand (x0, inner_mode)
17717 && memory_operand (x1, inner_mode)
17718 && !STRICT_ALIGNMENT
17719 && rtx_equal_p (XEXP (x1, 0),
17720 plus_constant (Pmode,
17721 XEXP (x0, 0),
17722 GET_MODE_SIZE (inner_mode))))
17723 {
17724 rtx t;
17725 if (inner_mode == DFmode)
17726 t = gen_load_pair_lanesdf (target, x0, x1);
17727 else
17728 t = gen_load_pair_lanesdi (target, x0, x1);
17729 emit_insn (t);
17730 return;
17731 }
17732 }
17733 /* The subreg-move sequence below will move into lane zero of the
17734 vector register. For big-endian we want that position to hold
17735 the last element of VALS. */
17736 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
17737 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17738 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
17739 }
17740 else
17741 {
17742 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17743 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17744 }
17745
17746 /* Insert the rest. */
17747 for (int i = 0; i < n_elts; i++)
17748 {
17749 rtx x = XVECEXP (vals, 0, i);
17750 if (matches[i][0] == maxelement)
17751 continue;
17752 x = copy_to_mode_reg (inner_mode, x);
17753 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17754 }
17755 return;
17756 }
17757
17758 /* Initialise a vector which is part-variable. We want to first try
17759 to build those lanes which are constant in the most efficient way we
17760 can. */
17761 if (n_var != n_elts)
17762 {
17763 rtx copy = copy_rtx (vals);
17764
17765 /* Load constant part of vector. We really don't care what goes into the
17766 parts we will overwrite, but we're more likely to be able to load the
17767 constant efficiently if it has fewer, larger, repeating parts
17768 (see aarch64_simd_valid_immediate). */
17769 for (int i = 0; i < n_elts; i++)
17770 {
17771 rtx x = XVECEXP (vals, 0, i);
17772 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17773 continue;
17774 rtx subst = any_const;
17775 for (int bit = n_elts / 2; bit > 0; bit /= 2)
17776 {
17777 /* Look in the copied vector, as more elements are const. */
17778 rtx test = XVECEXP (copy, 0, i ^ bit);
17779 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
17780 {
17781 subst = test;
17782 break;
17783 }
17784 }
17785 XVECEXP (copy, 0, i) = subst;
17786 }
17787 aarch64_expand_vector_init (target, copy);
17788 }
17789
17790 /* Insert the variable lanes directly. */
17791 for (int i = 0; i < n_elts; i++)
17792 {
17793 rtx x = XVECEXP (vals, 0, i);
17794 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17795 continue;
17796 x = copy_to_mode_reg (inner_mode, x);
17797 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17798 }
17799 }
17800
17801 /* Emit RTL corresponding to:
17802 insr TARGET, ELEM. */
17803
17804 static void
17805 emit_insr (rtx target, rtx elem)
17806 {
17807 machine_mode mode = GET_MODE (target);
17808 scalar_mode elem_mode = GET_MODE_INNER (mode);
17809 elem = force_reg (elem_mode, elem);
17810
17811 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
17812 gcc_assert (icode != CODE_FOR_nothing);
17813 emit_insn (GEN_FCN (icode) (target, target, elem));
17814 }
17815
17816 /* Subroutine of aarch64_sve_expand_vector_init for handling
17817 trailing constants.
17818 This function works as follows:
17819 (a) Create a new vector consisting of trailing constants.
17820 (b) Initialize TARGET with the constant vector using emit_move_insn.
17821 (c) Insert remaining elements in TARGET using insr.
17822 NELTS is the total number of elements in the original vector, while
17823 NELTS_REQD is the number of elements that are actually
17824 significant.
17825
17826 ??? The heuristic used is to do the above only if the number of constants
17827 is at least half the total number of elements. May need fine tuning. */
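
/* A worked example (x and y denote arbitrary non-constant elements):
   for BUILDER == {x, y, 1, 2} with NELTS == NELTS_REQD == 4 there are two
   trailing constants, which is >= 4 / 2, so TARGET is first loaded with a
   constant vector whose leading elements are {1, 2, ...} and the remaining
   elements are then shifted in:
     insr	target, y
     insr	target, x
   giving {x, y, 1, 2}.  */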
17828
17829 static bool
17830 aarch64_sve_expand_vector_init_handle_trailing_constants
17831 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
17832 {
17833 machine_mode mode = GET_MODE (target);
17834 scalar_mode elem_mode = GET_MODE_INNER (mode);
17835 int n_trailing_constants = 0;
17836
17837 for (int i = nelts_reqd - 1;
17838 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
17839 i--)
17840 n_trailing_constants++;
17841
17842 if (n_trailing_constants >= nelts_reqd / 2)
17843 {
17844 rtx_vector_builder v (mode, 1, nelts);
17845 for (int i = 0; i < nelts; i++)
17846 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
17847 rtx const_vec = v.build ();
17848 emit_move_insn (target, const_vec);
17849
17850 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
17851 emit_insr (target, builder.elt (i));
17852
17853 return true;
17854 }
17855
17856 return false;
17857 }
17858
17859 /* Subroutine of aarch64_sve_expand_vector_init.
17860 Works as follows:
17861 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
17862 (b) Skip trailing elements from BUILDER, which are the same as
17863 element NELTS_REQD - 1.
17864 (c) Insert earlier elements in reverse order in TARGET using insr. */
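
/* For instance (a and b denote arbitrary non-constant elements): for
   BUILDER == {a, b, b, b} and NELTS_REQD == 4, element 3 (b) is broadcast
   with a dup, the three trailing duplicates are skipped, and a single
   "insr target, a" then produces {a, b, b, b}.  */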
17865
17866 static void
17867 aarch64_sve_expand_vector_init_insert_elems (rtx target,
17868 const rtx_vector_builder &builder,
17869 int nelts_reqd)
17870 {
17871 machine_mode mode = GET_MODE (target);
17872 scalar_mode elem_mode = GET_MODE_INNER (mode);
17873
17874 struct expand_operand ops[2];
17875 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
17876 gcc_assert (icode != CODE_FOR_nothing);
17877
17878 create_output_operand (&ops[0], target, mode);
17879 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
17880 expand_insn (icode, 2, ops);
17881
17882 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17883 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
17884 emit_insr (target, builder.elt (i));
17885 }
17886
17887 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
17888 when all trailing elements of BUILDER are the same.
17889 This works as follows:
17890 (a) Use expand_insn interface to broadcast last vector element in TARGET.
17891 (b) Insert remaining elements in TARGET using insr.
17892
17893 ??? The heuristic used is to do the above if the number of identical
17894 trailing elements is at least 3/4 of the total number of elements,
17895 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
17896
17897 static bool
17898 aarch64_sve_expand_vector_init_handle_trailing_same_elem
17899 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
17900 {
17901 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17902 if (ndups >= (3 * nelts_reqd) / 4)
17903 {
17904 aarch64_sve_expand_vector_init_insert_elems (target, builder,
17905 nelts_reqd - ndups + 1);
17906 return true;
17907 }
17908
17909 return false;
17910 }
17911
17912 /* Initialize register TARGET from BUILDER. NELTS is the constant number
17913 of elements in BUILDER.
17914
17915 The function tries to initialize TARGET from BUILDER if it fits one
17916 of the special cases outlined below.
17917
17918 Failing that, the function divides BUILDER into two sub-vectors:
17919 v_even = even elements of BUILDER;
17920 v_odd = odd elements of BUILDER;
17921
17922 and recursively calls itself with v_even and v_odd.
17923
17924 if (recursive call succeeded for v_even or v_odd)
17925 TARGET = zip (v_even, v_odd)
17926
17927 The function returns true if it managed to build TARGET from BUILDER
17928 with one of the special cases, false otherwise.
17929
17930 Example: {a, 1, b, 2, c, 3, d, 4}
17931
17932 The vector gets divided into:
17933 v_even = {a, b, c, d}
17934 v_odd = {1, 2, 3, 4}
17935
17936 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
17937 initializes tmp2 from the constant vector v_odd using emit_move_insn.
17938
17939 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
17940 4 non-constant elements that match none of the special cases, so we
construct tmp1 from v_even using insr:
17941 tmp1 = dup(d)
17942 insr tmp1, c
17943 insr tmp1, b
17944 insr tmp1, a
17945
17946 And finally:
17947 TARGET = zip (tmp1, tmp2)
17948 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
17949
17950 static bool
17951 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
17952 int nelts, int nelts_reqd)
17953 {
17954 machine_mode mode = GET_MODE (target);
17955
17956 /* Case 1: Vector contains trailing constants. */
17957
17958 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17959 (target, builder, nelts, nelts_reqd))
17960 return true;
17961
17962 /* Case 2: Vector contains leading constants. */
17963
17964 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
17965 for (int i = 0; i < nelts_reqd; i++)
17966 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
17967 rev_builder.finalize ();
17968
17969 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17970 (target, rev_builder, nelts, nelts_reqd))
17971 {
17972 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17973 return true;
17974 }
17975
17976 /* Case 3: Vector contains trailing same element. */
17977
17978 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17979 (target, builder, nelts_reqd))
17980 return true;
17981
17982 /* Case 4: Vector contains leading same element. */
17983
17984 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17985 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
17986 {
17987 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17988 return true;
17989 }
17990
17991 /* Avoid recursing below 4 elements.
17992 ??? The threshold 4 may need fine-tuning. */
17993
17994 if (nelts_reqd <= 4)
17995 return false;
17996
17997 rtx_vector_builder v_even (mode, 1, nelts);
17998 rtx_vector_builder v_odd (mode, 1, nelts);
17999
18000 for (int i = 0; i < nelts * 2; i += 2)
18001 {
18002 v_even.quick_push (builder.elt (i));
18003 v_odd.quick_push (builder.elt (i + 1));
18004 }
18005
18006 v_even.finalize ();
18007 v_odd.finalize ();
18008
18009 rtx tmp1 = gen_reg_rtx (mode);
18010 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
18011 nelts, nelts_reqd / 2);
18012
18013 rtx tmp2 = gen_reg_rtx (mode);
18014 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
18015 nelts, nelts_reqd / 2);
18016
18017 if (!did_even_p && !did_odd_p)
18018 return false;
18019
18020 /* Initialize whichever of v_even and v_odd did not match one of the
18021 special cases using INSR, then zip the two halves together. */
18022
18023 if (!did_even_p)
18024 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
18025
18026 if (!did_odd_p)
18027 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
18028
18029 rtvec v = gen_rtvec (2, tmp1, tmp2);
18030 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
18031 return true;
18032 }
18033
18034 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
18035
18036 void
18037 aarch64_sve_expand_vector_init (rtx target, rtx vals)
18038 {
18039 machine_mode mode = GET_MODE (target);
18040 int nelts = XVECLEN (vals, 0);
18041
18042 rtx_vector_builder v (mode, 1, nelts);
18043 for (int i = 0; i < nelts; i++)
18044 v.quick_push (XVECEXP (vals, 0, i));
18045 v.finalize ();
18046
18047 /* If neither sub-vector of v could be initialized specially,
18048 then use INSR to insert all elements from v into TARGET.
18049 ??? This might not be optimal for vectors with large
18050 initializers like 16 elements or more.
18051 For nelts < 4, it probably isn't useful to handle specially. */
18052
18053 if (nelts < 4
18054 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
18055 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
18056 }
18057
18058 /* Check whether VALUE is a vector constant in which every element
18059 is either a power of 2 or a negated power of 2. If so, return
18060 a constant vector of log2s, and flip CODE between PLUS and MINUS
18061 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
18062
18063 static rtx
18064 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
18065 {
18066 if (GET_CODE (value) != CONST_VECTOR)
18067 return NULL_RTX;
18068
18069 rtx_vector_builder builder;
18070 if (!builder.new_unary_operation (GET_MODE (value), value, false))
18071 return NULL_RTX;
18072
18073 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
18074 /* 1 if the result of the multiplication must be negated,
18075 0 if it mustn't, or -1 if we don't yet care. */
18076 int negate = -1;
18077 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
18078 for (unsigned int i = 0; i < encoded_nelts; ++i)
18079 {
18080 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
18081 if (!CONST_SCALAR_INT_P (elt))
18082 return NULL_RTX;
18083 rtx_mode_t val (elt, int_mode);
18084 wide_int pow2 = wi::neg (val);
18085 if (val != pow2)
18086 {
18087 /* It matters whether we negate or not. Make that choice,
18088 and make sure that it's consistent with previous elements. */
18089 if (negate == !wi::neg_p (val))
18090 return NULL_RTX;
18091 negate = wi::neg_p (val);
18092 if (!negate)
18093 pow2 = val;
18094 }
18095 /* POW2 is now the value that we want to be a power of 2. */
18096 int shift = wi::exact_log2 (pow2);
18097 if (shift < 0)
18098 return NULL_RTX;
18099 builder.quick_push (gen_int_mode (shift, int_mode));
18100 }
18101 if (negate == -1)
18102 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
18103 code = PLUS;
18104 else if (negate == 1)
18105 code = code == PLUS ? MINUS : PLUS;
18106 return builder.build ();
18107 }
18108
18109 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
18110 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
18111 operands array, in the same order as for fma_optab. Return true if
18112 the function emitted all the necessary instructions, false if the caller
18113 should generate the pattern normally with the new OPERANDS array. */
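
/* As an illustration: for an addition in which operands[2] is the constant
   splat {4, 4, ...}, the multiplication is rewritten as a left shift by 2
   and a plain vector add of operands[3] and the shifted value is emitted;
   for {-8, -8, ...} the shift amount is 3 and CODE is flipped so that the
   product is subtracted instead.  */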
18114
18115 bool
18116 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
18117 {
18118 machine_mode mode = GET_MODE (operands[0]);
18119 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
18120 {
18121 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
18122 NULL_RTX, true, OPTAB_DIRECT);
18123 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
18124 operands[3], product, operands[0], true,
18125 OPTAB_DIRECT);
18126 return true;
18127 }
18128 operands[2] = force_reg (mode, operands[2]);
18129 return false;
18130 }
18131
18132 /* Likewise, but for a conditional pattern. */
18133
18134 bool
18135 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
18136 {
18137 machine_mode mode = GET_MODE (operands[0]);
18138 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
18139 {
18140 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
18141 NULL_RTX, true, OPTAB_DIRECT);
18142 emit_insn (gen_cond (code, mode, operands[0], operands[1],
18143 operands[4], product, operands[5]));
18144 return true;
18145 }
18146 operands[3] = force_reg (mode, operands[3]);
18147 return false;
18148 }
18149
18150 static unsigned HOST_WIDE_INT
18151 aarch64_shift_truncation_mask (machine_mode mode)
18152 {
18153 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
18154 return 0;
18155 return GET_MODE_UNIT_BITSIZE (mode) - 1;
18156 }
18157
18158 /* Select a format to encode pointers in exception handling data. */
18159 int
18160 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
18161 {
18162 int type;
18163 switch (aarch64_cmodel)
18164 {
18165 case AARCH64_CMODEL_TINY:
18166 case AARCH64_CMODEL_TINY_PIC:
18167 case AARCH64_CMODEL_SMALL:
18168 case AARCH64_CMODEL_SMALL_PIC:
18169 case AARCH64_CMODEL_SMALL_SPIC:
18170 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
18171 for everything. */
18172 type = DW_EH_PE_sdata4;
18173 break;
18174 default:
18175 /* No assumptions here. 8-byte relocs required. */
18176 type = DW_EH_PE_sdata8;
18177 break;
18178 }
18179 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
18180 }
18181
18182 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
18183
18184 static void
18185 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
18186 {
18187 if (TREE_CODE (decl) == FUNCTION_DECL)
18188 {
18189 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
18190 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
18191 {
18192 fprintf (stream, "\t.variant_pcs\t");
18193 assemble_name (stream, name);
18194 fprintf (stream, "\n");
18195 }
18196 }
18197 }
18198
18199 /* The last .arch and .tune assembly strings that we printed. */
18200 static std::string aarch64_last_printed_arch_string;
18201 static std::string aarch64_last_printed_tune_string;
18202
18203 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
18204 by the function fndecl. */
18205
18206 void
18207 aarch64_declare_function_name (FILE *stream, const char* name,
18208 tree fndecl)
18209 {
18210 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18211
18212 struct cl_target_option *targ_options;
18213 if (target_parts)
18214 targ_options = TREE_TARGET_OPTION (target_parts);
18215 else
18216 targ_options = TREE_TARGET_OPTION (target_option_current_node);
18217 gcc_assert (targ_options);
18218
18219 const struct processor *this_arch
18220 = aarch64_get_arch (targ_options->x_explicit_arch);
18221
18222 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
18223 std::string extension
18224 = aarch64_get_extension_string_for_isa_flags (isa_flags,
18225 this_arch->flags);
18226 /* Only update the assembler .arch string if it is distinct from the last
18227 such string we printed. */
18228 std::string to_print = this_arch->name + extension;
18229 if (to_print != aarch64_last_printed_arch_string)
18230 {
18231 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
18232 aarch64_last_printed_arch_string = to_print;
18233 }
18234
18235 /* Print the cpu name we're tuning for in the comments; it might be
18236 useful to readers of the generated asm. Do it only when it changes
18237 from function to function and verbose assembly is requested. */
18238 const struct processor *this_tune
18239 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
18240
18241 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
18242 {
18243 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
18244 this_tune->name);
18245 aarch64_last_printed_tune_string = this_tune->name;
18246 }
18247
18248 aarch64_asm_output_variant_pcs (stream, fndecl, name);
18249
18250 /* Don't forget the type directive for ELF. */
18251 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
18252 ASM_OUTPUT_LABEL (stream, name);
18253
18254 cfun->machine->label_is_assembled = true;
18255 }
18256
18257 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
18258 the function label and emit a BTI if necessary. */
18259
18260 void
18261 aarch64_print_patchable_function_entry (FILE *file,
18262 unsigned HOST_WIDE_INT patch_area_size,
18263 bool record_p)
18264 {
18265 if (cfun->machine->label_is_assembled
18266 && aarch64_bti_enabled ()
18267 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
18268 {
18269 /* Remove the BTI that follows the patch area and insert a new BTI
18270 before the patch area right after the function label. */
18271 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
18272 if (insn
18273 && INSN_P (insn)
18274 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
18275 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
18276 delete_insn (insn);
18277 asm_fprintf (file, "\thint\t34 // bti c\n");
18278 }
18279
18280 default_print_patchable_function_entry (file, patch_area_size, record_p);
18281 }
18282
18283 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
18284
18285 void
18286 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
18287 {
18288 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
18289 const char *value = IDENTIFIER_POINTER (target);
18290 aarch64_asm_output_variant_pcs (stream, decl, name);
18291 ASM_OUTPUT_DEF (stream, name, value);
18292 }
18293
18294 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
18295 function symbol references. */
18296
18297 void
18298 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
18299 {
18300 default_elf_asm_output_external (stream, decl, name);
18301 aarch64_asm_output_variant_pcs (stream, decl, name);
18302 }
18303
18304 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
18305 Used to output the .cfi_b_key_frame directive when signing the current
18306 function with the B key. */
18307
18308 void
18309 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
18310 {
18311 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
18312 && aarch64_ra_sign_key == AARCH64_KEY_B)
18313 asm_fprintf (f, "\t.cfi_b_key_frame\n");
18314 }
18315
18316 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
18317
18318 static void
18319 aarch64_start_file (void)
18320 {
18321 struct cl_target_option *default_options
18322 = TREE_TARGET_OPTION (target_option_default_node);
18323
18324 const struct processor *default_arch
18325 = aarch64_get_arch (default_options->x_explicit_arch);
18326 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
18327 std::string extension
18328 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
18329 default_arch->flags);
18330
18331 aarch64_last_printed_arch_string = default_arch->name + extension;
18332 aarch64_last_printed_tune_string = "";
18333 asm_fprintf (asm_out_file, "\t.arch %s\n",
18334 aarch64_last_printed_arch_string.c_str ());
18335
18336 default_file_start ();
18337 }
18338
18339 /* Emit load exclusive. */
18340
18341 static void
18342 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
18343 rtx mem, rtx model_rtx)
18344 {
18345 if (mode == TImode)
18346 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
18347 gen_highpart (DImode, rval),
18348 mem, model_rtx));
18349 else
18350 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
18351 }
18352
18353 /* Emit store exclusive. */
18354
18355 static void
18356 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
18357 rtx mem, rtx rval, rtx model_rtx)
18358 {
18359 if (mode == TImode)
18360 emit_insn (gen_aarch64_store_exclusive_pair
18361 (bval, mem, operand_subword (rval, 0, 0, TImode),
18362 operand_subword (rval, 1, 0, TImode), model_rtx));
18363 else
18364 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
18365 }
18366
18367 /* Mark the previous jump instruction as unlikely. */
18368
18369 static void
18370 aarch64_emit_unlikely_jump (rtx insn)
18371 {
18372 rtx_insn *jump = emit_jump_insn (insn);
18373 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
18374 }
18375
18376 /* We store the names of the various atomic helpers in a 5x4 array.
18377 Return the libcall function given MODE, MODEL and NAMES. */
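
/* For example, a CAS on an SImode location with MEMMODEL_ACQUIRE maps to
   mode_idx 2 and model_idx 1, i.e. the "__aarch64_cas4_acq" helper from
   aarch64_ool_cas_names below.  */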
18378
18379 rtx
18380 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
18381 const atomic_ool_names *names)
18382 {
18383 memmodel model = memmodel_base (INTVAL (model_rtx));
18384 int mode_idx, model_idx;
18385
18386 switch (mode)
18387 {
18388 case E_QImode:
18389 mode_idx = 0;
18390 break;
18391 case E_HImode:
18392 mode_idx = 1;
18393 break;
18394 case E_SImode:
18395 mode_idx = 2;
18396 break;
18397 case E_DImode:
18398 mode_idx = 3;
18399 break;
18400 case E_TImode:
18401 mode_idx = 4;
18402 break;
18403 default:
18404 gcc_unreachable ();
18405 }
18406
18407 switch (model)
18408 {
18409 case MEMMODEL_RELAXED:
18410 model_idx = 0;
18411 break;
18412 case MEMMODEL_CONSUME:
18413 case MEMMODEL_ACQUIRE:
18414 model_idx = 1;
18415 break;
18416 case MEMMODEL_RELEASE:
18417 model_idx = 2;
18418 break;
18419 case MEMMODEL_ACQ_REL:
18420 case MEMMODEL_SEQ_CST:
18421 model_idx = 3;
18422 break;
18423 default:
18424 gcc_unreachable ();
18425 }
18426
18427 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
18428 VISIBILITY_HIDDEN);
18429 }
18430
18431 #define DEF0(B, N) \
18432 { "__aarch64_" #B #N "_relax", \
18433 "__aarch64_" #B #N "_acq", \
18434 "__aarch64_" #B #N "_rel", \
18435 "__aarch64_" #B #N "_acq_rel" }
18436
18437 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
18438 { NULL, NULL, NULL, NULL }
18439 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
18440
18441 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
18442 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
18443 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
18444 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
18445 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
18446 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
18447
18448 #undef DEF0
18449 #undef DEF4
18450 #undef DEF5
18451
18452 /* Expand a compare and swap pattern. */
18453
18454 void
18455 aarch64_expand_compare_and_swap (rtx operands[])
18456 {
18457 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
18458 machine_mode mode, r_mode;
18459
18460 bval = operands[0];
18461 rval = operands[1];
18462 mem = operands[2];
18463 oldval = operands[3];
18464 newval = operands[4];
18465 is_weak = operands[5];
18466 mod_s = operands[6];
18467 mod_f = operands[7];
18468 mode = GET_MODE (mem);
18469
18470 /* Normally the succ memory model must be stronger than fail, but in the
18471 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
18472 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
18473 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
18474 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
18475 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
18476
18477 r_mode = mode;
18478 if (mode == QImode || mode == HImode)
18479 {
18480 r_mode = SImode;
18481 rval = gen_reg_rtx (r_mode);
18482 }
18483
18484 if (TARGET_LSE)
18485 {
18486 /* The CAS insn requires oldval and rval to overlap, but we need to
18487 have a copy of oldval saved across the operation to tell whether
18488 the operation was successful. */
18489 if (reg_overlap_mentioned_p (rval, oldval))
18490 rval = copy_to_mode_reg (r_mode, oldval);
18491 else
18492 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
18493
18494 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
18495 newval, mod_s));
18496 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18497 }
18498 else if (TARGET_OUTLINE_ATOMICS)
18499 {
18500 /* Oldval must satisfy compare afterward. */
18501 if (!aarch64_plus_operand (oldval, mode))
18502 oldval = force_reg (mode, oldval);
18503 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
18504 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
18505 oldval, mode, newval, mode,
18506 XEXP (mem, 0), Pmode);
18507 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18508 }
18509 else
18510 {
18511 /* The oldval predicate varies by mode. Test it and force to reg. */
18512 insn_code code = code_for_aarch64_compare_and_swap (mode);
18513 if (!insn_data[code].operand[2].predicate (oldval, mode))
18514 oldval = force_reg (mode, oldval);
18515
18516 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
18517 is_weak, mod_s, mod_f));
18518 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
18519 }
18520
18521 if (r_mode != mode)
18522 rval = gen_lowpart (mode, rval);
18523 emit_move_insn (operands[1], rval);
18524
18525 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
18526 emit_insn (gen_rtx_SET (bval, x));
18527 }
18528
18529 /* Emit a barrier appropriate for memory model MODEL at the end of a
18530 sequence implementing an atomic operation. */
18531
18532 static void
18533 aarch64_emit_post_barrier (enum memmodel model)
18534 {
18535 const enum memmodel base_model = memmodel_base (model);
18536
18537 if (is_mm_sync (model)
18538 && (base_model == MEMMODEL_ACQUIRE
18539 || base_model == MEMMODEL_ACQ_REL
18540 || base_model == MEMMODEL_SEQ_CST))
18541 {
18542 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
18543 }
18544 }
18545
18546 /* Split a compare and swap pattern. */
18547
18548 void
18549 aarch64_split_compare_and_swap (rtx operands[])
18550 {
18551 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
18552 gcc_assert (epilogue_completed);
18553
18554 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
18555 machine_mode mode;
18556 bool is_weak;
18557 rtx_code_label *label1, *label2;
18558 enum memmodel model;
18559
18560 rval = operands[0];
18561 mem = operands[1];
18562 oldval = operands[2];
18563 newval = operands[3];
18564 is_weak = (operands[4] != const0_rtx);
18565 model_rtx = operands[5];
18566 scratch = operands[7];
18567 mode = GET_MODE (mem);
18568 model = memmodel_from_int (INTVAL (model_rtx));
18569
18570 /* When OLDVAL is zero and we want the strong version we can emit a tighter
18571 loop:
18572 .label1:
18573 LD[A]XR rval, [mem]
18574 CBNZ rval, .label2
18575 ST[L]XR scratch, newval, [mem]
18576 CBNZ scratch, .label1
18577 .label2:
18578 CMP rval, 0. */
18579 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
18580 oldval == const0_rtx && mode != TImode);
18581
18582 label1 = NULL;
18583 if (!is_weak)
18584 {
18585 label1 = gen_label_rtx ();
18586 emit_label (label1);
18587 }
18588 label2 = gen_label_rtx ();
18589
18590 /* The initial load can be relaxed for a __sync operation since a final
18591 barrier will be emitted to stop code hoisting. */
18592 if (is_mm_sync (model))
18593 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
18594 else
18595 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
18596
18597 if (strong_zero_p)
18598 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
18599 else
18600 {
18601 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18602 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
18603 }
18604 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18605 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
18606 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18607
18608 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
18609
18610 if (!is_weak)
18611 {
18612 if (aarch64_track_speculation)
18613 {
18614 /* Emit an explicit compare instruction, so that we can correctly
18615 track the condition codes. */
18616 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18617 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18618 }
18619 else
18620 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
18621
18622 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18623 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
18624 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18625 }
18626 else
18627 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18628
18629 emit_label (label2);
18630
18631 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
18632 to set the condition flags. If this is not used it will be removed by
18633 later passes. */
18634 if (strong_zero_p)
18635 aarch64_gen_compare_reg (NE, rval, const0_rtx);
18636
18637 /* Emit any final barrier needed for a __sync operation. */
18638 if (is_mm_sync (model))
18639 aarch64_emit_post_barrier (model);
18640 }
18641
18642 /* Split an atomic operation. */
18643
18644 void
18645 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
18646 rtx value, rtx model_rtx, rtx cond)
18647 {
18648 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
18649 gcc_assert (epilogue_completed);
18650
18651 machine_mode mode = GET_MODE (mem);
18652 machine_mode wmode = (mode == DImode ? DImode : SImode);
18653 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
18654 const bool is_sync = is_mm_sync (model);
18655 rtx_code_label *label;
18656 rtx x;
18657
18658 /* Split the atomic operation into a sequence. */
18659 label = gen_label_rtx ();
18660 emit_label (label);
18661
18662 if (new_out)
18663 new_out = gen_lowpart (wmode, new_out);
18664 if (old_out)
18665 old_out = gen_lowpart (wmode, old_out);
18666 else
18667 old_out = new_out;
18668 value = simplify_gen_subreg (wmode, value, mode, 0);
18669
18670 /* The initial load can be relaxed for a __sync operation since a final
18671 barrier will be emitted to stop code hoisting. */
18672 if (is_sync)
18673 aarch64_emit_load_exclusive (mode, old_out, mem,
18674 GEN_INT (MEMMODEL_RELAXED));
18675 else
18676 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
18677
18678 switch (code)
18679 {
18680 case SET:
18681 new_out = value;
18682 break;
18683
18684 case NOT:
18685 x = gen_rtx_AND (wmode, old_out, value);
18686 emit_insn (gen_rtx_SET (new_out, x));
18687 x = gen_rtx_NOT (wmode, new_out);
18688 emit_insn (gen_rtx_SET (new_out, x));
18689 break;
18690
18691 case MINUS:
18692 if (CONST_INT_P (value))
18693 {
18694 value = GEN_INT (-INTVAL (value));
18695 code = PLUS;
18696 }
18697 /* Fall through. */
18698
18699 default:
18700 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
18701 emit_insn (gen_rtx_SET (new_out, x));
18702 break;
18703 }
18704
18705 aarch64_emit_store_exclusive (mode, cond, mem,
18706 gen_lowpart (mode, new_out), model_rtx);
18707
18708 if (aarch64_track_speculation)
18709 {
18710 /* Emit an explicit compare instruction, so that we can correctly
18711 track the condition codes. */
18712 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
18713 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18714 }
18715 else
18716 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
18717
18718 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18719 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
18720 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18721
18722 /* Emit any final barrier needed for a __sync operation. */
18723 if (is_sync)
18724 aarch64_emit_post_barrier (model);
18725 }
18726
18727 static void
18728 aarch64_init_libfuncs (void)
18729 {
18730 /* Half-precision float operations. The compiler handles all operations
18731 with NULL libfuncs by converting to SFmode. */
18732
18733 /* Conversions. */
18734 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
18735 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
18736
18737 /* Arithmetic. */
18738 set_optab_libfunc (add_optab, HFmode, NULL);
18739 set_optab_libfunc (sdiv_optab, HFmode, NULL);
18740 set_optab_libfunc (smul_optab, HFmode, NULL);
18741 set_optab_libfunc (neg_optab, HFmode, NULL);
18742 set_optab_libfunc (sub_optab, HFmode, NULL);
18743
18744 /* Comparisons. */
18745 set_optab_libfunc (eq_optab, HFmode, NULL);
18746 set_optab_libfunc (ne_optab, HFmode, NULL);
18747 set_optab_libfunc (lt_optab, HFmode, NULL);
18748 set_optab_libfunc (le_optab, HFmode, NULL);
18749 set_optab_libfunc (ge_optab, HFmode, NULL);
18750 set_optab_libfunc (gt_optab, HFmode, NULL);
18751 set_optab_libfunc (unord_optab, HFmode, NULL);
18752 }
18753
18754 /* Target hook for c_mode_for_suffix. */
18755 static machine_mode
18756 aarch64_c_mode_for_suffix (char suffix)
18757 {
18758 if (suffix == 'q')
18759 return TFmode;
18760
18761 return VOIDmode;
18762 }
18763
18764 /* We can only represent floating point constants which will fit in
18765 "quarter-precision" values. These values are characterised by
18766 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
18767 by:
18768
18769 (-1)^s * (n/16) * 2^r
18770
18771 Where:
18772 's' is the sign bit.
18773 'n' is an integer in the range 16 <= n <= 31.
18774 'r' is an integer in the range -3 <= r <= 4. */
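
/* For example, 1.0 is (16/16) * 2^0 and 0.3125 is (20/16) * 2^-2, so both
   are representable; the largest representable magnitude is
   (31/16) * 2^4 == 31.0, and a value such as 0.2 (not a dyadic fraction)
   is not representable.  */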
18775
18776 /* Return true iff X can be represented as a quarter-precision
18777 floating point immediate operand. Note that we cannot represent 0.0. */
18778 bool
18779 aarch64_float_const_representable_p (rtx x)
18780 {
18781 /* This represents our current view of how many bits
18782 make up the mantissa. */
18783 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
18784 int exponent;
18785 unsigned HOST_WIDE_INT mantissa, mask;
18786 REAL_VALUE_TYPE r, m;
18787 bool fail;
18788
18789 x = unwrap_const_vec_duplicate (x);
18790 if (!CONST_DOUBLE_P (x))
18791 return false;
18792
18793 if (GET_MODE (x) == VOIDmode
18794 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
18795 return false;
18796
18797 r = *CONST_DOUBLE_REAL_VALUE (x);
18798
18799 /* We cannot represent infinities, NaNs or +/-zero. We won't
18800 know if we have +zero until we analyse the mantissa, but we
18801 can reject the other invalid values. */
18802 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
18803 || REAL_VALUE_MINUS_ZERO (r))
18804 return false;
18805
18806 /* Extract exponent. */
18807 r = real_value_abs (&r);
18808 exponent = REAL_EXP (&r);
18809
18810 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
18811 highest (sign) bit, with a fixed binary point at bit point_pos.
18812 m1 holds the low part of the mantissa, m2 the high part.
18813 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
18814 bits for the mantissa, this can fail (low bits will be lost). */
18815 real_ldexp (&m, &r, point_pos - exponent);
18816 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
18817
18818 /* If the low part of the mantissa has bits set we cannot represent
18819 the value. */
18820 if (w.ulow () != 0)
18821 return false;
18822 /* We have rejected the lower HOST_WIDE_INT, so update our
18823 understanding of how many bits lie in the mantissa and
18824 look only at the high HOST_WIDE_INT. */
18825 mantissa = w.elt (1);
18826 point_pos -= HOST_BITS_PER_WIDE_INT;
18827
18828 /* We can only represent values with a mantissa of the form 1.xxxx. */
18829 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
18830 if ((mantissa & mask) != 0)
18831 return false;
18832
18833 /* Having filtered unrepresentable values, we may now remove all
18834 but the highest 5 bits. */
18835 mantissa >>= point_pos - 5;
18836
18837 /* We cannot represent the value 0.0, so reject it. This is handled
18838 elsewhere. */
18839 if (mantissa == 0)
18840 return false;
18841
18842 /* Then, as bit 4 is always set, we can mask it off, leaving
18843 the mantissa in the range [0, 15]. */
18844 mantissa &= ~(1 << 4);
18845 gcc_assert (mantissa <= 15);
18846
18847 /* GCC internally does not use an IEEE754-like encoding (where normalized
18848 significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
18849 Our mantissa values are shifted 4 places to the left relative to
18850 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
18851 by 5 places to correct for GCC's representation. */
18852 exponent = 5 - exponent;
18853
18854 return (exponent >= 0 && exponent <= 7);
18855 }
18856
18857 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
18858 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
18859 output MOVI/MVNI, ORR or BIC immediate. */
18860 char*
18861 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
18862 enum simd_immediate_check which)
18863 {
18864 bool is_valid;
18865 static char templ[40];
18866 const char *mnemonic;
18867 const char *shift_op;
18868 unsigned int lane_count = 0;
18869 char element_char;
18870
18871 struct simd_immediate_info info;
18872
18873 /* This will return true to show that const_vector is legal for use as either
18874 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
18875 It will also update INFO to show how the immediate should be generated.
18876 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
18877 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
18878 gcc_assert (is_valid);
18879
18880 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18881 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
18882
18883 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
18884 {
18885 gcc_assert (info.insn == simd_immediate_info::MOV
18886 && info.u.mov.shift == 0);
18887 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
18888 move immediate path. */
18889 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
18890 info.u.mov.value = GEN_INT (0);
18891 else
18892 {
18893 const unsigned int buf_size = 20;
18894 char float_buf[buf_size] = {'\0'};
18895 real_to_decimal_for_mode (float_buf,
18896 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
18897 buf_size, buf_size, 1, info.elt_mode);
18898
18899 if (lane_count == 1)
18900 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
18901 else
18902 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
18903 lane_count, element_char, float_buf);
18904 return templ;
18905 }
18906 }
18907
18908 gcc_assert (CONST_INT_P (info.u.mov.value));
18909
18910 if (which == AARCH64_CHECK_MOV)
18911 {
18912 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
18913 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
18914 ? "msl" : "lsl");
18915 if (lane_count == 1)
18916 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
18917 mnemonic, UINTVAL (info.u.mov.value));
18918 else if (info.u.mov.shift)
18919 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18920 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
18921 element_char, UINTVAL (info.u.mov.value), shift_op,
18922 info.u.mov.shift);
18923 else
18924 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18925 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
18926 element_char, UINTVAL (info.u.mov.value));
18927 }
18928 else
18929 {
18930 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
18931 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
18932 if (info.u.mov.shift)
18933 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18934 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
18935 element_char, UINTVAL (info.u.mov.value), "lsl",
18936 info.u.mov.shift);
18937 else
18938 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18939 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
18940 element_char, UINTVAL (info.u.mov.value));
18941 }
18942 return templ;
18943 }
18944
18945 char*
18946 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
18947 {
18948
18949 /* If a floating point number was passed and we desire to use it in an
18950 integer mode, do the conversion to integer. */
18951 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
18952 {
18953 unsigned HOST_WIDE_INT ival;
18954 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
18955 gcc_unreachable ();
18956 immediate = gen_int_mode (ival, mode);
18957 }
18958
18959 machine_mode vmode;
18960 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
18961 a 128-bit vector mode. */
18962 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
18963
18964 vmode = aarch64_simd_container_mode (mode, width);
18965 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
18966 return aarch64_output_simd_mov_immediate (v_op, width);
18967 }
18968
18969 /* Return the output string to use for moving immediate CONST_VECTOR
18970 into an SVE register. */
18971
18972 char *
18973 aarch64_output_sve_mov_immediate (rtx const_vector)
18974 {
18975 static char templ[40];
18976 struct simd_immediate_info info;
18977 char element_char;
18978
18979 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
18980 gcc_assert (is_valid);
18981
18982 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18983
18984 machine_mode vec_mode = GET_MODE (const_vector);
18985 if (aarch64_sve_pred_mode_p (vec_mode))
18986 {
18987 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
18988 if (info.insn == simd_immediate_info::MOV)
18989 {
18990 gcc_assert (info.u.mov.value == const0_rtx);
18991 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
18992 }
18993 else
18994 {
18995 gcc_assert (info.insn == simd_immediate_info::PTRUE);
18996 unsigned int total_bytes;
18997 if (info.u.pattern == AARCH64_SV_ALL
18998 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
18999 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
19000 total_bytes / GET_MODE_SIZE (info.elt_mode));
19001 else
19002 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
19003 svpattern_token (info.u.pattern));
19004 }
19005 return buf;
19006 }
19007
19008 if (info.insn == simd_immediate_info::INDEX)
19009 {
19010 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
19011 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
19012 element_char, INTVAL (info.u.index.base),
19013 INTVAL (info.u.index.step));
19014 return templ;
19015 }
19016
19017 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
19018 {
19019 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
19020 info.u.mov.value = GEN_INT (0);
19021 else
19022 {
19023 const int buf_size = 20;
19024 char float_buf[buf_size] = {};
19025 real_to_decimal_for_mode (float_buf,
19026 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
19027 buf_size, buf_size, 1, info.elt_mode);
19028
19029 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
19030 element_char, float_buf);
19031 return templ;
19032 }
19033 }
19034
19035 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
19036 element_char, INTVAL (info.u.mov.value));
19037 return templ;
19038 }
19039
19040 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
19041 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
19042 pattern. */
19043
19044 char *
19045 aarch64_output_sve_ptrues (rtx const_unspec)
19046 {
19047 static char templ[40];
19048
19049 struct simd_immediate_info info;
19050 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
19051 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
19052
19053 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19054 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
19055 svpattern_token (info.u.pattern));
19056 return templ;
19057 }
19058
19059 /* Split operands into moves from op[1] + op[2] into op[0]. */
19060
19061 void
19062 aarch64_split_combinev16qi (rtx operands[3])
19063 {
19064 unsigned int dest = REGNO (operands[0]);
19065 unsigned int src1 = REGNO (operands[1]);
19066 unsigned int src2 = REGNO (operands[2]);
19067 machine_mode halfmode = GET_MODE (operands[1]);
19068 unsigned int halfregs = REG_NREGS (operands[1]);
19069 rtx destlo, desthi;
19070
19071 gcc_assert (halfmode == V16QImode);
19072
19073 if (src1 == dest && src2 == dest + halfregs)
19074 {
19075 /* No-op move. Can't split to nothing; emit something. */
19076 emit_note (NOTE_INSN_DELETED);
19077 return;
19078 }
19079
19080 /* Preserve register attributes for variable tracking. */
19081 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
19082 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
19083 GET_MODE_SIZE (halfmode));
19084
19085 /* Special case of reversed high/low parts: swap the two source registers with a three-XOR sequence so that no scratch register is needed. */
19086 if (reg_overlap_mentioned_p (operands[2], destlo)
19087 && reg_overlap_mentioned_p (operands[1], desthi))
19088 {
19089 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
19090 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
19091 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
19092 }
19093 else if (!reg_overlap_mentioned_p (operands[2], destlo))
19094 {
19095 /* Try to avoid unnecessary moves if part of the result
19096 is in the right place already. */
19097 if (src1 != dest)
19098 emit_move_insn (destlo, operands[1]);
19099 if (src2 != dest + halfregs)
19100 emit_move_insn (desthi, operands[2]);
19101 }
19102 else
19103 {
19104 if (src2 != dest + halfregs)
19105 emit_move_insn (desthi, operands[2]);
19106 if (src1 != dest)
19107 emit_move_insn (destlo, operands[1]);
19108 }
19109 }
19110
19111 /* vec_perm support. */
19112
19113 struct expand_vec_perm_d
19114 {
19115 rtx target, op0, op1;
19116 vec_perm_indices perm;
19117 machine_mode vmode;
19118 unsigned int vec_flags;
19119 bool one_vector_p;
19120 bool testing_p;
19121 };
19122
19123 /* Generate a variable permutation. */
19124
19125 static void
19126 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
19127 {
19128 machine_mode vmode = GET_MODE (target);
19129 bool one_vector_p = rtx_equal_p (op0, op1);
19130
19131 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
19132 gcc_checking_assert (GET_MODE (op0) == vmode);
19133 gcc_checking_assert (GET_MODE (op1) == vmode);
19134 gcc_checking_assert (GET_MODE (sel) == vmode);
19135 gcc_checking_assert (TARGET_SIMD);
19136
19137 if (one_vector_p)
19138 {
19139 if (vmode == V8QImode)
19140 {
19141 /* Expand the argument to a V16QI mode by duplicating it. */
19142 rtx pair = gen_reg_rtx (V16QImode);
19143 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
19144 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
19145 }
19146 else
19147 {
19148 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
19149 }
19150 }
19151 else
19152 {
19153 rtx pair;
19154
19155 if (vmode == V8QImode)
19156 {
19157 pair = gen_reg_rtx (V16QImode);
19158 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
19159 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
19160 }
19161 else
19162 {
19163 pair = gen_reg_rtx (OImode);
19164 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
19165 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
19166 }
19167 }
19168 }
19169
19170 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
19171 NELT is the number of elements in the vector. */
19172
19173 void
19174 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
19175 unsigned int nelt)
19176 {
19177 machine_mode vmode = GET_MODE (target);
19178 bool one_vector_p = rtx_equal_p (op0, op1);
19179 rtx mask;
19180
19181 /* The TBL instruction does not use a modulo index, so we must take care
19182 of that ourselves. */
19183 mask = aarch64_simd_gen_const_vector_dup (vmode,
19184 one_vector_p ? nelt - 1 : 2 * nelt - 1);
19185 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
19186
19187 /* For big-endian, we also need to reverse the index within the vector
19188 (but not which vector). */
19189 if (BYTES_BIG_ENDIAN)
19190 {
19191 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
19192 if (!one_vector_p)
19193 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
19194 sel = expand_simple_binop (vmode, XOR, sel, mask,
19195 NULL, 0, OPTAB_LIB_WIDEN);
19196 }
19197 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
19198 }
19199
19200 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
19201
19202 static void
19203 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
19204 {
19205 emit_insn (gen_rtx_SET (target,
19206 gen_rtx_UNSPEC (GET_MODE (target),
19207 gen_rtvec (2, op0, op1), code)));
19208 }
19209
19210 /* Expand an SVE vec_perm with the given operands. */
19211
19212 void
19213 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
19214 {
19215 machine_mode data_mode = GET_MODE (target);
19216 machine_mode sel_mode = GET_MODE (sel);
19217 /* Enforced by the pattern condition. */
19218 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
19219
19220 /* Note: vec_perm indices are supposed to wrap when they go beyond the
19221 size of the two value vectors, i.e. the upper bits of the indices
19222 are effectively ignored. SVE TBL instead produces 0 for any
19223 out-of-range indices, so we need to modulo all the vec_perm indices
19224 to ensure they are all in range. */
19225 rtx sel_reg = force_reg (sel_mode, sel);
19226
19227 /* Check if the sel only references the first values vector. */
19228 if (GET_CODE (sel) == CONST_VECTOR
19229 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
19230 {
19231 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
19232 return;
19233 }
19234
19235 /* Check if the two values vectors are the same. */
19236 if (rtx_equal_p (op0, op1))
19237 {
19238 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
19239 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19240 NULL, 0, OPTAB_DIRECT);
19241 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
19242 return;
19243 }
19244
19245 /* Run TBL on each value vector and combine the results. */
19246
19247 rtx res0 = gen_reg_rtx (data_mode);
19248 rtx res1 = gen_reg_rtx (data_mode);
19249 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
19250 if (GET_CODE (sel) != CONST_VECTOR
19251 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
19252 {
19253 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
19254 2 * nunits - 1);
19255 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19256 NULL, 0, OPTAB_DIRECT);
19257 }
19258 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
19259 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
19260 NULL, 0, OPTAB_DIRECT);
19261 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
19262 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
19263 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
19264 else
19265 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
19266 }
19267
19268 /* Recognize patterns suitable for the TRN instructions. */
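/* For example, with V4SI operands the selector { 0, 4, 2, 6 } corresponds
to TRN1 and { 1, 5, 3, 7 } to TRN2, before any big-endian correction. */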
19269 static bool
19270 aarch64_evpc_trn (struct expand_vec_perm_d *d)
19271 {
19272 HOST_WIDE_INT odd;
19273 poly_uint64 nelt = d->perm.length ();
19274 rtx out, in0, in1, x;
19275 machine_mode vmode = d->vmode;
19276
19277 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19278 return false;
19279
19280 /* Note that these are little-endian tests.
19281 We correct for big-endian later. */
19282 if (!d->perm[0].is_constant (&odd)
19283 || (odd != 0 && odd != 1)
19284 || !d->perm.series_p (0, 2, odd, 2)
19285 || !d->perm.series_p (1, 2, nelt + odd, 2))
19286 return false;
19287
19288 /* Success! */
19289 if (d->testing_p)
19290 return true;
19291
19292 in0 = d->op0;
19293 in1 = d->op1;
19294 /* We don't need a big-endian lane correction for SVE; see the comment
19295 at the head of aarch64-sve.md for details. */
19296 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19297 {
19298 x = in0, in0 = in1, in1 = x;
19299 odd = !odd;
19300 }
19301 out = d->target;
19302
19303 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19304 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
19305 return true;
19306 }
19307
19308 /* Recognize patterns suitable for the UZP instructions. */
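/* For example, with V4SI operands the selector { 0, 2, 4, 6 } corresponds
to UZP1 and { 1, 3, 5, 7 } to UZP2, before any big-endian correction. */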
19309 static bool
19310 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
19311 {
19312 HOST_WIDE_INT odd;
19313 rtx out, in0, in1, x;
19314 machine_mode vmode = d->vmode;
19315
19316 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19317 return false;
19318
19319 /* Note that these are little-endian tests.
19320 We correct for big-endian later. */
19321 if (!d->perm[0].is_constant (&odd)
19322 || (odd != 0 && odd != 1)
19323 || !d->perm.series_p (0, 1, odd, 2))
19324 return false;
19325
19326 /* Success! */
19327 if (d->testing_p)
19328 return true;
19329
19330 in0 = d->op0;
19331 in1 = d->op1;
19332 /* We don't need a big-endian lane correction for SVE; see the comment
19333 at the head of aarch64-sve.md for details. */
19334 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19335 {
19336 x = in0, in0 = in1, in1 = x;
19337 odd = !odd;
19338 }
19339 out = d->target;
19340
19341 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19342 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
19343 return true;
19344 }
19345
19346 /* Recognize patterns suitable for the ZIP instructions. */
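/* For example, with V4SI operands the selector { 0, 4, 1, 5 } corresponds
to ZIP1 and { 2, 6, 3, 7 } to ZIP2, before any big-endian correction. */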
19347 static bool
19348 aarch64_evpc_zip (struct expand_vec_perm_d *d)
19349 {
19350 unsigned int high;
19351 poly_uint64 nelt = d->perm.length ();
19352 rtx out, in0, in1, x;
19353 machine_mode vmode = d->vmode;
19354
19355 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19356 return false;
19357
19358 /* Note that these are little-endian tests.
19359 We correct for big-endian later. */
19360 poly_uint64 first = d->perm[0];
19361 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
19362 || !d->perm.series_p (0, 2, first, 1)
19363 || !d->perm.series_p (1, 2, first + nelt, 1))
19364 return false;
19365 high = maybe_ne (first, 0U);
19366
19367 /* Success! */
19368 if (d->testing_p)
19369 return true;
19370
19371 in0 = d->op0;
19372 in1 = d->op1;
19373 /* We don't need a big-endian lane correction for SVE; see the comment
19374 at the head of aarch64-sve.md for details. */
19375 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19376 {
19377 x = in0, in0 = in1, in1 = x;
19378 high = !high;
19379 }
19380 out = d->target;
19381
19382 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19383 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
19384 return true;
19385 }
19386
19387 /* Recognize patterns for the EXT insn. */
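/* For example, with V4SI operands the selector { 1, 2, 3, 4 } takes the
last three elements of the first vector and the first element of the
second, which is a single EXT with an element offset of 1. */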
19388
19389 static bool
19390 aarch64_evpc_ext (struct expand_vec_perm_d *d)
19391 {
19392 HOST_WIDE_INT location;
19393 rtx offset;
19394
19395 /* The first element always refers to the first vector.
19396 Check if the extracted indices are increasing by one. */
19397 if (d->vec_flags == VEC_SVE_PRED
19398 || !d->perm[0].is_constant (&location)
19399 || !d->perm.series_p (0, 1, location, 1))
19400 return false;
19401
19402 /* Success! */
19403 if (d->testing_p)
19404 return true;
19405
19406 /* The case where (location == 0) is a no-op for both big- and little-endian,
19407 and is removed by the mid-end at optimization levels -O1 and higher.
19408
19409 We don't need a big-endian lane correction for SVE; see the comment
19410 at the head of aarch64-sve.md for details. */
19411 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
19412 {
19413 /* After setup, we want the high elements of the first vector (stored
19414 at the LSB end of the register), and the low elements of the second
19415 vector (stored at the MSB end of the register). So swap. */
19416 std::swap (d->op0, d->op1);
19417 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
19418 to_constant () is safe since this is restricted to Advanced SIMD
19419 vectors. */
19420 location = d->perm.length ().to_constant () - location;
19421 }
19422
19423 offset = GEN_INT (location);
19424 emit_set_insn (d->target,
19425 gen_rtx_UNSPEC (d->vmode,
19426 gen_rtvec (3, d->op0, d->op1, offset),
19427 UNSPEC_EXT));
19428 return true;
19429 }
19430
19431 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
19432 within each 64-bit, 32-bit or 16-bit granule. */
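/* For example, the V8HI selector { 3, 2, 1, 0, 7, 6, 5, 4 } reverses the
16-bit elements within each 64-bit granule and therefore maps to REV64. */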
19433
19434 static bool
19435 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
19436 {
19437 HOST_WIDE_INT diff;
19438 unsigned int i, size, unspec;
19439 machine_mode pred_mode;
19440
19441 if (d->vec_flags == VEC_SVE_PRED
19442 || !d->one_vector_p
19443 || !d->perm[0].is_constant (&diff))
19444 return false;
19445
19446 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
19447 if (size == 8)
19448 {
19449 unspec = UNSPEC_REV64;
19450 pred_mode = VNx2BImode;
19451 }
19452 else if (size == 4)
19453 {
19454 unspec = UNSPEC_REV32;
19455 pred_mode = VNx4BImode;
19456 }
19457 else if (size == 2)
19458 {
19459 unspec = UNSPEC_REV16;
19460 pred_mode = VNx8BImode;
19461 }
19462 else
19463 return false;
19464
19465 unsigned int step = diff + 1;
19466 for (i = 0; i < step; ++i)
19467 if (!d->perm.series_p (i, step, diff - i, step))
19468 return false;
19469
19470 /* Success! */
19471 if (d->testing_p)
19472 return true;
19473
19474 if (d->vec_flags == VEC_SVE_DATA)
19475 {
19476 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
19477 rtx target = gen_reg_rtx (int_mode);
19478 if (BYTES_BIG_ENDIAN)
19479 /* The act of taking a subreg between INT_MODE and d->vmode
19480 is itself a reversing operation on big-endian targets;
19481 see the comment at the head of aarch64-sve.md for details.
19482 First reinterpret OP0 as INT_MODE without using a subreg
19483 and without changing the contents. */
19484 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
19485 else
19486 {
19487 /* For SVE we use REV[BHW] unspecs derived from the element size
19488 of d->vmode and vector modes whose elements have SIZE bytes.
19489 This ensures that the vector modes match the predicate modes. */
19490 int unspec = aarch64_sve_rev_unspec (d->vmode);
19491 rtx pred = aarch64_ptrue_reg (pred_mode);
19492 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
19493 gen_lowpart (int_mode, d->op0)));
19494 }
19495 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19496 return true;
19497 }
19498 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
19499 emit_set_insn (d->target, src);
19500 return true;
19501 }
19502
19503 /* Recognize patterns for the REV insn, which reverses elements within
19504 a full vector. */
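/* Such a permutation is a single series that counts down from the last
element to element 0, e.g. { 7, 6, 5, 4, 3, 2, 1, 0 } for eight elements. */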
19505
19506 static bool
19507 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
19508 {
19509 poly_uint64 nelt = d->perm.length ();
19510
19511 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
19512 return false;
19513
19514 if (!d->perm.series_p (0, 1, nelt - 1, -1))
19515 return false;
19516
19517 /* Success! */
19518 if (d->testing_p)
19519 return true;
19520
19521 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
19522 emit_set_insn (d->target, src);
19523 return true;
19524 }
19525
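/* Recognize broadcast permutations, in which every element of the selector
refers to the same element of the first input, and implement them with
a DUP of that element. */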
19526 static bool
19527 aarch64_evpc_dup (struct expand_vec_perm_d *d)
19528 {
19529 rtx out = d->target;
19530 rtx in0;
19531 HOST_WIDE_INT elt;
19532 machine_mode vmode = d->vmode;
19533 rtx lane;
19534
19535 if (d->vec_flags == VEC_SVE_PRED
19536 || d->perm.encoding ().encoded_nelts () != 1
19537 || !d->perm[0].is_constant (&elt))
19538 return false;
19539
19540 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
19541 return false;
19542
19543 /* Success! */
19544 if (d->testing_p)
19545 return true;
19546
19547 /* The generic preparation in aarch64_expand_vec_perm_const_1
19548 swaps the operand order and the permute indices if it finds
19549 d->perm[0] to be in the second operand. Thus, we can always
19550 use d->op0 and need not do any extra arithmetic to get the
19551 correct lane number. */
19552 in0 = d->op0;
19553 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
19554
19555 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
19556 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
19557 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
19558 return true;
19559 }
19560
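/* Implement a general constant permutation of Advanced SIMD vectors by
loading the selector into a register and using TBL. */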
19561 static bool
19562 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
19563 {
19564 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
19565 machine_mode vmode = d->vmode;
19566
19567 /* Make sure that the indices are constant. */
19568 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
19569 for (unsigned int i = 0; i < encoded_nelts; ++i)
19570 if (!d->perm[i].is_constant ())
19571 return false;
19572
19573 if (d->testing_p)
19574 return true;
19575
19576 /* Generic code will try constant permutation twice: once with the
19577 original mode and again with the elements lowered to QImode.
19578 So wait and don't do the selector expansion ourselves. */
19579 if (vmode != V8QImode && vmode != V16QImode)
19580 return false;
19581
19582 /* to_constant is safe since this routine is specific to Advanced SIMD
19583 vectors. */
19584 unsigned int nelt = d->perm.length ().to_constant ();
19585 for (unsigned int i = 0; i < nelt; ++i)
19586 /* If big-endian and two vectors we end up with a weird mixed-endian
19587 mode on NEON. Reverse the index within each word but not the word
19588 itself. to_constant is safe because we checked is_constant above. */
19589 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
19590 ? d->perm[i].to_constant () ^ (nelt - 1)
19591 : d->perm[i].to_constant ());
19592
19593 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19594 sel = force_reg (vmode, sel);
19595
19596 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
19597 return true;
19598 }
19599
19600 /* Try to implement D using an SVE TBL instruction. */
19601
19602 static bool
19603 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
19604 {
19605 unsigned HOST_WIDE_INT nelt;
19606
19607 /* Permuting two variable-length vectors could overflow the
19608 index range. */
19609 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
19610 return false;
19611
19612 if (d->testing_p)
19613 return true;
19614
19615 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
19616 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
19617 if (d->one_vector_p)
19618 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
19619 else
19620 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
19621 return true;
19622 }
19623
19624 /* Try to implement D using the SVE SEL instruction. */
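/* Such permutations are elementwise blends: element I of the result comes
from either element I of the first input or element I of the second,
with the choice repeating with the period of the selector encoding. */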
19625
19626 static bool
19627 aarch64_evpc_sel (struct expand_vec_perm_d *d)
19628 {
19629 machine_mode vmode = d->vmode;
19630 int unit_size = GET_MODE_UNIT_SIZE (vmode);
19631
19632 if (d->vec_flags != VEC_SVE_DATA
19633 || unit_size > 8)
19634 return false;
19635
19636 int n_patterns = d->perm.encoding ().npatterns ();
19637 poly_int64 vec_len = d->perm.length ();
19638
19639 for (int i = 0; i < n_patterns; ++i)
19640 if (!known_eq (d->perm[i], i)
19641 && !known_eq (d->perm[i], vec_len + i))
19642 return false;
19643
19644 for (int i = n_patterns; i < n_patterns * 2; i++)
19645 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
19646 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
19647 return false;
19648
19649 if (d->testing_p)
19650 return true;
19651
19652 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
19653
19654 /* Build a predicate that is true when op0 elements should be used. */
19655 rtx_vector_builder builder (pred_mode, n_patterns, 2);
19656 for (int i = 0; i < n_patterns * 2; i++)
19657 {
19658 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
19659 : CONST0_RTX (BImode);
19660 builder.quick_push (elem);
19661 }
19662
19663 rtx const_vec = builder.build ();
19664 rtx pred = force_reg (pred_mode, const_vec);
19665 /* TARGET = PRED ? OP0 : OP1. */
19666 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
19667 return true;
19668 }
19669
19670 static bool
19671 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19672 {
19673 /* The pattern matching functions above are written to look for a small
19674 number to begin the sequence (0, 1, N/2). If we begin with an index
19675 from the second operand, we can swap the operands. */
19676 poly_int64 nelt = d->perm.length ();
19677 if (known_ge (d->perm[0], nelt))
19678 {
19679 d->perm.rotate_inputs (1);
19680 std::swap (d->op0, d->op1);
19681 }
19682
19683 if ((d->vec_flags == VEC_ADVSIMD
19684 || d->vec_flags == VEC_SVE_DATA
19685 || d->vec_flags == VEC_SVE_PRED)
19686 && known_gt (nelt, 1))
19687 {
19688 if (aarch64_evpc_rev_local (d))
19689 return true;
19690 else if (aarch64_evpc_rev_global (d))
19691 return true;
19692 else if (aarch64_evpc_ext (d))
19693 return true;
19694 else if (aarch64_evpc_dup (d))
19695 return true;
19696 else if (aarch64_evpc_zip (d))
19697 return true;
19698 else if (aarch64_evpc_uzp (d))
19699 return true;
19700 else if (aarch64_evpc_trn (d))
19701 return true;
19702 else if (aarch64_evpc_sel (d))
19703 return true;
19704 if (d->vec_flags == VEC_SVE_DATA)
19705 return aarch64_evpc_sve_tbl (d);
19706 else if (d->vec_flags == VEC_ADVSIMD)
19707 return aarch64_evpc_tbl (d);
19708 }
19709 return false;
19710 }
19711
19712 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19713
19714 static bool
19715 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19716 rtx op1, const vec_perm_indices &sel)
19717 {
19718 struct expand_vec_perm_d d;
19719
19720 /* Check whether the mask can be applied to a single vector. */
19721 if (sel.ninputs () == 1
19722 || (op0 && rtx_equal_p (op0, op1)))
19723 d.one_vector_p = true;
19724 else if (sel.all_from_input_p (0))
19725 {
19726 d.one_vector_p = true;
19727 op1 = op0;
19728 }
19729 else if (sel.all_from_input_p (1))
19730 {
19731 d.one_vector_p = true;
19732 op0 = op1;
19733 }
19734 else
19735 d.one_vector_p = false;
19736
19737 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
19738 sel.nelts_per_input ());
19739 d.vmode = vmode;
19740 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
19741 d.target = target;
19742 d.op0 = op0;
19743 d.op1 = op1;
19744 d.testing_p = !target;
19745
19746 if (!d.testing_p)
19747 return aarch64_expand_vec_perm_const_1 (&d);
19748
19749 rtx_insn *last = get_last_insn ();
19750 bool ret = aarch64_expand_vec_perm_const_1 (&d);
19751 gcc_assert (last == get_last_insn ());
19752
19753 return ret;
19754 }
19755
19756 /* Generate a byte permute mask for a register of mode MODE,
19757 which has NUNITS units. */
19758
19759 rtx
19760 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
19761 {
19762 /* We have to reverse each vector because we don't have
19763 a permuted load that can reverse-load according to ABI rules. */
19764 rtx mask;
19765 rtvec v = rtvec_alloc (16);
19766 unsigned int i, j;
19767 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
19768
19769 gcc_assert (BYTES_BIG_ENDIAN);
19770 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
19771
19772 for (i = 0; i < nunits; i++)
19773 for (j = 0; j < usize; j++)
19774 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
19775 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
19776 return force_reg (V16QImode, mask);
19777 }
19778
19779 /* Expand an SVE integer comparison using the SVE equivalent of:
19780
19781 (set TARGET (CODE OP0 OP1)). */
19782
19783 void
19784 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
19785 {
19786 machine_mode pred_mode = GET_MODE (target);
19787 machine_mode data_mode = GET_MODE (op0);
19788 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
19789 op0, op1);
19790 if (!rtx_equal_p (target, res))
19791 emit_move_insn (target, res);
19792 }
19793
19794 /* Return the UNSPEC_COND_* code for comparison CODE. */
19795
19796 static unsigned int
19797 aarch64_unspec_cond_code (rtx_code code)
19798 {
19799 switch (code)
19800 {
19801 case NE:
19802 return UNSPEC_COND_FCMNE;
19803 case EQ:
19804 return UNSPEC_COND_FCMEQ;
19805 case LT:
19806 return UNSPEC_COND_FCMLT;
19807 case GT:
19808 return UNSPEC_COND_FCMGT;
19809 case LE:
19810 return UNSPEC_COND_FCMLE;
19811 case GE:
19812 return UNSPEC_COND_FCMGE;
19813 case UNORDERED:
19814 return UNSPEC_COND_FCMUO;
19815 default:
19816 gcc_unreachable ();
19817 }
19818 }
19819
19820 /* Emit:
19821
19822 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19823
19824 where <X> is the operation associated with comparison CODE.
19825 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19826
19827 static void
19828 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
19829 bool known_ptrue_p, rtx op0, rtx op1)
19830 {
19831 rtx flag = gen_int_mode (known_ptrue_p, SImode);
19832 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
19833 gen_rtvec (4, pred, flag, op0, op1),
19834 aarch64_unspec_cond_code (code));
19835 emit_set_insn (target, unspec);
19836 }
19837
19838 /* Emit the SVE equivalent of:
19839
19840 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
19841 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
19842 (set TARGET (ior:PRED_MODE TMP1 TMP2))
19843
19844 where <Xi> is the operation associated with comparison CODEi.
19845 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19846
19847 static void
19848 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
19849 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
19850 {
19851 machine_mode pred_mode = GET_MODE (pred);
19852 rtx tmp1 = gen_reg_rtx (pred_mode);
19853 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
19854 rtx tmp2 = gen_reg_rtx (pred_mode);
19855 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
19856 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
19857 }
19858
19859 /* Emit the SVE equivalent of:
19860
19861 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19862 (set TARGET (not TMP))
19863
19864 where <X> is the operation associated with comparison CODE.
19865 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19866
19867 static void
19868 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
19869 bool known_ptrue_p, rtx op0, rtx op1)
19870 {
19871 machine_mode pred_mode = GET_MODE (pred);
19872 rtx tmp = gen_reg_rtx (pred_mode);
19873 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
19874 aarch64_emit_unop (target, one_cmpl_optab, tmp);
19875 }
19876
19877 /* Expand an SVE floating-point comparison using the SVE equivalent of:
19878
19879 (set TARGET (CODE OP0 OP1))
19880
19881 If CAN_INVERT_P is true, the caller can also handle inverted results;
19882 return true if the result is in fact inverted. */
19883
19884 bool
19885 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
19886 rtx op0, rtx op1, bool can_invert_p)
19887 {
19888 machine_mode pred_mode = GET_MODE (target);
19889 machine_mode data_mode = GET_MODE (op0);
19890
19891 rtx ptrue = aarch64_ptrue_reg (pred_mode);
19892 switch (code)
19893 {
19894 case UNORDERED:
19895 /* UNORDERED has no immediate form. */
19896 op1 = force_reg (data_mode, op1);
19897 /* fall through */
19898 case LT:
19899 case LE:
19900 case GT:
19901 case GE:
19902 case EQ:
19903 case NE:
19904 {
19905 /* There is native support for the comparison. */
19906 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19907 return false;
19908 }
19909
19910 case LTGT:
19911 /* This is a trapping operation (LT or GT). */
19912 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
19913 return false;
19914
19915 case UNEQ:
19916 if (!flag_trapping_math)
19917 {
19918 /* This would trap for signaling NaNs. */
19919 op1 = force_reg (data_mode, op1);
19920 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
19921 ptrue, true, op0, op1);
19922 return false;
19923 }
19924 /* fall through */
19925 case UNLT:
19926 case UNLE:
19927 case UNGT:
19928 case UNGE:
19929 if (flag_trapping_math)
19930 {
19931 /* Work out which elements are ordered. */
19932 rtx ordered = gen_reg_rtx (pred_mode);
19933 op1 = force_reg (data_mode, op1);
19934 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
19935 ptrue, true, op0, op1);
19936
19937 /* Test the opposite condition for the ordered elements,
19938 then invert the result. */
19939 if (code == UNEQ)
19940 code = NE;
19941 else
19942 code = reverse_condition_maybe_unordered (code);
19943 if (can_invert_p)
19944 {
19945 aarch64_emit_sve_fp_cond (target, code,
19946 ordered, false, op0, op1);
19947 return true;
19948 }
19949 aarch64_emit_sve_invert_fp_cond (target, code,
19950 ordered, false, op0, op1);
19951 return false;
19952 }
19953 break;
19954
19955 case ORDERED:
19956 /* ORDERED has no immediate form. */
19957 op1 = force_reg (data_mode, op1);
19958 break;
19959
19960 default:
19961 gcc_unreachable ();
19962 }
19963
19964 /* There is native support for the inverse comparison. */
19965 code = reverse_condition_maybe_unordered (code);
19966 if (can_invert_p)
19967 {
19968 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19969 return true;
19970 }
19971 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
19972 return false;
19973 }
19974
19975 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
19976 of the data being selected and CMP_MODE is the mode of the values being
19977 compared. */
19978
19979 void
19980 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
19981 rtx *ops)
19982 {
19983 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
19984 rtx pred = gen_reg_rtx (pred_mode);
19985 if (FLOAT_MODE_P (cmp_mode))
19986 {
19987 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
19988 ops[4], ops[5], true))
19989 std::swap (ops[1], ops[2]);
19990 }
19991 else
19992 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
19993
19994 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
19995 ops[1] = force_reg (data_mode, ops[1]);
19996 /* The "false" value can only be zero if the "true" value is a constant. */
19997 if (register_operand (ops[1], data_mode)
19998 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
19999 ops[2] = force_reg (data_mode, ops[2]);
20000
20001 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
20002 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
20003 }
20004
20005 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
20006 true. However, due to issues with register allocation it is preferable
20007 to avoid tying integer scalar and FP scalar modes. Executing integer
20008 operations in general registers is better than treating them as scalar
20009 vector operations. This reduces latency and avoids redundant int<->FP
20010 moves. So tie modes if they are either the same class, or vector modes
20011 with other vector modes, vector structs or any scalar mode. */
20012
20013 static bool
20014 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
20015 {
20016 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
20017 return true;
20018
20019 /* We specifically want to allow elements of "structure" modes to
20020 be tieable to the structure. This more general condition allows
20021 other rarer situations too. The reason we don't extend this to
20022 predicate modes is that there are no predicate structure modes
20023 nor any specific instructions for extracting part of a predicate
20024 register. */
20025 if (aarch64_vector_data_mode_p (mode1)
20026 && aarch64_vector_data_mode_p (mode2))
20027 return true;
20028
20029 /* Also allow any scalar modes with vectors. */
20030 if (aarch64_vector_mode_supported_p (mode1)
20031 || aarch64_vector_mode_supported_p (mode2))
20032 return true;
20033
20034 return false;
20035 }
20036
20037 /* Return a new RTX holding the result of moving POINTER forward by
20038 AMOUNT bytes. */
20039
20040 static rtx
20041 aarch64_move_pointer (rtx pointer, poly_int64 amount)
20042 {
20043 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
20044
20045 return adjust_automodify_address (pointer, GET_MODE (pointer),
20046 next, amount);
20047 }
20048
20049 /* Return a new RTX holding the result of moving POINTER forward by the
20050 size of the mode it points to. */
20051
20052 static rtx
20053 aarch64_progress_pointer (rtx pointer)
20054 {
20055 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
20056 }
20057
20058 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
20059 MODE bytes. */
20060
20061 static void
20062 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
20063 machine_mode mode)
20064 {
20065 rtx reg = gen_reg_rtx (mode);
20066
20067 /* "Cast" the pointers to the correct mode. */
20068 *src = adjust_address (*src, mode, 0);
20069 *dst = adjust_address (*dst, mode, 0);
20070 /* Emit the memcpy. */
20071 emit_move_insn (reg, *src);
20072 emit_move_insn (*dst, reg);
20073 /* Move the pointers forward. */
20074 *src = aarch64_progress_pointer (*src);
20075 *dst = aarch64_progress_pointer (*dst);
20076 }
20077
20078 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
20079 we succeed, otherwise return false. */
20080
20081 bool
20082 aarch64_expand_cpymem (rtx *operands)
20083 {
20084 int n, mode_bits;
20085 rtx dst = operands[0];
20086 rtx src = operands[1];
20087 rtx base;
20088 machine_mode cur_mode = BLKmode, next_mode;
20089 bool speed_p = !optimize_function_for_size_p (cfun);
20090
20091 /* When optimizing for size, give a better estimate of the length of a
20092 memcpy call, but use the default otherwise. Moves larger than 8 bytes
20093 will always require an even number of instructions to do now, and each
20094 operation requires both a load and a store, so divide the max number by 2. */
20095 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
20096
20097 /* We can't do anything smart if the amount to copy is not constant. */
20098 if (!CONST_INT_P (operands[2]))
20099 return false;
20100
20101 n = INTVAL (operands[2]);
20102
20103 /* Try to keep the number of instructions low. For all cases we will do at
20104 most two moves for the residual amount, since we'll always overlap the
20105 remainder. */
20106 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
20107 return false;
20108
20109 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
20110 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
20111
20112 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
20113 src = adjust_automodify_address (src, VOIDmode, base, 0);
20114
20115 /* Convert n to bits to make the rest of the code simpler. */
20116 n = n * BITS_PER_UNIT;
20117
20118 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
20119 larger than TImode, but we should not use them for loads/stores here. */
20120 const int copy_limit = GET_MODE_BITSIZE (TImode);
20121
20122 while (n > 0)
20123 {
20124 /* Find the largest mode in which to do the copy without over-reading
20125 or over-writing. */
20126 opt_scalar_int_mode mode_iter;
20127 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
20128 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
20129 cur_mode = mode_iter.require ();
20130
20131 gcc_assert (cur_mode != BLKmode);
20132
20133 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
20134 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
20135
20136 n -= mode_bits;
20137
20138 /* Do certain trailing copies as overlapping if it's going to be
20139 cheaper, i.e. fewer instructions to do so. For instance, for a 15-byte
20140 copy it's more efficient to do two overlapping 8-byte copies than
20141 8 + 6 + 1. */
20142 if (n > 0 && n <= 8 * BITS_PER_UNIT)
20143 {
20144 next_mode = smallest_mode_for_size (n, MODE_INT);
20145 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
20146 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
20147 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
20148 n = n_bits;
20149 }
20150 }
20151
20152 return true;
20153 }
20154
20155 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
20156 SImode stores. Handle the case when the constant has identical
20157 bottom and top halves. This is beneficial when the two stores can be
20158 merged into an STP and we avoid synthesising potentially expensive
20159 immediates twice. Return true if such a split is possible. */
20160
20161 bool
20162 aarch64_split_dimode_const_store (rtx dst, rtx src)
20163 {
20164 rtx lo = gen_lowpart (SImode, src);
20165 rtx hi = gen_highpart_mode (SImode, DImode, src);
20166
20167 bool size_p = optimize_function_for_size_p (cfun);
20168
20169 if (!rtx_equal_p (lo, hi))
20170 return false;
20171
20172 unsigned int orig_cost
20173 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
20174 unsigned int lo_cost
20175 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
20176
20177 /* We want to transform:
20178 MOV x1, 49370
20179 MOVK x1, 0x140, lsl 16
20180 MOVK x1, 0xc0da, lsl 32
20181 MOVK x1, 0x140, lsl 48
20182 STR x1, [x0]
20183 into:
20184 MOV w1, 49370
20185 MOVK w1, 0x140, lsl 16
20186 STP w1, w1, [x0]
20187 So we want to perform this only when we save two instructions
20188 or more. When optimizing for size, however, accept any code size
20189 savings we can. */
20190 if (size_p && orig_cost <= lo_cost)
20191 return false;
20192
20193 if (!size_p
20194 && (orig_cost <= lo_cost + 1))
20195 return false;
20196
20197 rtx mem_lo = adjust_address (dst, SImode, 0);
20198 if (!aarch64_mem_pair_operand (mem_lo, SImode))
20199 return false;
20200
20201 rtx tmp_reg = gen_reg_rtx (SImode);
20202 aarch64_expand_mov_immediate (tmp_reg, lo);
20203 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
20204 /* Don't emit an explicit store pair as this may not always be profitable.
20205 Let the sched-fusion logic decide whether to merge them. */
20206 emit_move_insn (mem_lo, tmp_reg);
20207 emit_move_insn (mem_hi, tmp_reg);
20208
20209 return true;
20210 }
20211
20212 /* Generate RTL for a conditional branch with rtx comparison CODE in
20213 mode CC_MODE. The destination of the unlikely conditional branch
20214 is LABEL_REF. */
20215
20216 void
20217 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
20218 rtx label_ref)
20219 {
20220 rtx x;
20221 x = gen_rtx_fmt_ee (code, VOIDmode,
20222 gen_rtx_REG (cc_mode, CC_REGNUM),
20223 const0_rtx);
20224
20225 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
20226 gen_rtx_LABEL_REF (VOIDmode, label_ref),
20227 pc_rtx);
20228 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
20229 }
20230
20231 /* Generate DImode scratch registers for 128-bit (TImode) addition.
20232
20233 OP1 represents the TImode destination operand 1
20234 OP2 represents the TImode destination operand 2
20235 LOW_DEST represents the low half (DImode) of TImode operand 0
20236 LOW_IN1 represents the low half (DImode) of TImode operand 1
20237 LOW_IN2 represents the low half (DImode) of TImode operand 2
20238 HIGH_DEST represents the high half (DImode) of TImode operand 0
20239 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20240 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20241
20242 void
20243 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20244 rtx *low_in1, rtx *low_in2,
20245 rtx *high_dest, rtx *high_in1,
20246 rtx *high_in2)
20247 {
20248 *low_dest = gen_reg_rtx (DImode);
20249 *low_in1 = gen_lowpart (DImode, op1);
20250 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20251 subreg_lowpart_offset (DImode, TImode));
20252 *high_dest = gen_reg_rtx (DImode);
20253 *high_in1 = gen_highpart (DImode, op1);
20254 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20255 subreg_highpart_offset (DImode, TImode));
20256 }
20257
20258 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
20259
20260 This function differs from 'aarch64_addti_scratch_regs' in that
20261 OP1 can be an immediate constant (zero). We must call
20262 subreg_highpart_offset with DImode and TImode arguments, otherwise
20263 VOIDmode will be used for the const_int which generates an internal
20264 error from subreg_size_highpart_offset which does not expect a size of zero.
20265
20266 OP1 represents the TImode destination operand 1
20267 OP2 represents the TImode destination operand 2
20268 LOW_DEST represents the low half (DImode) of TImode operand 0
20269 LOW_IN1 represents the low half (DImode) of TImode operand 1
20270 LOW_IN2 represents the low half (DImode) of TImode operand 2
20271 HIGH_DEST represents the high half (DImode) of TImode operand 0
20272 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20273 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20274
20275
20276 void
20277 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20278 rtx *low_in1, rtx *low_in2,
20279 rtx *high_dest, rtx *high_in1,
20280 rtx *high_in2)
20281 {
20282 *low_dest = gen_reg_rtx (DImode);
20283 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
20284 subreg_lowpart_offset (DImode, TImode));
20285
20286 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20287 subreg_lowpart_offset (DImode, TImode));
20288 *high_dest = gen_reg_rtx (DImode);
20289
20290 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
20291 subreg_highpart_offset (DImode, TImode));
20292 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20293 subreg_highpart_offset (DImode, TImode));
20294 }
20295
20296 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
20297
20298 OP0 represents the TImode destination operand 0
20299 LOW_DEST represents the low half (DImode) of TImode operand 0
20300 LOW_IN1 represents the low half (DImode) of TImode operand 1
20301 LOW_IN2 represents the low half (DImode) of TImode operand 2
20302 HIGH_DEST represents the high half (DImode) of TImode operand 0
20303 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20304 HIGH_IN2 represents the high half (DImode) of TImode operand 2
20305 UNSIGNED_P is true if the operation is being performed on unsigned
20306 values. */
20307 void
20308 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
20309 rtx low_in2, rtx high_dest, rtx high_in1,
20310 rtx high_in2, bool unsigned_p)
20311 {
20312 if (low_in2 == const0_rtx)
20313 {
20314 low_dest = low_in1;
20315 high_in2 = force_reg (DImode, high_in2);
20316 if (unsigned_p)
20317 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
20318 else
20319 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
20320 }
20321 else
20322 {
20323 if (aarch64_plus_immediate (low_in2, DImode))
20324 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
20325 GEN_INT (-INTVAL (low_in2))));
20326 else
20327 {
20328 low_in2 = force_reg (DImode, low_in2);
20329 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
20330 }
20331 high_in2 = force_reg (DImode, high_in2);
20332
20333 if (unsigned_p)
20334 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
20335 else
20336 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
20337 }
20338
20339 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
20340 emit_move_insn (gen_highpart (DImode, op0), high_dest);
20341
20342 }
20343
20344 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
20345
20346 static unsigned HOST_WIDE_INT
20347 aarch64_asan_shadow_offset (void)
20348 {
20349 if (TARGET_ILP32)
20350 return (HOST_WIDE_INT_1 << 29);
20351 else
20352 return (HOST_WIDE_INT_1 << 36);
20353 }
20354
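/* Implement TARGET_GEN_CCMP_FIRST. Expand the first comparison of a
conditional-compare sequence, comparing TREEOP0 with TREEOP1 using CODE.
Store the preparation insns in *PREP_SEQ and the comparison itself in
*GEN_SEQ, and return the comparison rtx, or NULL_RTX if the comparison
cannot be handled. */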
20355 static rtx
20356 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
20357 int code, tree treeop0, tree treeop1)
20358 {
20359 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20360 rtx op0, op1;
20361 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20362 insn_code icode;
20363 struct expand_operand ops[4];
20364
20365 start_sequence ();
20366 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20367
20368 op_mode = GET_MODE (op0);
20369 if (op_mode == VOIDmode)
20370 op_mode = GET_MODE (op1);
20371
20372 switch (op_mode)
20373 {
20374 case E_QImode:
20375 case E_HImode:
20376 case E_SImode:
20377 cmp_mode = SImode;
20378 icode = CODE_FOR_cmpsi;
20379 break;
20380
20381 case E_DImode:
20382 cmp_mode = DImode;
20383 icode = CODE_FOR_cmpdi;
20384 break;
20385
20386 case E_SFmode:
20387 cmp_mode = SFmode;
20388 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20389 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
20390 break;
20391
20392 case E_DFmode:
20393 cmp_mode = DFmode;
20394 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20395 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
20396 break;
20397
20398 default:
20399 end_sequence ();
20400 return NULL_RTX;
20401 }
20402
20403 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
20404 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
20405 if (!op0 || !op1)
20406 {
20407 end_sequence ();
20408 return NULL_RTX;
20409 }
20410 *prep_seq = get_insns ();
20411 end_sequence ();
20412
20413 create_fixed_operand (&ops[0], op0);
20414 create_fixed_operand (&ops[1], op1);
20415
20416 start_sequence ();
20417 if (!maybe_expand_insn (icode, 2, ops))
20418 {
20419 end_sequence ();
20420 return NULL_RTX;
20421 }
20422 *gen_seq = get_insns ();
20423 end_sequence ();
20424
20425 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
20426 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
20427 }
20428
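/* Implement TARGET_GEN_CCMP_NEXT. Expand a conditional compare that
combines a previous comparison PREV with the comparison of TREEOP0 and
TREEOP1 using CMP_CODE; BIT_CODE says whether the two comparisons are
combined with AND or IOR. Add the new insns to *PREP_SEQ and *GEN_SEQ
and return the new comparison rtx, or NULL_RTX on failure. */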
20429 static rtx
20430 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
20431 int cmp_code, tree treeop0, tree treeop1, int bit_code)
20432 {
20433 rtx op0, op1, target;
20434 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20435 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20436 insn_code icode;
20437 struct expand_operand ops[6];
20438 int aarch64_cond;
20439
20440 push_to_sequence (*prep_seq);
20441 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20442
20443 op_mode = GET_MODE (op0);
20444 if (op_mode == VOIDmode)
20445 op_mode = GET_MODE (op1);
20446
20447 switch (op_mode)
20448 {
20449 case E_QImode:
20450 case E_HImode:
20451 case E_SImode:
20452 cmp_mode = SImode;
20453 break;
20454
20455 case E_DImode:
20456 cmp_mode = DImode;
20457 break;
20458
20459 case E_SFmode:
20460 cmp_mode = SFmode;
20461 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20462 break;
20463
20464 case E_DFmode:
20465 cmp_mode = DFmode;
20466 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20467 break;
20468
20469 default:
20470 end_sequence ();
20471 return NULL_RTX;
20472 }
20473
20474 icode = code_for_ccmp (cc_mode, cmp_mode);
20475
20476 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
20477 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
20478 if (!op0 || !op1)
20479 {
20480 end_sequence ();
20481 return NULL_RTX;
20482 }
20483 *prep_seq = get_insns ();
20484 end_sequence ();
20485
20486 target = gen_rtx_REG (cc_mode, CC_REGNUM);
20487 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
20488
20489 if (bit_code != AND)
20490 {
20491 /* Treat the ccmp patterns as canonical and use them where possible,
20492 but fall back to ccmp_rev patterns if there's no other option. */
20493 rtx_code prev_code = GET_CODE (prev);
20494 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
20495 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
20496 && !(prev_code == EQ
20497 || prev_code == NE
20498 || prev_code == ORDERED
20499 || prev_code == UNORDERED))
20500 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
20501 else
20502 {
20503 rtx_code code = reverse_condition (prev_code);
20504 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
20505 }
20506 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
20507 }
20508
20509 create_fixed_operand (&ops[0], XEXP (prev, 0));
20510 create_fixed_operand (&ops[1], target);
20511 create_fixed_operand (&ops[2], op0);
20512 create_fixed_operand (&ops[3], op1);
20513 create_fixed_operand (&ops[4], prev);
20514 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
20515
20516 push_to_sequence (*gen_seq);
20517 if (!maybe_expand_insn (icode, 6, ops))
20518 {
20519 end_sequence ();
20520 return NULL_RTX;
20521 }
20522
20523 *gen_seq = get_insns ();
20524 end_sequence ();
20525
20526 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
20527 }
20528
20529 #undef TARGET_GEN_CCMP_FIRST
20530 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
20531
20532 #undef TARGET_GEN_CCMP_NEXT
20533 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
20534
20535 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
20536 instruction fusion of some sort. */
20537
20538 static bool
20539 aarch64_macro_fusion_p (void)
20540 {
20541 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
20542 }
20543
20544
20545 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
20546 should be kept together during scheduling. */
20547
20548 static bool
20549 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
20550 {
20551 rtx set_dest;
20552 rtx prev_set = single_set (prev);
20553 rtx curr_set = single_set (curr);
20554 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
20555 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
20556
20557 if (!aarch64_macro_fusion_p ())
20558 return false;
20559
20560 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
20561 {
20562 /* We are trying to match:
20563 prev (mov) == (set (reg r0) (const_int imm16))
20564 curr (movk) == (set (zero_extract (reg r0)
20565 (const_int 16)
20566 (const_int 16))
20567 (const_int imm16_1)) */
20568
20569 set_dest = SET_DEST (curr_set);
20570
20571 if (GET_CODE (set_dest) == ZERO_EXTRACT
20572 && CONST_INT_P (SET_SRC (curr_set))
20573 && CONST_INT_P (SET_SRC (prev_set))
20574 && CONST_INT_P (XEXP (set_dest, 2))
20575 && INTVAL (XEXP (set_dest, 2)) == 16
20576 && REG_P (XEXP (set_dest, 0))
20577 && REG_P (SET_DEST (prev_set))
20578 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
20579 {
20580 return true;
20581 }
20582 }
20583
20584 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
20585 {
20586
20587 /* We're trying to match:
20588 prev (adrp) == (set (reg r1)
20589 (high (symbol_ref ("SYM"))))
20590 curr (add) == (set (reg r0)
20591 (lo_sum (reg r1)
20592 (symbol_ref ("SYM"))))
20593 Note that r0 need not necessarily be the same as r1, especially
20594 during pre-regalloc scheduling. */
20595
20596 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20597 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20598 {
20599 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
20600 && REG_P (XEXP (SET_SRC (curr_set), 0))
20601 && REGNO (XEXP (SET_SRC (curr_set), 0))
20602 == REGNO (SET_DEST (prev_set))
20603 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
20604 XEXP (SET_SRC (curr_set), 1)))
20605 return true;
20606 }
20607 }
20608
20609 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
20610 {
20611
20612 /* We're trying to match:
20613 prev (movk) == (set (zero_extract (reg r0)
20614 (const_int 16)
20615 (const_int 32))
20616 (const_int imm16_1))
20617 curr (movk) == (set (zero_extract (reg r0)
20618 (const_int 16)
20619 (const_int 48))
20620 (const_int imm16_2)) */
20621
20622 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
20623 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
20624 && REG_P (XEXP (SET_DEST (prev_set), 0))
20625 && REG_P (XEXP (SET_DEST (curr_set), 0))
20626 && REGNO (XEXP (SET_DEST (prev_set), 0))
20627 == REGNO (XEXP (SET_DEST (curr_set), 0))
20628 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
20629 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
20630 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
20631 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
20632 && CONST_INT_P (SET_SRC (prev_set))
20633 && CONST_INT_P (SET_SRC (curr_set)))
20634 return true;
20635
20636 }
20637 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
20638 {
20639 /* We're trying to match:
20640 prev (adrp) == (set (reg r0)
20641 (high (symbol_ref ("SYM"))))
20642 curr (ldr) == (set (reg r1)
20643 (mem (lo_sum (reg r0)
20644 (symbol_ref ("SYM")))))
20645 or
20646 curr (ldr) == (set (reg r1)
20647 (zero_extend (mem
20648 (lo_sum (reg r0)
20649 (symbol_ref ("SYM")))))) */
20650 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20651 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20652 {
20653 rtx curr_src = SET_SRC (curr_set);
20654
20655 if (GET_CODE (curr_src) == ZERO_EXTEND)
20656 curr_src = XEXP (curr_src, 0);
20657
20658 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
20659 && REG_P (XEXP (XEXP (curr_src, 0), 0))
20660 && REGNO (XEXP (XEXP (curr_src, 0), 0))
20661 == REGNO (SET_DEST (prev_set))
20662 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
20663 XEXP (SET_SRC (prev_set), 0)))
20664 return true;
20665 }
20666 }
20667
20668 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
20669 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
20670 && prev_set && curr_set && any_condjump_p (curr)
20671 && GET_CODE (SET_SRC (prev_set)) == COMPARE
20672 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
20673 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
20674 return true;
20675
20676 /* Fuse flag-setting ALU instructions and conditional branch. */
20677 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
20678 && any_condjump_p (curr))
20679 {
20680 unsigned int condreg1, condreg2;
20681 rtx cc_reg_1;
20682 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
20683 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
20684
20685 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
20686 && prev
20687 && modified_in_p (cc_reg_1, prev))
20688 {
20689 enum attr_type prev_type = get_attr_type (prev);
20690
20691 /* FIXME: this misses some instructions that ThunderX considers simple
20692 arithmetic instructions. Simple shifts are missed here. */
20693 if (prev_type == TYPE_ALUS_SREG
20694 || prev_type == TYPE_ALUS_IMM
20695 || prev_type == TYPE_LOGICS_REG
20696 || prev_type == TYPE_LOGICS_IMM)
20697 return true;
20698 }
20699 }
20700
20701 /* Fuse ALU instructions and CBZ/CBNZ. */
20702 if (prev_set
20703 && curr_set
20704 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
20705 && any_condjump_p (curr))
20706 {
20707 /* We're trying to match:
20708 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
20709 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
20710 (const_int 0))
20711 (label_ref ("SYM"))
20712 (pc)) */
20713 if (SET_DEST (curr_set) == (pc_rtx)
20714 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
20715 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
20716 && REG_P (SET_DEST (prev_set))
20717 && REGNO (SET_DEST (prev_set))
20718 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
20719 {
20720 /* Fuse ALU operations followed by conditional branch instruction. */
20721 switch (get_attr_type (prev))
20722 {
20723 case TYPE_ALU_IMM:
20724 case TYPE_ALU_SREG:
20725 case TYPE_ADC_REG:
20726 case TYPE_ADC_IMM:
20727 case TYPE_ADCS_REG:
20728 case TYPE_ADCS_IMM:
20729 case TYPE_LOGIC_REG:
20730 case TYPE_LOGIC_IMM:
20731 case TYPE_CSEL:
20732 case TYPE_ADR:
20733 case TYPE_MOV_IMM:
20734 case TYPE_SHIFT_REG:
20735 case TYPE_SHIFT_IMM:
20736 case TYPE_BFM:
20737 case TYPE_RBIT:
20738 case TYPE_REV:
20739 case TYPE_EXTEND:
20740 return true;
20741
20742 default:;
20743 }
20744 }
20745 }
20746
20747 return false;
20748 }
20749
20750 /* Return true iff the instruction fusion described by OP is enabled. */
20751
20752 bool
20753 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
20754 {
20755 return (aarch64_tune_params.fusible_ops & op) != 0;
20756 }
20757
20758 /* If the address of MEM is in the form of [base+offset], extract the two
20759 parts into BASE and OFFSET and return true; otherwise return false
20760 after clearing BASE and OFFSET. */
20761
20762 bool
20763 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
20764 {
20765 rtx addr;
20766
20767 gcc_assert (MEM_P (mem));
20768
20769 addr = XEXP (mem, 0);
20770
20771 if (REG_P (addr))
20772 {
20773 *base = addr;
20774 *offset = const0_rtx;
20775 return true;
20776 }
20777
20778 if (GET_CODE (addr) == PLUS
20779 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
20780 {
20781 *base = XEXP (addr, 0);
20782 *offset = XEXP (addr, 1);
20783 return true;
20784 }
20785
20786 *base = NULL_RTX;
20787 *offset = NULL_RTX;
20788
20789 return false;
20790 }
20791
20792 /* Types for scheduling fusion. */
20793 enum sched_fusion_type
20794 {
20795 SCHED_FUSION_NONE = 0,
20796 SCHED_FUSION_LD_SIGN_EXTEND,
20797 SCHED_FUSION_LD_ZERO_EXTEND,
20798 SCHED_FUSION_LD,
20799 SCHED_FUSION_ST,
20800 SCHED_FUSION_NUM
20801 };
20802
20803 /* If INSN is a load or store whose address is in the form of [base+offset],
20804 extract the two parts into BASE and OFFSET. Return the scheduling
20805 fusion type of this INSN. */
20806
20807 static enum sched_fusion_type
20808 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
20809 {
20810 rtx x, dest, src;
20811 enum sched_fusion_type fusion = SCHED_FUSION_LD;
20812
20813 gcc_assert (INSN_P (insn));
20814 x = PATTERN (insn);
20815 if (GET_CODE (x) != SET)
20816 return SCHED_FUSION_NONE;
20817
20818 src = SET_SRC (x);
20819 dest = SET_DEST (x);
20820
20821 machine_mode dest_mode = GET_MODE (dest);
20822
20823 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
20824 return SCHED_FUSION_NONE;
20825
20826 if (GET_CODE (src) == SIGN_EXTEND)
20827 {
20828 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
20829 src = XEXP (src, 0);
20830 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20831 return SCHED_FUSION_NONE;
20832 }
20833 else if (GET_CODE (src) == ZERO_EXTEND)
20834 {
20835 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
20836 src = XEXP (src, 0);
20837 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20838 return SCHED_FUSION_NONE;
20839 }
20840
20841 if (GET_CODE (src) == MEM && REG_P (dest))
20842 extract_base_offset_in_addr (src, base, offset);
20843 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
20844 {
20845 fusion = SCHED_FUSION_ST;
20846 extract_base_offset_in_addr (dest, base, offset);
20847 }
20848 else
20849 return SCHED_FUSION_NONE;
20850
20851 if (*base == NULL_RTX || *offset == NULL_RTX)
20852 fusion = SCHED_FUSION_NONE;
20853
20854 return fusion;
20855 }
20856
20857 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
20858
20859 Currently we only support fusing ldr or str instructions, so FUSION_PRI
20860 and PRI are only calculated for these instructions.  For other instructions,
20861 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
20862 types of instruction fusion can be added by returning different priorities.
20863
20864 It's important that irrelevant instructions get the largest FUSION_PRI. */
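/* As an illustrative sketch: two loads from the same base register get
the same FUSION_PRI (it depends only on the fusion type and the base
register number), while their PRI values differ according to the
immediate offsets, so the access with the smaller offset is scheduled
first.  */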
20865
20866 static void
20867 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
20868 int *fusion_pri, int *pri)
20869 {
20870 int tmp, off_val;
20871 rtx base, offset;
20872 enum sched_fusion_type fusion;
20873
20874 gcc_assert (INSN_P (insn));
20875
20876 tmp = max_pri - 1;
20877 fusion = fusion_load_store (insn, &base, &offset);
20878 if (fusion == SCHED_FUSION_NONE)
20879 {
20880 *pri = tmp;
20881 *fusion_pri = tmp;
20882 return;
20883 }
20884
20885 /* Set FUSION_PRI according to fusion type and base register. */
20886 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
20887
20888 /* Calculate PRI. */
20889 tmp /= 2;
20890
20891 /* INSN with smaller offset goes first. */
20892 off_val = (int)(INTVAL (offset));
20893 if (off_val >= 0)
20894 tmp -= (off_val & 0xfffff);
20895 else
20896 tmp += ((- off_val) & 0xfffff);
20897
20898 *pri = tmp;
20899 return;
20900 }
20901
20902 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
20903 Adjust priority of sha1h instructions so they are scheduled before
20904 other SHA1 instructions. */
20905
20906 static int
20907 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
20908 {
20909 rtx x = PATTERN (insn);
20910
20911 if (GET_CODE (x) == SET)
20912 {
20913 x = SET_SRC (x);
20914
20915 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
20916 return priority + 10;
20917 }
20918
20919 return priority;
20920 }
20921
20922 /* Given OPERANDS of consecutive load/store, check if we can merge
20923 them into ldp/stp. LOAD is true if they are load instructions.
20924 MODE is the mode of memory operands. */
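/* For instance (illustrative operands):

ldr x0, [x2]
ldr x1, [x2, 8]

pass these checks and can be merged into:

ldp x0, x1, [x2]  */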
20925
20926 bool
20927 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
20928 machine_mode mode)
20929 {
20930 HOST_WIDE_INT offval_1, offval_2, msize;
20931 enum reg_class rclass_1, rclass_2;
20932 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
20933
20934 if (load)
20935 {
20936 mem_1 = operands[1];
20937 mem_2 = operands[3];
20938 reg_1 = operands[0];
20939 reg_2 = operands[2];
20940 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
20941 if (REGNO (reg_1) == REGNO (reg_2))
20942 return false;
20943 }
20944 else
20945 {
20946 mem_1 = operands[0];
20947 mem_2 = operands[2];
20948 reg_1 = operands[1];
20949 reg_2 = operands[3];
20950 }
20951
20952 /* The mems cannot be volatile. */
20953 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
20954 return false;
20955
20956 /* If we have SImode and slow unaligned ldp,
20957 check that the alignment is at least 8 bytes.  */
20958 if (mode == SImode
20959 && (aarch64_tune_params.extra_tuning_flags
20960 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
20961 && !optimize_size
20962 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
20963 return false;
20964
20965 /* Check if the addresses are in the form of [base+offset]. */
20966 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
20967 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
20968 return false;
20969 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
20970 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
20971 return false;
20972
20973 /* Check if the bases are the same.  */
20974 if (!rtx_equal_p (base_1, base_2))
20975 return false;
20976
20977 /* The operands must be of the same size. */
20978 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
20979 GET_MODE_SIZE (GET_MODE (mem_2))));
20980
20981 offval_1 = INTVAL (offset_1);
20982 offval_2 = INTVAL (offset_2);
20983 /* We should only be trying this for fixed-sized modes. There is no
20984 SVE LDP/STP instruction. */
20985 msize = GET_MODE_SIZE (mode).to_constant ();
20986 /* Check if the offsets are consecutive. */
20987 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
20988 return false;
20989
20990 /* Check if the addresses are clobbered by load. */
20991 if (load)
20992 {
20993 if (reg_mentioned_p (reg_1, mem_1))
20994 return false;
20995
20996 /* In increasing order, the last load can clobber the address. */
20997 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
20998 return false;
20999 }
21000
21001 /* One of the memory accesses must be a mempair operand.
21002 If it is not the first one, they need to be swapped by the
21003 peephole. */
21004 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
21005 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
21006 return false;
21007
21008 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
21009 rclass_1 = FP_REGS;
21010 else
21011 rclass_1 = GENERAL_REGS;
21012
21013 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
21014 rclass_2 = FP_REGS;
21015 else
21016 rclass_2 = GENERAL_REGS;
21017
21018 /* Check if the registers are of the same class.  */
21019 if (rclass_1 != rclass_2)
21020 return false;
21021
21022 return true;
21023 }
21024
21025 /* Given OPERANDS of consecutive load/store that can be merged,
21026 swap them if they are not in ascending order. */
21027 void
21028 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
21029 {
21030 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
21031 HOST_WIDE_INT offval_1, offval_2;
21032
21033 if (load)
21034 {
21035 mem_1 = operands[1];
21036 mem_2 = operands[3];
21037 }
21038 else
21039 {
21040 mem_1 = operands[0];
21041 mem_2 = operands[2];
21042 }
21043
21044 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
21045 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
21046
21047 offval_1 = INTVAL (offset_1);
21048 offval_2 = INTVAL (offset_2);
21049
21050 if (offval_1 > offval_2)
21051 {
21052 /* Irrespective of whether this is a load or a store,
21053 we do the same swap. */
21054 std::swap (operands[0], operands[2]);
21055 std::swap (operands[1], operands[3]);
21056 }
21057 }
21058
21059 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
21060 comparison between the two. */
21061 int
21062 aarch64_host_wide_int_compare (const void *x, const void *y)
21063 {
21064 return wi::cmps (* ((const HOST_WIDE_INT *) x),
21065 * ((const HOST_WIDE_INT *) y));
21066 }
21067
21068 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
21069 other pointing to a REG rtx containing an offset, compare the offsets
21070 of the two pairs.
21071
21072 Return:
21073
21074 1 iff offset (X) > offset (Y)
21075 0 iff offset (X) == offset (Y)
21076 -1 iff offset (X) < offset (Y) */
21077 int
21078 aarch64_ldrstr_offset_compare (const void *x, const void *y)
21079 {
21080 const rtx * operands_1 = (const rtx *) x;
21081 const rtx * operands_2 = (const rtx *) y;
21082 rtx mem_1, mem_2, base, offset_1, offset_2;
21083
21084 if (MEM_P (operands_1[0]))
21085 mem_1 = operands_1[0];
21086 else
21087 mem_1 = operands_1[1];
21088
21089 if (MEM_P (operands_2[0]))
21090 mem_2 = operands_2[0];
21091 else
21092 mem_2 = operands_2[1];
21093
21094 /* Extract the offsets. */
21095 extract_base_offset_in_addr (mem_1, &base, &offset_1);
21096 extract_base_offset_in_addr (mem_2, &base, &offset_2);
21097
21098 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
21099
21100 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
21101 }
21102
21103 /* Given OPERANDS of consecutive load/store, check if we can merge
21104 them into ldp/stp by adjusting the offset. LOAD is true if they
21105 are load instructions. MODE is the mode of memory operands.
21106
21107 Given the following consecutive stores:
21108
21109 str w1, [xb, 0x100]
21110 str w1, [xb, 0x104]
21111 str w1, [xb, 0x108]
21112 str w1, [xb, 0x10c]
21113
21114 Though the offsets are out of the range supported by stp, we can
21115 still pair them after adjusting the offset, like:
21116
21117 add scratch, xb, 0x100
21118 stp w1, w1, [scratch]
21119 stp w1, w1, [scratch, 0x8]
21120
21121 The peephole patterns detecting this opportunity should guarantee
21122 the scratch register is available.  */
21123
21124 bool
21125 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
21126 scalar_mode mode)
21127 {
21128 const int num_insns = 4;
21129 enum reg_class rclass;
21130 HOST_WIDE_INT offvals[num_insns], msize;
21131 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
21132
21133 if (load)
21134 {
21135 for (int i = 0; i < num_insns; i++)
21136 {
21137 reg[i] = operands[2 * i];
21138 mem[i] = operands[2 * i + 1];
21139
21140 gcc_assert (REG_P (reg[i]));
21141 }
21142
21143 /* Do not attempt to merge the loads if the loads clobber each other. */
21144 for (int i = 0; i < 8; i += 2)
21145 for (int j = i + 2; j < 8; j += 2)
21146 if (reg_overlap_mentioned_p (operands[i], operands[j]))
21147 return false;
21148 }
21149 else
21150 for (int i = 0; i < num_insns; i++)
21151 {
21152 mem[i] = operands[2 * i];
21153 reg[i] = operands[2 * i + 1];
21154 }
21155
21156 /* Skip if memory operand is by itself valid for ldp/stp. */
21157 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
21158 return false;
21159
21160 for (int i = 0; i < num_insns; i++)
21161 {
21162 /* The mems cannot be volatile. */
21163 if (MEM_VOLATILE_P (mem[i]))
21164 return false;
21165
21166 /* Check if the addresses are in the form of [base+offset]. */
21167 extract_base_offset_in_addr (mem[i], base + i, offset + i);
21168 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
21169 return false;
21170 }
21171
21172 /* Check if the registers are of the same class.  */
21173 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
21174 ? FP_REGS : GENERAL_REGS;
21175
21176 for (int i = 1; i < num_insns; i++)
21177 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
21178 {
21179 if (rclass != FP_REGS)
21180 return false;
21181 }
21182 else
21183 {
21184 if (rclass != GENERAL_REGS)
21185 return false;
21186 }
21187
21188 /* Only the last register in the order in which they occur
21189 may be clobbered by the load. */
21190 if (rclass == GENERAL_REGS && load)
21191 for (int i = 0; i < num_insns - 1; i++)
21192 if (reg_mentioned_p (reg[i], mem[i]))
21193 return false;
21194
21195 /* Check if the bases are the same.  */
21196 for (int i = 0; i < num_insns - 1; i++)
21197 if (!rtx_equal_p (base[i], base[i + 1]))
21198 return false;
21199
21200 for (int i = 0; i < num_insns; i++)
21201 offvals[i] = INTVAL (offset[i]);
21202
21203 msize = GET_MODE_SIZE (mode);
21204
21205 /* Check if the offsets can be put in the right order to do a ldp/stp. */
21206 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
21207 aarch64_host_wide_int_compare);
21208
21209 if (!(offvals[1] == offvals[0] + msize
21210 && offvals[3] == offvals[2] + msize))
21211 return false;
21212
21213 /* Check that offsets are within range of each other. The ldp/stp
21214 instructions have 7-bit immediate offsets, so use 0x80.  */
21215 if (offvals[2] - offvals[0] >= msize * 0x80)
21216 return false;
21217
21218 /* The offsets must be aligned with respect to each other. */
21219 if (offvals[0] % msize != offvals[2] % msize)
21220 return false;
21221
21222 /* If we have SImode and slow unaligned ldp,
21223 check that the alignment is at least 8 bytes.  */
21224 if (mode == SImode
21225 && (aarch64_tune_params.extra_tuning_flags
21226 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
21227 && !optimize_size
21228 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
21229 return false;
21230
21231 return true;
21232 }
21233
21234 /* Given OPERANDS of consecutive load/store, this function pairs them
21235 into LDP/STP after adjusting the offset. It depends on the fact
21236 that the operands can be sorted so the offsets are correct for STP.
21237 MODE is the mode of memory operands. CODE is the rtl operator
21238 which should be applied to all memory operands; it is SIGN_EXTEND,
21239 ZERO_EXTEND or UNKNOWN. */
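/* As a worked example of the range calculation below (illustrative):
for SImode, msize is 4, so the allowed LDP/STP offsets run from
-msize * 0x40 = -256 up to msize * (0x40 - 1) = 252, and the base
adjustment below tries to place both pairs within that window.  */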
21240
21241 bool
21242 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
21243 scalar_mode mode, RTX_CODE code)
21244 {
21245 rtx base, offset_1, offset_3, t1, t2;
21246 rtx mem_1, mem_2, mem_3, mem_4;
21247 rtx temp_operands[8];
21248 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
21249 stp_off_upper_limit, stp_off_lower_limit, msize;
21250
21251 /* We make changes on a copy as we may still bail out. */
21252 for (int i = 0; i < 8; i ++)
21253 temp_operands[i] = operands[i];
21254
21255 /* Sort the operands. */
21256 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
21257
21258 /* Copy the memory operands so that if we have to bail for some
21259 reason the original addresses are unchanged. */
21260 if (load)
21261 {
21262 mem_1 = copy_rtx (temp_operands[1]);
21263 mem_2 = copy_rtx (temp_operands[3]);
21264 mem_3 = copy_rtx (temp_operands[5]);
21265 mem_4 = copy_rtx (temp_operands[7]);
21266 }
21267 else
21268 {
21269 mem_1 = copy_rtx (temp_operands[0]);
21270 mem_2 = copy_rtx (temp_operands[2]);
21271 mem_3 = copy_rtx (temp_operands[4]);
21272 mem_4 = copy_rtx (temp_operands[6]);
21273 gcc_assert (code == UNKNOWN);
21274 }
21275
21276 extract_base_offset_in_addr (mem_1, &base, &offset_1);
21277 extract_base_offset_in_addr (mem_3, &base, &offset_3);
21278 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
21279 && offset_3 != NULL_RTX);
21280
21281 /* Adjust offset so it can fit in LDP/STP instruction. */
21282 msize = GET_MODE_SIZE (mode);
21283 stp_off_upper_limit = msize * (0x40 - 1);
21284 stp_off_lower_limit = - msize * 0x40;
21285
21286 off_val_1 = INTVAL (offset_1);
21287 off_val_3 = INTVAL (offset_3);
21288
21289 /* The base offset is optimally half way between the two STP/LDP offsets. */
21290 if (msize <= 4)
21291 base_off = (off_val_1 + off_val_3) / 2;
21292 else
21293 /* However, due to issues with negative LDP/STP offset generation for
21294 larger modes (DF, DI and vector modes), we must not use negative
21295 addresses smaller than 9 signed unadjusted bits can store.  This
21296 provides the most range in this case.  */
21297 base_off = off_val_1;
21298
21299 /* Adjust the base so that it is aligned with the addresses but still
21300 optimal. */
21301 if (base_off % msize != off_val_1 % msize)
21302 /* Fix the offset, bearing in mind we want to make it bigger not
21303 smaller. */
21304 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21305 else if (msize <= 4)
21306 /* The negative range of LDP/STP is one larger than the positive range. */
21307 base_off += msize;
21308
21309 /* Check if base offset is too big or too small. We can attempt to resolve
21310 this issue by setting it to the maximum value and seeing if the offsets
21311 still fit. */
21312 if (base_off >= 0x1000)
21313 {
21314 base_off = 0x1000 - 1;
21315 /* We must still make sure that the base offset is aligned with respect
21316 to the address. But it may not be made any bigger. */
21317 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21318 }
21319
21320 /* Likewise for the case where the base is too small. */
21321 if (base_off <= -0x1000)
21322 {
21323 base_off = -0x1000 + 1;
21324 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21325 }
21326
21327 /* Offset of the first STP/LDP. */
21328 new_off_1 = off_val_1 - base_off;
21329
21330 /* Offset of the second STP/LDP. */
21331 new_off_3 = off_val_3 - base_off;
21332
21333 /* The offsets must be within the range of the LDP/STP instructions. */
21334 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
21335 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
21336 return false;
21337
21338 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
21339 new_off_1), true);
21340 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
21341 new_off_1 + msize), true);
21342 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
21343 new_off_3), true);
21344 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
21345 new_off_3 + msize), true);
21346
21347 if (!aarch64_mem_pair_operand (mem_1, mode)
21348 || !aarch64_mem_pair_operand (mem_3, mode))
21349 return false;
21350
21351 if (code == ZERO_EXTEND)
21352 {
21353 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
21354 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
21355 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
21356 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
21357 }
21358 else if (code == SIGN_EXTEND)
21359 {
21360 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
21361 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
21362 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
21363 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
21364 }
21365
21366 if (load)
21367 {
21368 operands[0] = temp_operands[0];
21369 operands[1] = mem_1;
21370 operands[2] = temp_operands[2];
21371 operands[3] = mem_2;
21372 operands[4] = temp_operands[4];
21373 operands[5] = mem_3;
21374 operands[6] = temp_operands[6];
21375 operands[7] = mem_4;
21376 }
21377 else
21378 {
21379 operands[0] = mem_1;
21380 operands[1] = temp_operands[1];
21381 operands[2] = mem_2;
21382 operands[3] = temp_operands[3];
21383 operands[4] = mem_3;
21384 operands[5] = temp_operands[5];
21385 operands[6] = mem_4;
21386 operands[7] = temp_operands[7];
21387 }
21388
21389 /* Emit adjusting instruction. */
21390 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
21391 /* Emit ldp/stp instructions. */
21392 t1 = gen_rtx_SET (operands[0], operands[1]);
21393 t2 = gen_rtx_SET (operands[2], operands[3]);
21394 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21395 t1 = gen_rtx_SET (operands[4], operands[5]);
21396 t2 = gen_rtx_SET (operands[6], operands[7]);
21397 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21398 return true;
21399 }
21400
21401 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
21402 it isn't worth branching around empty masked ops (including masked
21403 stores). */
21404
21405 static bool
21406 aarch64_empty_mask_is_expensive (unsigned)
21407 {
21408 return false;
21409 }
21410
21411 /* Return true if a pseudo register should be created and used to hold
21412 the GOT address for PIC code.  */
21413
21414 bool
21415 aarch64_use_pseudo_pic_reg (void)
21416 {
21417 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
21418 }
21419
21420 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
21421
21422 static int
21423 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
21424 {
21425 switch (XINT (x, 1))
21426 {
21427 case UNSPEC_GOTSMALLPIC:
21428 case UNSPEC_GOTSMALLPIC28K:
21429 case UNSPEC_GOTTINYPIC:
21430 return 0;
21431 default:
21432 break;
21433 }
21434
21435 return default_unspec_may_trap_p (x, flags);
21436 }
21437
21438
21439 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
21440 return the log2 of that value.  Otherwise return -1.  */
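/* For example (illustrative): for X representing 8.0 this returns 3;
for 3.0 or -4.0 it returns -1.  */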
21441
21442 int
21443 aarch64_fpconst_pow_of_2 (rtx x)
21444 {
21445 const REAL_VALUE_TYPE *r;
21446
21447 if (!CONST_DOUBLE_P (x))
21448 return -1;
21449
21450 r = CONST_DOUBLE_REAL_VALUE (x);
21451
21452 if (REAL_VALUE_NEGATIVE (*r)
21453 || REAL_VALUE_ISNAN (*r)
21454 || REAL_VALUE_ISINF (*r)
21455 || !real_isinteger (r, DFmode))
21456 return -1;
21457
21458 return exact_log2 (real_to_integer (r));
21459 }
21460
21461 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
21462 power of 2 (i.e. 1/2^n), return the exponent n.
21463 Otherwise return -1.  */
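/* For example (illustrative): for X representing 0.125 (= 1/2^3) this
returns 3; for 8.0 or 1.0 it returns -1.  */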
21464
21465 int
21466 aarch64_fpconst_pow2_recip (rtx x)
21467 {
21468 REAL_VALUE_TYPE r0;
21469
21470 if (!CONST_DOUBLE_P (x))
21471 return -1;
21472
21473 r0 = *CONST_DOUBLE_REAL_VALUE (x);
21474 if (exact_real_inverse (DFmode, &r0)
21475 && !REAL_VALUE_NEGATIVE (r0))
21476 {
21477 int ret = exact_log2 (real_to_integer (&r0));
21478 if (ret >= 1 && ret <= 32)
21479 return ret;
21480 }
21481 return -1;
21482 }
21483
21484 /* If X is a vector of equal CONST_DOUBLE values and that value is
21485 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
21486
21487 int
21488 aarch64_vec_fpconst_pow_of_2 (rtx x)
21489 {
21490 int nelts;
21491 if (GET_CODE (x) != CONST_VECTOR
21492 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
21493 return -1;
21494
21495 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
21496 return -1;
21497
21498 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
21499 if (firstval <= 0)
21500 return -1;
21501
21502 for (int i = 1; i < nelts; i++)
21503 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
21504 return -1;
21505
21506 return firstval;
21507 }
21508
21509 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
21510 to float.
21511
21512 __fp16 always promotes through this hook.
21513 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
21514 through the generic excess precision logic rather than here. */
21515
21516 static tree
21517 aarch64_promoted_type (const_tree t)
21518 {
21519 if (SCALAR_FLOAT_TYPE_P (t)
21520 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
21521 return float_type_node;
21522
21523 return NULL_TREE;
21524 }
21525
21526 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
21527
21528 static bool
21529 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
21530 optimization_type opt_type)
21531 {
21532 switch (op)
21533 {
21534 case rsqrt_optab:
21535 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
21536
21537 default:
21538 return true;
21539 }
21540 }
21541
21542 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
21543
21544 static unsigned int
21545 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
21546 int *offset)
21547 {
21548 /* Polynomial invariant 1 == (VG / 2) - 1. */
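/* For example (illustrative): with 256-bit SVE vectors VG is 4, so this
indeterminate evaluates to (4 / 2) - 1 = 1.  */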
21549 gcc_assert (i == 1);
21550 *factor = 2;
21551 *offset = 1;
21552 return AARCH64_DWARF_VG;
21553 }
21554
21555 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
21556 if MODE is HFmode, and punt to the generic implementation otherwise. */
21557
21558 static bool
21559 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
21560 {
21561 return (mode == HFmode
21562 ? true
21563 : default_libgcc_floating_mode_supported_p (mode));
21564 }
21565
21566 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
21567 if MODE is HFmode, and punt to the generic implementation otherwise. */
21568
21569 static bool
21570 aarch64_scalar_mode_supported_p (scalar_mode mode)
21571 {
21572 return (mode == HFmode
21573 ? true
21574 : default_scalar_mode_supported_p (mode));
21575 }
21576
21577 /* Set the value of FLT_EVAL_METHOD.
21578 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
21579
21580 0: evaluate all operations and constants, whose semantic type has at
21581 most the range and precision of type float, to the range and
21582 precision of float; evaluate all other operations and constants to
21583 the range and precision of the semantic type;
21584
21585 N, where _FloatN is a supported interchange floating type:
21586 evaluate all operations and constants, whose semantic type has at
21587 most the range and precision of _FloatN type, to the range and
21588 precision of the _FloatN type; evaluate all other operations and
21589 constants to the range and precision of the semantic type;
21590
21591 If we have the ARMv8.2-A extensions then we support _Float16 in native
21592 precision, so we should set this to 16. Otherwise, we support the type,
21593 but want to evaluate expressions in float precision, so set this to
21594 0. */
21595
21596 static enum flt_eval_method
21597 aarch64_excess_precision (enum excess_precision_type type)
21598 {
21599 switch (type)
21600 {
21601 case EXCESS_PRECISION_TYPE_FAST:
21602 case EXCESS_PRECISION_TYPE_STANDARD:
21603 /* We can calculate either in 16-bit range and precision or
21604 32-bit range and precision. Make that decision based on whether
21605 we have native support for the ARMv8.2-A 16-bit floating-point
21606 instructions or not. */
21607 return (TARGET_FP_F16INST
21608 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
21609 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
21610 case EXCESS_PRECISION_TYPE_IMPLICIT:
21611 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
21612 default:
21613 gcc_unreachable ();
21614 }
21615 return FLT_EVAL_METHOD_UNPREDICTABLE;
21616 }
21617
21618 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
21619 scheduled for speculative execution. Reject the long-running division
21620 and square-root instructions. */
21621
21622 static bool
21623 aarch64_sched_can_speculate_insn (rtx_insn *insn)
21624 {
21625 switch (get_attr_type (insn))
21626 {
21627 case TYPE_SDIV:
21628 case TYPE_UDIV:
21629 case TYPE_FDIVS:
21630 case TYPE_FDIVD:
21631 case TYPE_FSQRTS:
21632 case TYPE_FSQRTD:
21633 case TYPE_NEON_FP_SQRT_S:
21634 case TYPE_NEON_FP_SQRT_D:
21635 case TYPE_NEON_FP_SQRT_S_Q:
21636 case TYPE_NEON_FP_SQRT_D_Q:
21637 case TYPE_NEON_FP_DIV_S:
21638 case TYPE_NEON_FP_DIV_D:
21639 case TYPE_NEON_FP_DIV_S_Q:
21640 case TYPE_NEON_FP_DIV_D_Q:
21641 return false;
21642 default:
21643 return true;
21644 }
21645 }
21646
21647 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
21648
21649 static int
21650 aarch64_compute_pressure_classes (reg_class *classes)
21651 {
21652 int i = 0;
21653 classes[i++] = GENERAL_REGS;
21654 classes[i++] = FP_REGS;
21655 /* PR_REGS isn't a useful pressure class because many predicate pseudo
21656 registers need to go in PR_LO_REGS at some point during their
21657 lifetime. Splitting it into two halves has the effect of making
21658 all predicates count against PR_LO_REGS, so that we try whenever
21659 possible to restrict the number of live predicates to 8. This
21660 greatly reduces the amount of spilling in certain loops. */
21661 classes[i++] = PR_LO_REGS;
21662 classes[i++] = PR_HI_REGS;
21663 return i;
21664 }
21665
21666 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
21667
21668 static bool
21669 aarch64_can_change_mode_class (machine_mode from,
21670 machine_mode to, reg_class_t)
21671 {
21672 unsigned int from_flags = aarch64_classify_vector_mode (from);
21673 unsigned int to_flags = aarch64_classify_vector_mode (to);
21674
21675 bool from_sve_p = (from_flags & VEC_ANY_SVE);
21676 bool to_sve_p = (to_flags & VEC_ANY_SVE);
21677
21678 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
21679 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
21680
21681 /* Don't allow changes between partial SVE modes and other modes.
21682 The contents of partial SVE modes are distributed evenly across
21683 the register, whereas GCC expects them to be clustered together. */
21684 if (from_partial_sve_p != to_partial_sve_p)
21685 return false;
21686
21687 /* Similarly reject changes between partial SVE modes that have
21688 different patterns of significant and insignificant bits. */
21689 if (from_partial_sve_p
21690 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
21691 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
21692 return false;
21693
21694 if (BYTES_BIG_ENDIAN)
21695 {
21696 /* Don't allow changes between SVE data modes and non-SVE modes.
21697 See the comment at the head of aarch64-sve.md for details. */
21698 if (from_sve_p != to_sve_p)
21699 return false;
21700
21701 /* Don't allow changes in element size: lane 0 of the new vector
21702 would not then be lane 0 of the old vector. See the comment
21703 above aarch64_maybe_expand_sve_subreg_move for a more detailed
21704 description.
21705
21706 In the worst case, this forces a register to be spilled in
21707 one mode and reloaded in the other, which handles the
21708 endianness correctly. */
21709 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
21710 return false;
21711 }
21712 return true;
21713 }
21714
21715 /* Implement TARGET_SELECT_EARLY_REMAT_MODES.  */
21716
21717 static void
21718 aarch64_select_early_remat_modes (sbitmap modes)
21719 {
21720 /* SVE values are not normally live across a call, so it should be
21721 worth doing early rematerialization even in VL-specific mode. */
21722 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
21723 if (aarch64_sve_mode_p ((machine_mode) i))
21724 bitmap_set_bit (modes, i);
21725 }
21726
21727 /* Override the default target speculation_safe_value. */
21728 static rtx
21729 aarch64_speculation_safe_value (machine_mode mode,
21730 rtx result, rtx val, rtx failval)
21731 {
21732 /* Maybe we should warn if falling back to hard barriers. They are
21733 likely to be noticeably more expensive than the alternative below.  */
21734 if (!aarch64_track_speculation)
21735 return default_speculation_safe_value (mode, result, val, failval);
21736
21737 if (!REG_P (val))
21738 val = copy_to_mode_reg (mode, val);
21739
21740 if (!aarch64_reg_or_zero (failval, mode))
21741 failval = copy_to_mode_reg (mode, failval);
21742
21743 emit_insn (gen_despeculate_copy (mode, result, val, failval));
21744 return result;
21745 }
21746
21747 /* Implement TARGET_ESTIMATED_POLY_VALUE.
21748 Look into the tuning structure for an estimate.
21749 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
21750 Advanced SIMD 128 bits. */
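/* For example (illustrative): if the tuning structure gives an SVE width
of 256 bits, a poly_int64 of 16 + 16x (the number of bytes in an SVE
vector) is estimated as 16 + 16 * (256 - 128) / 128 = 32.  */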
21751
21752 static HOST_WIDE_INT
21753 aarch64_estimated_poly_value (poly_int64 val)
21754 {
21755 enum aarch64_sve_vector_bits_enum width_source
21756 = aarch64_tune_params.sve_width;
21757
21758 /* If we still don't have an estimate, use the default. */
21759 if (width_source == SVE_SCALABLE)
21760 return default_estimated_poly_value (val);
21761
21762 HOST_WIDE_INT over_128 = width_source - 128;
21763 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
21764 }
21765
21766
21767 /* Return true for types that could be supported as SIMD return or
21768 argument types. */
21769
21770 static bool
21771 supported_simd_type (tree t)
21772 {
21773 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
21774 {
21775 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
21776 return s == 1 || s == 2 || s == 4 || s == 8;
21777 }
21778 return false;
21779 }
21780
21781 /* Return true for types that currently are supported as SIMD return
21782 or argument types. */
21783
21784 static bool
21785 currently_supported_simd_type (tree t, tree b)
21786 {
21787 if (COMPLEX_FLOAT_TYPE_P (t))
21788 return false;
21789
21790 if (TYPE_SIZE (t) != TYPE_SIZE (b))
21791 return false;
21792
21793 return supported_simd_type (t);
21794 }
21795
21796 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
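/* For example (illustrative): a simd clone of a function whose base type
is 32-bit float and which has no explicit simdlen gets two clones below,
one with simdlen 2 (64-bit vectors) and one with simdlen 4 (128-bit
vectors).  */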
21797
21798 static int
21799 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
21800 struct cgraph_simd_clone *clonei,
21801 tree base_type, int num)
21802 {
21803 tree t, ret_type, arg_type;
21804 unsigned int elt_bits, vec_bits, count;
21805
21806 if (!TARGET_SIMD)
21807 return 0;
21808
21809 if (clonei->simdlen
21810 && (clonei->simdlen < 2
21811 || clonei->simdlen > 1024
21812 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
21813 {
21814 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21815 "unsupported simdlen %d", clonei->simdlen);
21816 return 0;
21817 }
21818
21819 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
21820 if (TREE_CODE (ret_type) != VOID_TYPE
21821 && !currently_supported_simd_type (ret_type, base_type))
21822 {
21823 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
21824 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21825 "GCC does not currently support mixed size types "
21826 "for %<simd%> functions");
21827 else if (supported_simd_type (ret_type))
21828 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21829 "GCC does not currently support return type %qT "
21830 "for %<simd%> functions", ret_type);
21831 else
21832 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21833 "unsupported return type %qT for %<simd%> functions",
21834 ret_type);
21835 return 0;
21836 }
21837
21838 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
21839 {
21840 arg_type = TREE_TYPE (t);
21841
21842 if (!currently_supported_simd_type (arg_type, base_type))
21843 {
21844 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
21845 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21846 "GCC does not currently support mixed size types "
21847 "for %<simd%> functions");
21848 else
21849 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21850 "GCC does not currently support argument type %qT "
21851 "for %<simd%> functions", arg_type);
21852 return 0;
21853 }
21854 }
21855
21856 clonei->vecsize_mangle = 'n';
21857 clonei->mask_mode = VOIDmode;
21858 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
21859 if (clonei->simdlen == 0)
21860 {
21861 count = 2;
21862 vec_bits = (num == 0 ? 64 : 128);
21863 clonei->simdlen = vec_bits / elt_bits;
21864 }
21865 else
21866 {
21867 count = 1;
21868 vec_bits = clonei->simdlen * elt_bits;
21869 if (vec_bits != 64 && vec_bits != 128)
21870 {
21871 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21872 "GCC does not currently support simdlen %d for type %qT",
21873 clonei->simdlen, base_type);
21874 return 0;
21875 }
21876 }
21877 clonei->vecsize_int = vec_bits;
21878 clonei->vecsize_float = vec_bits;
21879 return count;
21880 }
21881
21882 /* Implement TARGET_SIMD_CLONE_ADJUST. */
21883
21884 static void
21885 aarch64_simd_clone_adjust (struct cgraph_node *node)
21886 {
21887 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
21888 use the correct ABI. */
21889
21890 tree t = TREE_TYPE (node->decl);
21891 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
21892 TYPE_ATTRIBUTES (t));
21893 }
21894
21895 /* Implement TARGET_SIMD_CLONE_USABLE. */
21896
21897 static int
21898 aarch64_simd_clone_usable (struct cgraph_node *node)
21899 {
21900 switch (node->simdclone->vecsize_mangle)
21901 {
21902 case 'n':
21903 if (!TARGET_SIMD)
21904 return -1;
21905 return 0;
21906 default:
21907 gcc_unreachable ();
21908 }
21909 }
21910
21911 /* Implement TARGET_COMP_TYPE_ATTRIBUTES.  */
21912
21913 static int
21914 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
21915 {
21916 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
21917 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
21918 return 0;
21919 return 1;
21920 }
21921
21922 /* Implement TARGET_GET_MULTILIB_ABI_NAME.  */
21923
21924 static const char *
21925 aarch64_get_multilib_abi_name (void)
21926 {
21927 if (TARGET_BIG_END)
21928 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
21929 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
21930 }
21931
21932 /* Implement TARGET_STACK_PROTECT_GUARD.  In the case of a
21933 global variable based guard, use the default; otherwise
21934 return a null tree.  */
21935 static tree
21936 aarch64_stack_protect_guard (void)
21937 {
21938 if (aarch64_stack_protector_guard == SSP_GLOBAL)
21939 return default_stack_protect_guard ();
21940
21941 return NULL_TREE;
21942 }
21943
21944 /* Return the diagnostic message string if conversion from FROMTYPE to
21945 TOTYPE is not allowed, NULL otherwise. */
21946
21947 static const char *
21948 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
21949 {
21950 if (element_mode (fromtype) != element_mode (totype))
21951 {
21952 /* Do not allow conversions to/from BFmode scalar types.  */
21953 if (TYPE_MODE (fromtype) == BFmode)
21954 return N_("invalid conversion from type %<bfloat16_t%>");
21955 if (TYPE_MODE (totype) == BFmode)
21956 return N_("invalid conversion to type %<bfloat16_t%>");
21957 }
21958
21959 /* Conversion allowed. */
21960 return NULL;
21961 }
21962
21963 /* Return the diagnostic message string if the unary operation OP is
21964 not permitted on TYPE, NULL otherwise. */
21965
21966 static const char *
21967 aarch64_invalid_unary_op (int op, const_tree type)
21968 {
21969 /* Reject all single-operand operations on BFmode except for &. */
21970 if (element_mode (type) == BFmode && op != ADDR_EXPR)
21971 return N_("operation not permitted on type %<bfloat16_t%>");
21972
21973 /* Operation allowed. */
21974 return NULL;
21975 }
21976
21977 /* Return the diagnostic message string if the binary operation OP is
21978 not permitted on TYPE1 and TYPE2, NULL otherwise. */
21979
21980 static const char *
21981 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
21982 const_tree type2)
21983 {
21984 /* Reject all 2-operand operations on BFmode. */
21985 if (element_mode (type1) == BFmode
21986 || element_mode (type2) == BFmode)
21987 return N_("operation not permitted on type %<bfloat16_t%>");
21988
21989 /* Operation allowed. */
21990 return NULL;
21991 }
21992
21993 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
21994 section at the end if needed. */
21995 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
21996 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
21997 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
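/* As an illustrative sketch: when both BTI and PAC are enabled,
feature_1_and is 3 and the emitted .note.gnu.property section contains
namesz 4, descsz ROUND_UP (12, POINTER_BYTES) (16 for LP64), type 5
(NT_GNU_PROPERTY_TYPE_0), the name "GNU", then the property
GNU_PROPERTY_AARCH64_FEATURE_1_AND with datasz 4 and data 3, padded to
the pointer alignment.  */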
21998 void
21999 aarch64_file_end_indicate_exec_stack ()
22000 {
22001 file_end_indicate_exec_stack ();
22002
22003 unsigned feature_1_and = 0;
22004 if (aarch64_bti_enabled ())
22005 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
22006
22007 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
22008 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
22009
22010 if (feature_1_and)
22011 {
22012 /* Generate .note.gnu.property section. */
22013 switch_to_section (get_section (".note.gnu.property",
22014 SECTION_NOTYPE, NULL));
22015
22016 /* PT_NOTE header: namesz, descsz, type.
22017 namesz = 4 ("GNU\0")
22018 descsz = 16 (Size of the program property array)
22019 [(12 + padding) * Number of array elements]
22020 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
22021 assemble_align (POINTER_SIZE);
22022 assemble_integer (GEN_INT (4), 4, 32, 1);
22023 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
22024 assemble_integer (GEN_INT (5), 4, 32, 1);
22025
22026 /* PT_NOTE name. */
22027 assemble_string ("GNU", 4);
22028
22029 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
22030 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
22031 datasz = 4
22032 data = feature_1_and. */
22033 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
22034 assemble_integer (GEN_INT (4), 4, 32, 1);
22035 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
22036
22037 /* Pad the size of the note to the required alignment. */
22038 assemble_align (POINTER_SIZE);
22039 }
22040 }
22041 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
22042 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
22043 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
22044
22045 /* Target-specific selftests. */
22046
22047 #if CHECKING_P
22048
22049 namespace selftest {
22050
22051 /* Selftest for the RTL loader.
22052 Verify that the RTL loader copes with a dump from
22053 print_rtx_function. This is essentially just a test that class
22054 function_reader can handle a real dump, but it also verifies
22055 that lookup_reg_by_dump_name correctly handles hard regs.
22056 The presence of hard reg names in the dump means that the test is
22057 target-specific, hence it is in this file. */
22058
22059 static void
22060 aarch64_test_loading_full_dump ()
22061 {
22062 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
22063
22064 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
22065
22066 rtx_insn *insn_1 = get_insn_by_uid (1);
22067 ASSERT_EQ (NOTE, GET_CODE (insn_1));
22068
22069 rtx_insn *insn_15 = get_insn_by_uid (15);
22070 ASSERT_EQ (INSN, GET_CODE (insn_15));
22071 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
22072
22073 /* Verify crtl->return_rtx. */
22074 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
22075 ASSERT_EQ (0, REGNO (crtl->return_rtx));
22076 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
22077 }
22078
22079 /* Run all target-specific selftests. */
22080
22081 static void
22082 aarch64_run_selftests (void)
22083 {
22084 aarch64_test_loading_full_dump ();
22085 }
22086
22087 } // namespace selftest
22088
22089 #endif /* #if CHECKING_P */
22090
22091 #undef TARGET_STACK_PROTECT_GUARD
22092 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
22093
22094 #undef TARGET_ADDRESS_COST
22095 #define TARGET_ADDRESS_COST aarch64_address_cost
22096
22097 /* This hook determines whether unnamed bitfields affect the alignment
22098 of the containing structure. The hook returns true if the structure
22099 should inherit the alignment requirements of an unnamed bitfield's
22100 type. */
22101 #undef TARGET_ALIGN_ANON_BITFIELD
22102 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
22103
22104 #undef TARGET_ASM_ALIGNED_DI_OP
22105 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
22106
22107 #undef TARGET_ASM_ALIGNED_HI_OP
22108 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
22109
22110 #undef TARGET_ASM_ALIGNED_SI_OP
22111 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
22112
22113 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
22114 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
22115 hook_bool_const_tree_hwi_hwi_const_tree_true
22116
22117 #undef TARGET_ASM_FILE_START
22118 #define TARGET_ASM_FILE_START aarch64_start_file
22119
22120 #undef TARGET_ASM_OUTPUT_MI_THUNK
22121 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
22122
22123 #undef TARGET_ASM_SELECT_RTX_SECTION
22124 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
22125
22126 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
22127 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
22128
22129 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
22130 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
22131
22132 #undef TARGET_BUILD_BUILTIN_VA_LIST
22133 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
22134
22135 #undef TARGET_CALLEE_COPIES
22136 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
22137
22138 #undef TARGET_CAN_ELIMINATE
22139 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
22140
22141 #undef TARGET_CAN_INLINE_P
22142 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
22143
22144 #undef TARGET_CANNOT_FORCE_CONST_MEM
22145 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
22146
22147 #undef TARGET_CASE_VALUES_THRESHOLD
22148 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
22149
22150 #undef TARGET_CONDITIONAL_REGISTER_USAGE
22151 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
22152
22153 /* Only the least significant bit is used for initialization guard
22154 variables. */
22155 #undef TARGET_CXX_GUARD_MASK_BIT
22156 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
22157
22158 #undef TARGET_C_MODE_FOR_SUFFIX
22159 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
22160
22161 #ifdef TARGET_BIG_ENDIAN_DEFAULT
22162 #undef TARGET_DEFAULT_TARGET_FLAGS
22163 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
22164 #endif
22165
22166 #undef TARGET_CLASS_MAX_NREGS
22167 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
22168
22169 #undef TARGET_BUILTIN_DECL
22170 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
22171
22172 #undef TARGET_BUILTIN_RECIPROCAL
22173 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
22174
22175 #undef TARGET_C_EXCESS_PRECISION
22176 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
22177
22178 #undef TARGET_EXPAND_BUILTIN
22179 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
22180
22181 #undef TARGET_EXPAND_BUILTIN_VA_START
22182 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
22183
22184 #undef TARGET_FOLD_BUILTIN
22185 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
22186
22187 #undef TARGET_FUNCTION_ARG
22188 #define TARGET_FUNCTION_ARG aarch64_function_arg
22189
22190 #undef TARGET_FUNCTION_ARG_ADVANCE
22191 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
22192
22193 #undef TARGET_FUNCTION_ARG_BOUNDARY
22194 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
22195
22196 #undef TARGET_FUNCTION_ARG_PADDING
22197 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
22198
22199 #undef TARGET_GET_RAW_RESULT_MODE
22200 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
22201 #undef TARGET_GET_RAW_ARG_MODE
22202 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
22203
22204 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
22205 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
22206
22207 #undef TARGET_FUNCTION_VALUE
22208 #define TARGET_FUNCTION_VALUE aarch64_function_value
22209
22210 #undef TARGET_FUNCTION_VALUE_REGNO_P
22211 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
22212
22213 #undef TARGET_GIMPLE_FOLD_BUILTIN
22214 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
22215
22216 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
22217 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
22218
22219 #undef TARGET_INIT_BUILTINS
22220 #define TARGET_INIT_BUILTINS aarch64_init_builtins
22221
22222 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
22223 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
22224 aarch64_ira_change_pseudo_allocno_class
22225
22226 #undef TARGET_LEGITIMATE_ADDRESS_P
22227 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
22228
22229 #undef TARGET_LEGITIMATE_CONSTANT_P
22230 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
22231
22232 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
22233 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
22234 aarch64_legitimize_address_displacement
22235
22236 #undef TARGET_LIBGCC_CMP_RETURN_MODE
22237 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
22238
22239 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
22240 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
22241 aarch64_libgcc_floating_mode_supported_p
22242
22243 #undef TARGET_MANGLE_TYPE
22244 #define TARGET_MANGLE_TYPE aarch64_mangle_type
22245
22246 #undef TARGET_INVALID_CONVERSION
22247 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
22248
22249 #undef TARGET_INVALID_UNARY_OP
22250 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
22251
22252 #undef TARGET_INVALID_BINARY_OP
22253 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
22254
22255 #undef TARGET_VERIFY_TYPE_CONTEXT
22256 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
22257
22258 #undef TARGET_MEMORY_MOVE_COST
22259 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
22260
22261 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
22262 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
22263
22264 #undef TARGET_MUST_PASS_IN_STACK
22265 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
22266
22267 /* This target hook should return true if accesses to volatile bitfields
22268 should use the narrowest mode possible. It should return false if these
22269 accesses should use the bitfield container type. */
22270 #undef TARGET_NARROW_VOLATILE_BITFIELD
22271 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
22272
22273 #undef TARGET_OPTION_OVERRIDE
22274 #define TARGET_OPTION_OVERRIDE aarch64_override_options
22275
22276 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
22277 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
22278 aarch64_override_options_after_change
22279
22280 #undef TARGET_OPTION_SAVE
22281 #define TARGET_OPTION_SAVE aarch64_option_save
22282
22283 #undef TARGET_OPTION_RESTORE
22284 #define TARGET_OPTION_RESTORE aarch64_option_restore
22285
22286 #undef TARGET_OPTION_PRINT
22287 #define TARGET_OPTION_PRINT aarch64_option_print
22288
22289 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
22290 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
22291
22292 #undef TARGET_SET_CURRENT_FUNCTION
22293 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
22294
22295 #undef TARGET_PASS_BY_REFERENCE
22296 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
22297
22298 #undef TARGET_PREFERRED_RELOAD_CLASS
22299 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
22300
22301 #undef TARGET_SCHED_REASSOCIATION_WIDTH
22302 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
22303
22304 #undef TARGET_PROMOTED_TYPE
22305 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
22306
22307 #undef TARGET_SECONDARY_RELOAD
22308 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
22309
22310 #undef TARGET_SHIFT_TRUNCATION_MASK
22311 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
22312
22313 #undef TARGET_SETUP_INCOMING_VARARGS
22314 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
22315
22316 #undef TARGET_STRUCT_VALUE_RTX
22317 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
22318
22319 #undef TARGET_REGISTER_MOVE_COST
22320 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
22321
22322 #undef TARGET_RETURN_IN_MEMORY
22323 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
22324
22325 #undef TARGET_RETURN_IN_MSB
22326 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
22327
22328 #undef TARGET_RTX_COSTS
22329 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
22330
22331 #undef TARGET_SCALAR_MODE_SUPPORTED_P
22332 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
22333
22334 #undef TARGET_SCHED_ISSUE_RATE
22335 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
22336
22337 #undef TARGET_SCHED_VARIABLE_ISSUE
22338 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
22339
22340 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
22341 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
22342 aarch64_sched_first_cycle_multipass_dfa_lookahead
22343
22344 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
22345 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
22346 aarch64_first_cycle_multipass_dfa_lookahead_guard
22347
22348 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
22349 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
22350 aarch64_get_separate_components
22351
22352 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
22353 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
22354 aarch64_components_for_bb
22355
22356 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
22357 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
22358 aarch64_disqualify_components
22359
22360 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
22361 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
22362 aarch64_emit_prologue_components
22363
22364 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
22365 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
22366 aarch64_emit_epilogue_components
22367
22368 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
22369 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
22370 aarch64_set_handled_components
22371
22372 #undef TARGET_TRAMPOLINE_INIT
22373 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
22374
22375 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
22376 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
22377
22378 #undef TARGET_VECTOR_MODE_SUPPORTED_P
22379 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
22380
22381 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
22382 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
22383
22384 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
22385 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
22386 aarch64_builtin_support_vector_misalignment
22387
22388 #undef TARGET_ARRAY_MODE
22389 #define TARGET_ARRAY_MODE aarch64_array_mode
22390
22391 #undef TARGET_ARRAY_MODE_SUPPORTED_P
22392 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
22393
22394 #undef TARGET_VECTORIZE_ADD_STMT_COST
22395 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
22396
22397 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
22398 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
22399 aarch64_builtin_vectorization_cost
22400
22401 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
22402 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
22403
22404 #undef TARGET_VECTORIZE_BUILTINS
22405 #define TARGET_VECTORIZE_BUILTINS
22406
22407 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
22408 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
22409 aarch64_builtin_vectorized_function
22410
22411 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
22412 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
22413 aarch64_autovectorize_vector_modes
22414
22415 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
22416 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
22417 aarch64_atomic_assign_expand_fenv
22418
22419 /* Section anchor support. */
22420
22421 #undef TARGET_MIN_ANCHOR_OFFSET
22422 #define TARGET_MIN_ANCHOR_OFFSET -256
22423
22424 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
22425 byte offset; we can do much more for larger data types, but have no way
22426 to determine the size of the access. We assume accesses are aligned. */
22427 #undef TARGET_MAX_ANCHOR_OFFSET
22428 #define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_RELATED_MODE
#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
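/* Illustration of the value above: 4 == (1 << 2), i.e. bit 2 is the flag
   that marks a pointer to a function descriptor rather than to ordinary
   code, given that bits 0 and 1 are unavailable for this purpose.  */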

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_INSN_CALLEE_ABI
#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#undef TARGET_FNTYPE_ABI
#define TARGET_FNTYPE_ABI aarch64_fntype_abi

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"