gcc/config/aarch64/aarch64.cc
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2023 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #include "config.h"
26 #include "system.h"
27 #include "coretypes.h"
28 #include "backend.h"
29 #include "target.h"
30 #include "rtl.h"
31 #include "tree.h"
32 #include "memmodel.h"
33 #include "gimple.h"
34 #include "cfghooks.h"
35 #include "cfgloop.h"
36 #include "df.h"
37 #include "tm_p.h"
38 #include "stringpool.h"
39 #include "attribs.h"
40 #include "optabs.h"
41 #include "regs.h"
42 #include "emit-rtl.h"
43 #include "recog.h"
44 #include "cgraph.h"
45 #include "diagnostic.h"
46 #include "insn-attr.h"
47 #include "alias.h"
48 #include "fold-const.h"
49 #include "stor-layout.h"
50 #include "calls.h"
51 #include "varasm.h"
52 #include "output.h"
53 #include "flags.h"
54 #include "explow.h"
55 #include "expr.h"
56 #include "reload.h"
57 #include "langhooks.h"
58 #include "opts.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78 #include "gimple-pretty-print.h"
79 #include "tree-ssa-loop-niter.h"
80 #include "fractional-cost.h"
81 #include "rtlanal.h"
82 #include "tree-dfa.h"
83 #include "asan.h"
84 #include "aarch64-feature-deps.h"
85
86 /* This file should be included last. */
87 #include "target-def.h"
88
89 /* Defined for convenience. */
90 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
91
92 /* Information about a legitimate vector immediate operand. */
93 struct simd_immediate_info
94 {
95 enum insn_type { MOV, MVN, INDEX, PTRUE };
96 enum modifier_type { LSL, MSL };
97
98 simd_immediate_info () {}
99 simd_immediate_info (scalar_float_mode, rtx);
100 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
101 insn_type = MOV, modifier_type = LSL,
102 unsigned int = 0);
103 simd_immediate_info (scalar_mode, rtx, rtx);
104 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
105
106 /* The mode of the elements. */
107 scalar_mode elt_mode;
108
109 /* The instruction to use to move the immediate into a vector. */
110 insn_type insn;
111
112 union
113 {
114 /* For MOV and MVN. */
115 struct
116 {
117 /* The value of each element. */
118 rtx value;
119
120 /* The kind of shift modifier to use, and the number of bits to shift.
121 This is (LSL, 0) if no shift is needed. */
122 modifier_type modifier;
123 unsigned int shift;
124 } mov;
125
126 /* For INDEX. */
127 struct
128 {
129 /* The value of the first element and the step to be added for each
130 subsequent element. */
131 rtx base, step;
132 } index;
133
134 /* For PTRUE. */
135 aarch64_svpattern pattern;
136 } u;
137 };
138
139 /* Construct a floating-point immediate in which each element has mode
140 ELT_MODE_IN and value VALUE_IN. */
141 inline simd_immediate_info
142 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
143 : elt_mode (elt_mode_in), insn (MOV)
144 {
145 u.mov.value = value_in;
146 u.mov.modifier = LSL;
147 u.mov.shift = 0;
148 }
149
150 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
151 and value VALUE_IN. The other parameters are as for the structure
152 fields. */
153 inline simd_immediate_info
154 ::simd_immediate_info (scalar_int_mode elt_mode_in,
155 unsigned HOST_WIDE_INT value_in,
156 insn_type insn_in, modifier_type modifier_in,
157 unsigned int shift_in)
158 : elt_mode (elt_mode_in), insn (insn_in)
159 {
160 u.mov.value = gen_int_mode (value_in, elt_mode_in);
161 u.mov.modifier = modifier_in;
162 u.mov.shift = shift_in;
163 }
164
165 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
166 and where element I is equal to BASE_IN + I * STEP_IN. */
167 inline simd_immediate_info
168 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
169 : elt_mode (elt_mode_in), insn (INDEX)
170 {
171 u.index.base = base_in;
172 u.index.step = step_in;
173 }
174
175 /* Construct a predicate that controls elements of mode ELT_MODE_IN
176 and has PTRUE pattern PATTERN_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_int_mode elt_mode_in,
179 aarch64_svpattern pattern_in)
180 : elt_mode (elt_mode_in), insn (PTRUE)
181 {
182 u.pattern = pattern_in;
183 }
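/* A minimal illustrative sketch (not used by the code below) of the kind of
   description the constructors above produce.  For example, the Advanced
   SIMD immediate "movi vN.4h, #0x25, lsl #8" -- a MOV of 0x25 into each
   16-bit element with an LSL #8 modifier -- could be described as:  */
inline simd_immediate_info
simd_immediate_info_example_movi ()
{
  /* HImode gives 16-bit elements, 0x25 is the 8-bit immediate, and the
     shift modifier is (LSL, 8).  */
  return simd_immediate_info (HImode, 0x25, simd_immediate_info::MOV,
			      simd_immediate_info::LSL, 8);
}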
184
185 namespace {
186
187 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
188 class pure_scalable_type_info
189 {
190 public:
191 /* Represents the result of analyzing a type. All values are nonzero,
192 in the possibly forlorn hope that accidental conversions to bool
193 trigger a warning. */
194 enum analysis_result
195 {
196 /* The type does not have an ABI identity; i.e. it doesn't contain
197 at least one object whose type is a Fundamental Data Type. */
198 NO_ABI_IDENTITY = 1,
199
200 /* The type is definitely a Pure Scalable Type. */
201 IS_PST,
202
203 /* The type is definitely not a Pure Scalable Type. */
204 ISNT_PST,
205
206 /* It doesn't matter for PCS purposes whether the type is a Pure
207 Scalable Type or not, since the type will be handled the same
208 way regardless.
209
210 Specifically, this means that if the type is a Pure Scalable Type,
211 there aren't enough argument registers to hold it, and so it will
212 need to be passed or returned in memory. If the type isn't a
213 Pure Scalable Type, it's too big to be passed or returned in core
214 or SIMD&FP registers, and so again will need to go in memory. */
215 DOESNT_MATTER
216 };
217
218 /* Aggregates of 17 bytes or more are normally passed and returned
219 in memory, so aggregates of that size can safely be analyzed as
220 DOESNT_MATTER. We need to be able to collect enough pieces to
221 represent a PST that is smaller than that. Since predicates are
222 2 bytes in size for -msve-vector-bits=128, that means we need to be
223 able to store at least 8 pieces.
224
225 We also need to be able to store enough pieces to represent
226 a single vector in each vector argument register and a single
227 predicate in each predicate argument register. This means that
228 we need at least 12 pieces. */
229 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
230 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
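  /* With NUM_FP_ARG_REGS == 8 (v0-v7) and NUM_PR_ARG_REGS == 4 (p0-p3),
     MAX_PIECES is 12.  That also covers the worst case described above:
     a 16-byte aggregate of 2-byte predicates at -msve-vector-bits=128
     needs 8 pieces.  */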
231
232 /* Describes one piece of a PST. Each piece is one of:
233
234 - a single Scalable Vector Type (SVT)
235 - a single Scalable Predicate Type (SPT)
236 - a PST containing 2, 3 or 4 SVTs, with no padding
237
238 It either represents a single built-in type or a PST formed from
239 multiple homogeneous built-in types. */
240 struct piece
241 {
242 rtx get_rtx (unsigned int, unsigned int) const;
243
244 /* The number of vector and predicate registers that the piece
245 occupies. One of the two is always zero. */
246 unsigned int num_zr;
247 unsigned int num_pr;
248
249 /* The mode of the registers described above. */
250 machine_mode mode;
251
252 /* If this piece is formed from multiple homogeneous built-in types,
253 this is the mode of the built-in types, otherwise it is MODE. */
254 machine_mode orig_mode;
255
256 /* The offset in bytes of the piece from the start of the type. */
257 poly_uint64_pod offset;
258 };
259
260 /* Divides types analyzed as IS_PST into individual pieces. The pieces
261 are in memory order. */
262 auto_vec<piece, MAX_PIECES> pieces;
263
264 unsigned int num_zr () const;
265 unsigned int num_pr () const;
266
267 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
268
269 analysis_result analyze (const_tree);
270 bool analyze_registers (const_tree);
271
272 private:
273 analysis_result analyze_array (const_tree);
274 analysis_result analyze_record (const_tree);
275 void add_piece (const piece &);
276 };
277 }
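/* For illustration (based on the AAPCS64 and the SVE ACLE types, not on any
   code in this file): the built-in tuple type svfloat32x2_t is a Pure
   Scalable Type made up of two Scalable Vector Types, so analyze () would
   classify it as IS_PST with num_zr () == 2 and num_pr () == 0, while a
   single svbool_t is a PST with num_zr () == 0 and num_pr () == 1.
   An aggregate that also contains a non-SVE member such as "int" is not
   a Pure Scalable Type and is analyzed as ISNT_PST.  */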
278
279 /* The current code model. */
280 enum aarch64_code_model aarch64_cmodel;
281
282 /* The number of 64-bit elements in an SVE vector. */
283 poly_uint16 aarch64_sve_vg;
284
285 #ifdef HAVE_AS_TLS
286 #undef TARGET_HAVE_TLS
287 #define TARGET_HAVE_TLS 1
288 #endif
289
290 static bool aarch64_composite_type_p (const_tree, machine_mode);
291 static bool aarch64_return_in_memory_1 (const_tree);
292 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
293 const_tree,
294 machine_mode *, int *,
295 bool *, bool);
296 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
297 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
298 static void aarch64_override_options_after_change (void);
299 static bool aarch64_vector_mode_supported_p (machine_mode);
300 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
301 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
302 const_tree type,
303 int misalignment,
304 bool is_packed);
305 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
306 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
307 aarch64_addr_query_type);
308
309 /* The processor for which instructions should be scheduled. */
310 enum aarch64_processor aarch64_tune = cortexa53;
311
312 /* Mask to specify which instruction scheduling options should be used. */
313 uint64_t aarch64_tune_flags = 0;
314
315 /* Global flag for PC relative loads. */
316 bool aarch64_pcrelative_literal_loads;
317
318 /* Global flag for whether frame pointer is enabled. */
319 bool aarch64_use_frame_pointer;
320
321 #define BRANCH_PROTECT_STR_MAX 255
322 char *accepted_branch_protection_string = NULL;
323
324 static enum aarch64_parse_opt_result
325 aarch64_parse_branch_protection (const char*, char**);
326
327 /* Support for command line parsing of boolean flags in the tuning
328 structures. */
329 struct aarch64_flag_desc
330 {
331 const char* name;
332 unsigned int flag;
333 };
334
335 #define AARCH64_FUSION_PAIR(name, internal_name) \
336 { name, AARCH64_FUSE_##internal_name },
337 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
338 {
339 { "none", AARCH64_FUSE_NOTHING },
340 #include "aarch64-fusion-pairs.def"
341 { "all", AARCH64_FUSE_ALL },
342 { NULL, AARCH64_FUSE_NOTHING }
343 };
344
345 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
346 { name, AARCH64_EXTRA_TUNE_##internal_name },
347 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
348 {
349 { "none", AARCH64_EXTRA_TUNE_NONE },
350 #include "aarch64-tuning-flags.def"
351 { "all", AARCH64_EXTRA_TUNE_ALL },
352 { NULL, AARCH64_EXTRA_TUNE_NONE }
353 };
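/* As an illustration of how the two tables above are populated (the entry
   names themselves live in the .def files): a line such as

     AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD)

   in aarch64-fusion-pairs.def expands, via the #define above, to

     { "adrp+add", AARCH64_FUSE_ADRP_ADD },

   giving a name/flag pair that the -moverride parsing code can look up.  */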
354
355 /* Tuning parameters. */
356
357 static const struct cpu_addrcost_table generic_addrcost_table =
358 {
359 {
360 1, /* hi */
361 0, /* si */
362 0, /* di */
363 1, /* ti */
364 },
365 0, /* pre_modify */
366 0, /* post_modify */
367 0, /* post_modify_ld3_st3 */
368 0, /* post_modify_ld4_st4 */
369 0, /* register_offset */
370 0, /* register_sextend */
371 0, /* register_zextend */
372 0 /* imm_offset */
373 };
374
375 static const struct cpu_addrcost_table exynosm1_addrcost_table =
376 {
377 {
378 0, /* hi */
379 0, /* si */
380 0, /* di */
381 2, /* ti */
382 },
383 0, /* pre_modify */
384 0, /* post_modify */
385 0, /* post_modify_ld3_st3 */
386 0, /* post_modify_ld4_st4 */
387 1, /* register_offset */
388 1, /* register_sextend */
389 2, /* register_zextend */
390 0, /* imm_offset */
391 };
392
393 static const struct cpu_addrcost_table xgene1_addrcost_table =
394 {
395 {
396 1, /* hi */
397 0, /* si */
398 0, /* di */
399 1, /* ti */
400 },
401 1, /* pre_modify */
402 1, /* post_modify */
403 1, /* post_modify_ld3_st3 */
404 1, /* post_modify_ld4_st4 */
405 0, /* register_offset */
406 1, /* register_sextend */
407 1, /* register_zextend */
408 0, /* imm_offset */
409 };
410
411 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
412 {
413 {
414 1, /* hi */
415 1, /* si */
416 1, /* di */
417 2, /* ti */
418 },
419 0, /* pre_modify */
420 0, /* post_modify */
421 0, /* post_modify_ld3_st3 */
422 0, /* post_modify_ld4_st4 */
423 2, /* register_offset */
424 3, /* register_sextend */
425 3, /* register_zextend */
426 0, /* imm_offset */
427 };
428
429 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
430 {
431 {
432 1, /* hi */
433 1, /* si */
434 1, /* di */
435 2, /* ti */
436 },
437 0, /* pre_modify */
438 0, /* post_modify */
439 0, /* post_modify_ld3_st3 */
440 0, /* post_modify_ld4_st4 */
441 2, /* register_offset */
442 3, /* register_sextend */
443 3, /* register_zextend */
444 0, /* imm_offset */
445 };
446
447 static const struct cpu_addrcost_table tsv110_addrcost_table =
448 {
449 {
450 1, /* hi */
451 0, /* si */
452 0, /* di */
453 1, /* ti */
454 },
455 0, /* pre_modify */
456 0, /* post_modify */
457 0, /* post_modify_ld3_st3 */
458 0, /* post_modify_ld4_st4 */
459 0, /* register_offset */
460 1, /* register_sextend */
461 1, /* register_zextend */
462 0, /* imm_offset */
463 };
464
465 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
466 {
467 {
468 1, /* hi */
469 1, /* si */
470 1, /* di */
471 2, /* ti */
472 },
473 1, /* pre_modify */
474 1, /* post_modify */
475 1, /* post_modify_ld3_st3 */
476 1, /* post_modify_ld4_st4 */
477 3, /* register_offset */
478 3, /* register_sextend */
479 3, /* register_zextend */
480 2, /* imm_offset */
481 };
482
483 static const struct cpu_addrcost_table a64fx_addrcost_table =
484 {
485 {
486 1, /* hi */
487 1, /* si */
488 1, /* di */
489 2, /* ti */
490 },
491 0, /* pre_modify */
492 0, /* post_modify */
493 0, /* post_modify_ld3_st3 */
494 0, /* post_modify_ld4_st4 */
495 2, /* register_offset */
496 3, /* register_sextend */
497 3, /* register_zextend */
498 0, /* imm_offset */
499 };
500
501 static const struct cpu_addrcost_table neoversev1_addrcost_table =
502 {
503 {
504 1, /* hi */
505 0, /* si */
506 0, /* di */
507 1, /* ti */
508 },
509 0, /* pre_modify */
510 0, /* post_modify */
511 3, /* post_modify_ld3_st3 */
512 3, /* post_modify_ld4_st4 */
513 0, /* register_offset */
514 0, /* register_sextend */
515 0, /* register_zextend */
516 0 /* imm_offset */
517 };
518
519 static const struct cpu_addrcost_table neoversen2_addrcost_table =
520 {
521 {
522 1, /* hi */
523 0, /* si */
524 0, /* di */
525 1, /* ti */
526 },
527 0, /* pre_modify */
528 0, /* post_modify */
529 2, /* post_modify_ld3_st3 */
530 2, /* post_modify_ld4_st4 */
531 0, /* register_offset */
532 0, /* register_sextend */
533 0, /* register_zextend */
534 0 /* imm_offset */
535 };
536
537 static const struct cpu_addrcost_table neoversev2_addrcost_table =
538 {
539 {
540 1, /* hi */
541 0, /* si */
542 0, /* di */
543 1, /* ti */
544 },
545 0, /* pre_modify */
546 0, /* post_modify */
547 2, /* post_modify_ld3_st3 */
548 2, /* post_modify_ld4_st4 */
549 0, /* register_offset */
550 0, /* register_sextend */
551 0, /* register_zextend */
552 0 /* imm_offset */
553 };
554
555 static const struct cpu_regmove_cost generic_regmove_cost =
556 {
557 1, /* GP2GP */
558 /* Avoid the use of slow int<->fp moves for spilling by setting
559 their cost higher than memmov_cost. */
560 5, /* GP2FP */
561 5, /* FP2GP */
562 2 /* FP2FP */
563 };
564
565 static const struct cpu_regmove_cost cortexa57_regmove_cost =
566 {
567 1, /* GP2GP */
568 /* Avoid the use of slow int<->fp moves for spilling by setting
569 their cost higher than memmov_cost. */
570 5, /* GP2FP */
571 5, /* FP2GP */
572 2 /* FP2FP */
573 };
574
575 static const struct cpu_regmove_cost cortexa53_regmove_cost =
576 {
577 1, /* GP2GP */
578 /* Avoid the use of slow int<->fp moves for spilling by setting
579 their cost higher than memmov_cost. */
580 5, /* GP2FP */
581 5, /* FP2GP */
582 2 /* FP2FP */
583 };
584
585 static const struct cpu_regmove_cost exynosm1_regmove_cost =
586 {
587 1, /* GP2GP */
588 /* Avoid the use of slow int<->fp moves for spilling by setting
589 their cost higher than memmov_cost (the actual costs are 4 and 9). */
590 9, /* GP2FP */
591 9, /* FP2GP */
592 1 /* FP2FP */
593 };
594
595 static const struct cpu_regmove_cost thunderx_regmove_cost =
596 {
597 2, /* GP2GP */
598 2, /* GP2FP */
599 6, /* FP2GP */
600 4 /* FP2FP */
601 };
602
603 static const struct cpu_regmove_cost xgene1_regmove_cost =
604 {
605 1, /* GP2GP */
606 /* Avoid the use of slow int<->fp moves for spilling by setting
607 their cost higher than memmov_cost. */
608 8, /* GP2FP */
609 8, /* FP2GP */
610 2 /* FP2FP */
611 };
612
613 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
614 {
615 2, /* GP2GP */
616 /* Avoid the use of int<->fp moves for spilling. */
617 6, /* GP2FP */
618 6, /* FP2GP */
619 4 /* FP2FP */
620 };
621
622 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
623 {
624 1, /* GP2GP */
625 /* Avoid the use of int<->fp moves for spilling. */
626 5, /* GP2FP */
627 6, /* FP2GP */
628 3, /* FP2FP */
629 };
630
631 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
632 {
633 1, /* GP2GP */
634 /* Avoid the use of int<->fp moves for spilling. */
635 4, /* GP2FP */
636 5, /* FP2GP */
637 4 /* FP2FP */
638 };
639
640 static const struct cpu_regmove_cost tsv110_regmove_cost =
641 {
642 1, /* GP2GP */
643 /* Avoid the use of slow int<->fp moves for spilling by setting
644 their cost higher than memmov_cost. */
645 2, /* GP2FP */
646 3, /* FP2GP */
647 2 /* FP2FP */
648 };
649
650 static const struct cpu_regmove_cost a64fx_regmove_cost =
651 {
652 1, /* GP2GP */
653 /* Avoid the use of slow int<->fp moves for spilling by setting
654 their cost higher than memmov_cost. */
655 5, /* GP2FP */
656 7, /* FP2GP */
657 2 /* FP2FP */
658 };
659
660 static const struct cpu_regmove_cost neoversen2_regmove_cost =
661 {
662 1, /* GP2GP */
663 /* Spilling via int<->fp moves instead of to memory is recommended, so set
664 realistic costs compared to memmov_cost. */
665 3, /* GP2FP */
666 2, /* FP2GP */
667 2 /* FP2FP */
668 };
669
670 static const struct cpu_regmove_cost neoversev1_regmove_cost =
671 {
672 1, /* GP2GP */
673 /* Spilling via int<->fp moves instead of to memory is recommended, so set
674 realistic costs compared to memmov_cost. */
675 3, /* GP2FP */
676 2, /* FP2GP */
677 2 /* FP2FP */
678 };
679
680 static const struct cpu_regmove_cost neoversev2_regmove_cost =
681 {
682 1, /* GP2GP */
683 /* Spilling via int<->fp moves instead of to memory is recommended, so set
684 realistic costs compared to memmov_cost. */
685 3, /* GP2FP */
686 2, /* FP2GP */
687 2 /* FP2FP */
688 };
689
690 /* Generic costs for Advanced SIMD vector operations. */
691 static const advsimd_vec_cost generic_advsimd_vector_cost =
692 {
693 1, /* int_stmt_cost */
694 1, /* fp_stmt_cost */
695 0, /* ld2_st2_permute_cost */
696 0, /* ld3_st3_permute_cost */
697 0, /* ld4_st4_permute_cost */
698 2, /* permute_cost */
699 2, /* reduc_i8_cost */
700 2, /* reduc_i16_cost */
701 2, /* reduc_i32_cost */
702 2, /* reduc_i64_cost */
703 2, /* reduc_f16_cost */
704 2, /* reduc_f32_cost */
705 2, /* reduc_f64_cost */
706 2, /* store_elt_extra_cost */
707 2, /* vec_to_scalar_cost */
708 1, /* scalar_to_vec_cost */
709 1, /* align_load_cost */
710 1, /* unalign_load_cost */
711 1, /* unalign_store_cost */
712 1 /* store_cost */
713 };
714
715 /* Generic costs for SVE vector operations. */
716 static const sve_vec_cost generic_sve_vector_cost =
717 {
718 {
719 1, /* int_stmt_cost */
720 1, /* fp_stmt_cost */
721 0, /* ld2_st2_permute_cost */
722 0, /* ld3_st3_permute_cost */
723 0, /* ld4_st4_permute_cost */
724 2, /* permute_cost */
725 2, /* reduc_i8_cost */
726 2, /* reduc_i16_cost */
727 2, /* reduc_i32_cost */
728 2, /* reduc_i64_cost */
729 2, /* reduc_f16_cost */
730 2, /* reduc_f32_cost */
731 2, /* reduc_f64_cost */
732 2, /* store_elt_extra_cost */
733 2, /* vec_to_scalar_cost */
734 1, /* scalar_to_vec_cost */
735 1, /* align_load_cost */
736 1, /* unalign_load_cost */
737 1, /* unalign_store_cost */
738 1 /* store_cost */
739 },
740 2, /* clast_cost */
741 2, /* fadda_f16_cost */
742 2, /* fadda_f32_cost */
743 2, /* fadda_f64_cost */
744 4, /* gather_load_x32_cost */
745 2, /* gather_load_x64_cost */
746 1 /* scatter_store_elt_cost */
747 };
748
749 /* Generic costs for vector insn classes. */
750 static const struct cpu_vector_cost generic_vector_cost =
751 {
752 1, /* scalar_int_stmt_cost */
753 1, /* scalar_fp_stmt_cost */
754 1, /* scalar_load_cost */
755 1, /* scalar_store_cost */
756 3, /* cond_taken_branch_cost */
757 1, /* cond_not_taken_branch_cost */
758 &generic_advsimd_vector_cost, /* advsimd */
759 &generic_sve_vector_cost, /* sve */
760 nullptr /* issue_info */
761 };
762
763 static const advsimd_vec_cost a64fx_advsimd_vector_cost =
764 {
765 2, /* int_stmt_cost */
766 5, /* fp_stmt_cost */
767 0, /* ld2_st2_permute_cost */
768 0, /* ld3_st3_permute_cost */
769 0, /* ld4_st4_permute_cost */
770 3, /* permute_cost */
771 13, /* reduc_i8_cost */
772 13, /* reduc_i16_cost */
773 13, /* reduc_i32_cost */
774 13, /* reduc_i64_cost */
775 13, /* reduc_f16_cost */
776 13, /* reduc_f32_cost */
777 13, /* reduc_f64_cost */
778 13, /* store_elt_extra_cost */
779 13, /* vec_to_scalar_cost */
780 4, /* scalar_to_vec_cost */
781 6, /* align_load_cost */
782 6, /* unalign_load_cost */
783 1, /* unalign_store_cost */
784 1 /* store_cost */
785 };
786
787 static const sve_vec_cost a64fx_sve_vector_cost =
788 {
789 {
790 2, /* int_stmt_cost */
791 5, /* fp_stmt_cost */
792 0, /* ld2_st2_permute_cost */
793 0, /* ld3_st3_permute_cost */
794 0, /* ld4_st4_permute_cost */
795 3, /* permute_cost */
796 13, /* reduc_i8_cost */
797 13, /* reduc_i16_cost */
798 13, /* reduc_i32_cost */
799 13, /* reduc_i64_cost */
800 13, /* reduc_f16_cost */
801 13, /* reduc_f32_cost */
802 13, /* reduc_f64_cost */
803 13, /* store_elt_extra_cost */
804 13, /* vec_to_scalar_cost */
805 4, /* scalar_to_vec_cost */
806 6, /* align_load_cost */
807 6, /* unalign_load_cost */
808 1, /* unalign_store_cost */
809 1 /* store_cost */
810 },
811 13, /* clast_cost */
812 13, /* fadda_f16_cost */
813 13, /* fadda_f32_cost */
814 13, /* fadda_f64_cost */
815 64, /* gather_load_x32_cost */
816 32, /* gather_load_x64_cost */
817 1 /* scatter_store_elt_cost */
818 };
819
820 static const struct cpu_vector_cost a64fx_vector_cost =
821 {
822 1, /* scalar_int_stmt_cost */
823 5, /* scalar_fp_stmt_cost */
824 4, /* scalar_load_cost */
825 1, /* scalar_store_cost */
826 3, /* cond_taken_branch_cost */
827 1, /* cond_not_taken_branch_cost */
828 &a64fx_advsimd_vector_cost, /* advsimd */
829 &a64fx_sve_vector_cost, /* sve */
830 nullptr /* issue_info */
831 };
832
833 static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
834 {
835 1, /* int_stmt_cost */
836 3, /* fp_stmt_cost */
837 0, /* ld2_st2_permute_cost */
838 0, /* ld3_st3_permute_cost */
839 0, /* ld4_st4_permute_cost */
840 2, /* permute_cost */
841 1, /* reduc_i8_cost */
842 1, /* reduc_i16_cost */
843 1, /* reduc_i32_cost */
844 1, /* reduc_i64_cost */
845 1, /* reduc_f16_cost */
846 1, /* reduc_f32_cost */
847 1, /* reduc_f64_cost */
848 1, /* store_elt_extra_cost */
849 1, /* vec_to_scalar_cost */
850 1, /* scalar_to_vec_cost */
851 1, /* align_load_cost */
852 1, /* unalign_load_cost */
853 1, /* unalign_store_cost */
854 1 /* store_cost */
855 };
856
857 /* QDF24XX costs for vector insn classes. */
858 static const struct cpu_vector_cost qdf24xx_vector_cost =
859 {
860 1, /* scalar_int_stmt_cost */
861 1, /* scalar_fp_stmt_cost */
862 1, /* scalar_load_cost */
863 1, /* scalar_store_cost */
864 3, /* cond_taken_branch_cost */
865 1, /* cond_not_taken_branch_cost */
866 &qdf24xx_advsimd_vector_cost, /* advsimd */
867 nullptr, /* sve */
868 nullptr /* issue_info */
869 };
870
871
872 static const advsimd_vec_cost thunderx_advsimd_vector_cost =
873 {
874 4, /* int_stmt_cost */
875 1, /* fp_stmt_cost */
876 0, /* ld2_st2_permute_cost */
877 0, /* ld3_st3_permute_cost */
878 0, /* ld4_st4_permute_cost */
879 4, /* permute_cost */
880 2, /* reduc_i8_cost */
881 2, /* reduc_i16_cost */
882 2, /* reduc_i32_cost */
883 2, /* reduc_i64_cost */
884 2, /* reduc_f16_cost */
885 2, /* reduc_f32_cost */
886 2, /* reduc_f64_cost */
887 2, /* store_elt_extra_cost */
888 2, /* vec_to_scalar_cost */
889 2, /* scalar_to_vec_cost */
890 3, /* align_load_cost */
891 5, /* unalign_load_cost */
892 5, /* unalign_store_cost */
893 1 /* store_cost */
894 };
895
896 /* ThunderX costs for vector insn classes. */
897 static const struct cpu_vector_cost thunderx_vector_cost =
898 {
899 1, /* scalar_int_stmt_cost */
900 1, /* scalar_fp_stmt_cost */
901 3, /* scalar_load_cost */
902 1, /* scalar_store_cost */
903 3, /* cond_taken_branch_cost */
904 3, /* cond_not_taken_branch_cost */
905 &thunderx_advsimd_vector_cost, /* advsimd */
906 nullptr, /* sve */
907 nullptr /* issue_info */
908 };
909
910 static const advsimd_vec_cost tsv110_advsimd_vector_cost =
911 {
912 2, /* int_stmt_cost */
913 2, /* fp_stmt_cost */
914 0, /* ld2_st2_permute_cost */
915 0, /* ld3_st3_permute_cost */
916 0, /* ld4_st4_permute_cost */
917 2, /* permute_cost */
918 3, /* reduc_i8_cost */
919 3, /* reduc_i16_cost */
920 3, /* reduc_i32_cost */
921 3, /* reduc_i64_cost */
922 3, /* reduc_f16_cost */
923 3, /* reduc_f32_cost */
924 3, /* reduc_f64_cost */
925 3, /* store_elt_extra_cost */
926 3, /* vec_to_scalar_cost */
927 2, /* scalar_to_vec_cost */
928 5, /* align_load_cost */
929 5, /* unalign_load_cost */
930 1, /* unalign_store_cost */
931 1 /* store_cost */
932 };
933
934 static const struct cpu_vector_cost tsv110_vector_cost =
935 {
936 1, /* scalar_int_stmt_cost */
937 1, /* scalar_fp_stmt_cost */
938 5, /* scalar_load_cost */
939 1, /* scalar_store_cost */
940 1, /* cond_taken_branch_cost */
941 1, /* cond_not_taken_branch_cost */
942 &tsv110_advsimd_vector_cost, /* advsimd */
943 nullptr, /* sve */
944 nullptr /* issue_info */
945 };
946
947 static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
948 {
949 2, /* int_stmt_cost */
950 2, /* fp_stmt_cost */
951 0, /* ld2_st2_permute_cost */
952 0, /* ld3_st3_permute_cost */
953 0, /* ld4_st4_permute_cost */
954 3, /* permute_cost */
955 8, /* reduc_i8_cost */
956 8, /* reduc_i16_cost */
957 8, /* reduc_i32_cost */
958 8, /* reduc_i64_cost */
959 8, /* reduc_f16_cost */
960 8, /* reduc_f32_cost */
961 8, /* reduc_f64_cost */
962 8, /* store_elt_extra_cost */
963 8, /* vec_to_scalar_cost */
964 8, /* scalar_to_vec_cost */
965 4, /* align_load_cost */
966 4, /* unalign_load_cost */
967 1, /* unalign_store_cost */
968 1 /* store_cost */
969 };
970
971 /* Cortex-A57 costs for vector insn classes. */
972 static const struct cpu_vector_cost cortexa57_vector_cost =
973 {
974 1, /* scalar_int_stmt_cost */
975 1, /* scalar_fp_stmt_cost */
976 4, /* scalar_load_cost */
977 1, /* scalar_store_cost */
978 1, /* cond_taken_branch_cost */
979 1, /* cond_not_taken_branch_cost */
980 &cortexa57_advsimd_vector_cost, /* advsimd */
981 nullptr, /* sve */
982 nullptr /* issue_info */
983 };
984
985 static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
986 {
987 3, /* int_stmt_cost */
988 3, /* fp_stmt_cost */
989 0, /* ld2_st2_permute_cost */
990 0, /* ld3_st3_permute_cost */
991 0, /* ld4_st4_permute_cost */
992 3, /* permute_cost */
993 3, /* reduc_i8_cost */
994 3, /* reduc_i16_cost */
995 3, /* reduc_i32_cost */
996 3, /* reduc_i64_cost */
997 3, /* reduc_f16_cost */
998 3, /* reduc_f32_cost */
999 3, /* reduc_f64_cost */
1000 3, /* store_elt_extra_cost */
1001 3, /* vec_to_scalar_cost */
1002 3, /* scalar_to_vec_cost */
1003 5, /* align_load_cost */
1004 5, /* unalign_load_cost */
1005 1, /* unalign_store_cost */
1006 1 /* store_cost */
1007 };
1008
1009 static const struct cpu_vector_cost exynosm1_vector_cost =
1010 {
1011 1, /* scalar_int_stmt_cost */
1012 1, /* scalar_fp_stmt_cost */
1013 5, /* scalar_load_cost */
1014 1, /* scalar_store_cost */
1015 1, /* cond_taken_branch_cost */
1016 1, /* cond_not_taken_branch_cost */
1017 &exynosm1_advsimd_vector_cost, /* advsimd */
1018 nullptr, /* sve */
1019 nullptr /* issue_info */
1020 };
1021
1022 static const advsimd_vec_cost xgene1_advsimd_vector_cost =
1023 {
1024 2, /* int_stmt_cost */
1025 2, /* fp_stmt_cost */
1026 0, /* ld2_st2_permute_cost */
1027 0, /* ld3_st3_permute_cost */
1028 0, /* ld4_st4_permute_cost */
1029 2, /* permute_cost */
1030 4, /* reduc_i8_cost */
1031 4, /* reduc_i16_cost */
1032 4, /* reduc_i32_cost */
1033 4, /* reduc_i64_cost */
1034 4, /* reduc_f16_cost */
1035 4, /* reduc_f32_cost */
1036 4, /* reduc_f64_cost */
1037 4, /* store_elt_extra_cost */
1038 4, /* vec_to_scalar_cost */
1039 4, /* scalar_to_vec_cost */
1040 10, /* align_load_cost */
1041 10, /* unalign_load_cost */
1042 2, /* unalign_store_cost */
1043 2 /* store_cost */
1044 };
1045
1046 /* X-Gene 1 costs for vector insn classes. */
1047 static const struct cpu_vector_cost xgene1_vector_cost =
1048 {
1049 1, /* scalar_int_stmt_cost */
1050 1, /* scalar_fp_stmt_cost */
1051 5, /* scalar_load_cost */
1052 1, /* scalar_store_cost */
1053 2, /* cond_taken_branch_cost */
1054 1, /* cond_not_taken_branch_cost */
1055 &xgene1_advsimd_vector_cost, /* advsimd */
1056 nullptr, /* sve */
1057 nullptr /* issue_info */
1058 };
1059
1060 static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
1061 {
1062 4, /* int_stmt_cost */
1063 5, /* fp_stmt_cost */
1064 0, /* ld2_st2_permute_cost */
1065 0, /* ld3_st3_permute_cost */
1066 0, /* ld4_st4_permute_cost */
1067 10, /* permute_cost */
1068 6, /* reduc_i8_cost */
1069 6, /* reduc_i16_cost */
1070 6, /* reduc_i32_cost */
1071 6, /* reduc_i64_cost */
1072 6, /* reduc_f16_cost */
1073 6, /* reduc_f32_cost */
1074 6, /* reduc_f64_cost */
1075 6, /* store_elt_extra_cost */
1076 6, /* vec_to_scalar_cost */
1077 5, /* scalar_to_vec_cost */
1078 4, /* align_load_cost */
1079 4, /* unalign_load_cost */
1080 1, /* unalign_store_cost */
1081 1 /* store_cost */
1082 };
1083
1084 /* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan). */
1085 static const struct cpu_vector_cost thunderx2t99_vector_cost =
1086 {
1087 1, /* scalar_int_stmt_cost */
1088 6, /* scalar_fp_stmt_cost */
1089 4, /* scalar_load_cost */
1090 1, /* scalar_store_cost */
1091 2, /* cond_taken_branch_cost */
1092 1, /* cond_not_taken_branch_cost */
1093 &thunderx2t99_advsimd_vector_cost, /* advsimd */
1094 nullptr, /* sve */
1095 nullptr /* issue_info */
1096 };
1097
1098 static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
1099 {
1100 5, /* int_stmt_cost */
1101 5, /* fp_stmt_cost */
1102 0, /* ld2_st2_permute_cost */
1103 0, /* ld3_st3_permute_cost */
1104 0, /* ld4_st4_permute_cost */
1105 10, /* permute_cost */
1106 5, /* reduc_i8_cost */
1107 5, /* reduc_i16_cost */
1108 5, /* reduc_i32_cost */
1109 5, /* reduc_i64_cost */
1110 5, /* reduc_f16_cost */
1111 5, /* reduc_f32_cost */
1112 5, /* reduc_f64_cost */
1113 5, /* store_elt_extra_cost */
1114 5, /* vec_to_scalar_cost */
1115 5, /* scalar_to_vec_cost */
1116 4, /* align_load_cost */
1117 4, /* unalign_load_cost */
1118 4, /* unalign_store_cost */
1119 4 /* store_cost */
1120 };
1121
1122 static const struct cpu_vector_cost thunderx3t110_vector_cost =
1123 {
1124 1, /* scalar_int_stmt_cost */
1125 5, /* scalar_fp_stmt_cost */
1126 4, /* scalar_load_cost */
1127 1, /* scalar_store_cost */
1128 2, /* cond_taken_branch_cost */
1129 1, /* cond_not_taken_branch_cost */
1130 &thunderx3t110_advsimd_vector_cost, /* advsimd */
1131 nullptr, /* sve */
1132 nullptr /* issue_info */
1133 };
1134
1135 static const advsimd_vec_cost ampere1_advsimd_vector_cost =
1136 {
1137 3, /* int_stmt_cost */
1138 3, /* fp_stmt_cost */
1139 0, /* ld2_st2_permute_cost */
1140 0, /* ld3_st3_permute_cost */
1141 0, /* ld4_st4_permute_cost */
1142 2, /* permute_cost */
1143 12, /* reduc_i8_cost */
1144 9, /* reduc_i16_cost */
1145 6, /* reduc_i32_cost */
1146 5, /* reduc_i64_cost */
1147 9, /* reduc_f16_cost */
1148 6, /* reduc_f32_cost */
1149 5, /* reduc_f64_cost */
1150 8, /* store_elt_extra_cost */
1151 6, /* vec_to_scalar_cost */
1152 7, /* scalar_to_vec_cost */
1153 5, /* align_load_cost */
1154 5, /* unalign_load_cost */
1155 2, /* unalign_store_cost */
1156 2 /* store_cost */
1157 };
1158
1159 /* Ampere-1 costs for vector insn classes. */
1160 static const struct cpu_vector_cost ampere1_vector_cost =
1161 {
1162 1, /* scalar_int_stmt_cost */
1163 1, /* scalar_fp_stmt_cost */
1164 4, /* scalar_load_cost */
1165 1, /* scalar_store_cost */
1166 1, /* cond_taken_branch_cost */
1167 1, /* cond_not_taken_branch_cost */
1168 &ampere1_advsimd_vector_cost, /* advsimd */
1169 nullptr, /* sve */
1170 nullptr /* issue_info */
1171 };
1172
1173 /* Generic costs for branch instructions. */
1174 static const struct cpu_branch_cost generic_branch_cost =
1175 {
1176 1, /* Predictable. */
1177 3 /* Unpredictable. */
1178 };
1179
1180 /* Generic approximation modes. */
1181 static const cpu_approx_modes generic_approx_modes =
1182 {
1183 AARCH64_APPROX_NONE, /* division */
1184 AARCH64_APPROX_NONE, /* sqrt */
1185 AARCH64_APPROX_NONE /* recip_sqrt */
1186 };
1187
1188 /* Approximation modes for Exynos M1. */
1189 static const cpu_approx_modes exynosm1_approx_modes =
1190 {
1191 AARCH64_APPROX_NONE, /* division */
1192 AARCH64_APPROX_ALL, /* sqrt */
1193 AARCH64_APPROX_ALL /* recip_sqrt */
1194 };
1195
1196 /* Approximation modes for X-Gene 1. */
1197 static const cpu_approx_modes xgene1_approx_modes =
1198 {
1199 AARCH64_APPROX_NONE, /* division */
1200 AARCH64_APPROX_NONE, /* sqrt */
1201 AARCH64_APPROX_ALL /* recip_sqrt */
1202 };
1203
1204 /* Generic prefetch settings (which disable prefetch). */
1205 static const cpu_prefetch_tune generic_prefetch_tune =
1206 {
1207 0, /* num_slots */
1208 -1, /* l1_cache_size */
1209 -1, /* l1_cache_line_size */
1210 -1, /* l2_cache_size */
1211 true, /* prefetch_dynamic_strides */
1212 -1, /* minimum_stride */
1213 -1 /* default_opt_level */
1214 };
1215
1216 static const cpu_prefetch_tune exynosm1_prefetch_tune =
1217 {
1218 0, /* num_slots */
1219 -1, /* l1_cache_size */
1220 64, /* l1_cache_line_size */
1221 -1, /* l2_cache_size */
1222 true, /* prefetch_dynamic_strides */
1223 -1, /* minimum_stride */
1224 -1 /* default_opt_level */
1225 };
1226
1227 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
1228 {
1229 4, /* num_slots */
1230 32, /* l1_cache_size */
1231 64, /* l1_cache_line_size */
1232 512, /* l2_cache_size */
1233 false, /* prefetch_dynamic_strides */
1234 2048, /* minimum_stride */
1235 3 /* default_opt_level */
1236 };
1237
1238 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
1239 {
1240 8, /* num_slots */
1241 32, /* l1_cache_size */
1242 128, /* l1_cache_line_size */
1243 16*1024, /* l2_cache_size */
1244 true, /* prefetch_dynamic_strides */
1245 -1, /* minimum_stride */
1246 3 /* default_opt_level */
1247 };
1248
1249 static const cpu_prefetch_tune thunderx_prefetch_tune =
1250 {
1251 8, /* num_slots */
1252 32, /* l1_cache_size */
1253 128, /* l1_cache_line_size */
1254 -1, /* l2_cache_size */
1255 true, /* prefetch_dynamic_strides */
1256 -1, /* minimum_stride */
1257 -1 /* default_opt_level */
1258 };
1259
1260 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
1261 {
1262 8, /* num_slots */
1263 32, /* l1_cache_size */
1264 64, /* l1_cache_line_size */
1265 256, /* l2_cache_size */
1266 true, /* prefetch_dynamic_strides */
1267 -1, /* minimum_stride */
1268 -1 /* default_opt_level */
1269 };
1270
1271 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
1272 {
1273 8, /* num_slots */
1274 32, /* l1_cache_size */
1275 64, /* l1_cache_line_size */
1276 256, /* l2_cache_size */
1277 true, /* prefetch_dynamic_strides */
1278 -1, /* minimum_stride */
1279 -1 /* default_opt_level */
1280 };
1281
1282 static const cpu_prefetch_tune tsv110_prefetch_tune =
1283 {
1284 0, /* num_slots */
1285 64, /* l1_cache_size */
1286 64, /* l1_cache_line_size */
1287 512, /* l2_cache_size */
1288 true, /* prefetch_dynamic_strides */
1289 -1, /* minimum_stride */
1290 -1 /* default_opt_level */
1291 };
1292
1293 static const cpu_prefetch_tune xgene1_prefetch_tune =
1294 {
1295 8, /* num_slots */
1296 32, /* l1_cache_size */
1297 64, /* l1_cache_line_size */
1298 256, /* l2_cache_size */
1299 true, /* prefetch_dynamic_strides */
1300 -1, /* minimum_stride */
1301 -1 /* default_opt_level */
1302 };
1303
1304 static const cpu_prefetch_tune a64fx_prefetch_tune =
1305 {
1306 8, /* num_slots */
1307 64, /* l1_cache_size */
1308 256, /* l1_cache_line_size */
1309 32768, /* l2_cache_size */
1310 true, /* prefetch_dynamic_strides */
1311 -1, /* minimum_stride */
1312 -1 /* default_opt_level */
1313 };
1314
1315 static const cpu_prefetch_tune ampere1_prefetch_tune =
1316 {
1317 0, /* num_slots */
1318 64, /* l1_cache_size */
1319 64, /* l1_cache_line_size */
1320 2048, /* l2_cache_size */
1321 true, /* prefetch_dynamic_strides */
1322 -1, /* minimum_stride */
1323 -1 /* default_opt_level */
1324 };
1325
1326 static const struct tune_params generic_tunings =
1327 {
1328 &cortexa57_extra_costs,
1329 &generic_addrcost_table,
1330 &generic_regmove_cost,
1331 &generic_vector_cost,
1332 &generic_branch_cost,
1333 &generic_approx_modes,
1334 SVE_NOT_IMPLEMENTED, /* sve_width */
1335 { 4, /* load_int. */
1336 4, /* store_int. */
1337 4, /* load_fp. */
1338 4, /* store_fp. */
1339 4, /* load_pred. */
1340 4 /* store_pred. */
1341 }, /* memmov_cost. */
1342 2, /* issue_rate */
1343 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1344 "16:12", /* function_align. */
1345 "4", /* jump_align. */
1346 "8", /* loop_align. */
1347 2, /* int_reassoc_width. */
1348 4, /* fp_reassoc_width. */
1349 1, /* fma_reassoc_width. */
1350 1, /* vec_reassoc_width. */
1351 2, /* min_div_recip_mul_sf. */
1352 2, /* min_div_recip_mul_df. */
1353 0, /* max_case_values. */
1354 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1355 /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
1356 Neoverse V1. It does not have a noticeable effect on A64FX and should
1357 have at most a very minor effect on SVE2 cores. */
1358 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
1359 &generic_prefetch_tune
1360 };
1361
1362 static const struct tune_params cortexa35_tunings =
1363 {
1364 &cortexa53_extra_costs,
1365 &generic_addrcost_table,
1366 &cortexa53_regmove_cost,
1367 &generic_vector_cost,
1368 &generic_branch_cost,
1369 &generic_approx_modes,
1370 SVE_NOT_IMPLEMENTED, /* sve_width */
1371 { 4, /* load_int. */
1372 4, /* store_int. */
1373 4, /* load_fp. */
1374 4, /* store_fp. */
1375 4, /* load_pred. */
1376 4 /* store_pred. */
1377 }, /* memmov_cost. */
1378 1, /* issue_rate */
1379 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1380 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1381 "16", /* function_align. */
1382 "4", /* jump_align. */
1383 "8", /* loop_align. */
1384 2, /* int_reassoc_width. */
1385 4, /* fp_reassoc_width. */
1386 1, /* fma_reassoc_width. */
1387 1, /* vec_reassoc_width. */
1388 2, /* min_div_recip_mul_sf. */
1389 2, /* min_div_recip_mul_df. */
1390 0, /* max_case_values. */
1391 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1392 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1393 &generic_prefetch_tune
1394 };
1395
1396 static const struct tune_params cortexa53_tunings =
1397 {
1398 &cortexa53_extra_costs,
1399 &generic_addrcost_table,
1400 &cortexa53_regmove_cost,
1401 &generic_vector_cost,
1402 &generic_branch_cost,
1403 &generic_approx_modes,
1404 SVE_NOT_IMPLEMENTED, /* sve_width */
1405 { 4, /* load_int. */
1406 4, /* store_int. */
1407 4, /* load_fp. */
1408 4, /* store_fp. */
1409 4, /* load_pred. */
1410 4 /* store_pred. */
1411 }, /* memmov_cost. */
1412 2, /* issue_rate */
1413 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1414 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1415 "16", /* function_align. */
1416 "4", /* jump_align. */
1417 "8", /* loop_align. */
1418 2, /* int_reassoc_width. */
1419 4, /* fp_reassoc_width. */
1420 1, /* fma_reassoc_width. */
1421 1, /* vec_reassoc_width. */
1422 2, /* min_div_recip_mul_sf. */
1423 2, /* min_div_recip_mul_df. */
1424 0, /* max_case_values. */
1425 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1426 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1427 &generic_prefetch_tune
1428 };
1429
1430 static const struct tune_params cortexa57_tunings =
1431 {
1432 &cortexa57_extra_costs,
1433 &generic_addrcost_table,
1434 &cortexa57_regmove_cost,
1435 &cortexa57_vector_cost,
1436 &generic_branch_cost,
1437 &generic_approx_modes,
1438 SVE_NOT_IMPLEMENTED, /* sve_width */
1439 { 4, /* load_int. */
1440 4, /* store_int. */
1441 4, /* load_fp. */
1442 4, /* store_fp. */
1443 4, /* load_pred. */
1444 4 /* store_pred. */
1445 }, /* memmov_cost. */
1446 3, /* issue_rate */
1447 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1448 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1449 "16", /* function_align. */
1450 "4", /* jump_align. */
1451 "8", /* loop_align. */
1452 2, /* int_reassoc_width. */
1453 4, /* fp_reassoc_width. */
1454 1, /* fma_reassoc_width. */
1455 1, /* vec_reassoc_width. */
1456 2, /* min_div_recip_mul_sf. */
1457 2, /* min_div_recip_mul_df. */
1458 0, /* max_case_values. */
1459 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1460 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
1461 &generic_prefetch_tune
1462 };
1463
1464 static const struct tune_params cortexa72_tunings =
1465 {
1466 &cortexa57_extra_costs,
1467 &generic_addrcost_table,
1468 &cortexa57_regmove_cost,
1469 &cortexa57_vector_cost,
1470 &generic_branch_cost,
1471 &generic_approx_modes,
1472 SVE_NOT_IMPLEMENTED, /* sve_width */
1473 { 4, /* load_int. */
1474 4, /* store_int. */
1475 4, /* load_fp. */
1476 4, /* store_fp. */
1477 4, /* load_pred. */
1478 4 /* store_pred. */
1479 }, /* memmov_cost. */
1480 3, /* issue_rate */
1481 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1482 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1483 "16", /* function_align. */
1484 "4", /* jump_align. */
1485 "8", /* loop_align. */
1486 2, /* int_reassoc_width. */
1487 4, /* fp_reassoc_width. */
1488 1, /* fma_reassoc_width. */
1489 1, /* vec_reassoc_width. */
1490 2, /* min_div_recip_mul_sf. */
1491 2, /* min_div_recip_mul_df. */
1492 0, /* max_case_values. */
1493 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1494 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1495 &generic_prefetch_tune
1496 };
1497
1498 static const struct tune_params cortexa73_tunings =
1499 {
1500 &cortexa57_extra_costs,
1501 &generic_addrcost_table,
1502 &cortexa57_regmove_cost,
1503 &cortexa57_vector_cost,
1504 &generic_branch_cost,
1505 &generic_approx_modes,
1506 SVE_NOT_IMPLEMENTED, /* sve_width */
1507 { 4, /* load_int. */
1508 4, /* store_int. */
1509 4, /* load_fp. */
1510 4, /* store_fp. */
1511 4, /* load_pred. */
1512 4 /* store_pred. */
1513 }, /* memmov_cost. */
1514 2, /* issue_rate. */
1515 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1516 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1517 "16", /* function_align. */
1518 "4", /* jump_align. */
1519 "8", /* loop_align. */
1520 2, /* int_reassoc_width. */
1521 4, /* fp_reassoc_width. */
1522 1, /* fma_reassoc_width. */
1523 1, /* vec_reassoc_width. */
1524 2, /* min_div_recip_mul_sf. */
1525 2, /* min_div_recip_mul_df. */
1526 0, /* max_case_values. */
1527 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1528 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1529 &generic_prefetch_tune
1530 };
1531
1532
1533
1534 static const struct tune_params exynosm1_tunings =
1535 {
1536 &exynosm1_extra_costs,
1537 &exynosm1_addrcost_table,
1538 &exynosm1_regmove_cost,
1539 &exynosm1_vector_cost,
1540 &generic_branch_cost,
1541 &exynosm1_approx_modes,
1542 SVE_NOT_IMPLEMENTED, /* sve_width */
1543 { 4, /* load_int. */
1544 4, /* store_int. */
1545 4, /* load_fp. */
1546 4, /* store_fp. */
1547 4, /* load_pred. */
1548 4 /* store_pred. */
1549 }, /* memmov_cost. */
1550 3, /* issue_rate */
1551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
1552 "4", /* function_align. */
1553 "4", /* jump_align. */
1554 "4", /* loop_align. */
1555 2, /* int_reassoc_width. */
1556 4, /* fp_reassoc_width. */
1557 1, /* fma_reassoc_width. */
1558 1, /* vec_reassoc_width. */
1559 2, /* min_div_recip_mul_sf. */
1560 2, /* min_div_recip_mul_df. */
1561 48, /* max_case_values. */
1562 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1563 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1564 &exynosm1_prefetch_tune
1565 };
1566
1567 static const struct tune_params thunderxt88_tunings =
1568 {
1569 &thunderx_extra_costs,
1570 &generic_addrcost_table,
1571 &thunderx_regmove_cost,
1572 &thunderx_vector_cost,
1573 &generic_branch_cost,
1574 &generic_approx_modes,
1575 SVE_NOT_IMPLEMENTED, /* sve_width */
1576 { 6, /* load_int. */
1577 6, /* store_int. */
1578 6, /* load_fp. */
1579 6, /* store_fp. */
1580 6, /* load_pred. */
1581 6 /* store_pred. */
1582 }, /* memmov_cost. */
1583 2, /* issue_rate */
1584 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1585 "8", /* function_align. */
1586 "8", /* jump_align. */
1587 "8", /* loop_align. */
1588 2, /* int_reassoc_width. */
1589 4, /* fp_reassoc_width. */
1590 1, /* fma_reassoc_width. */
1591 1, /* vec_reassoc_width. */
1592 2, /* min_div_recip_mul_sf. */
1593 2, /* min_div_recip_mul_df. */
1594 0, /* max_case_values. */
1595 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1596 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1597 &thunderxt88_prefetch_tune
1598 };
1599
1600 static const struct tune_params thunderx_tunings =
1601 {
1602 &thunderx_extra_costs,
1603 &generic_addrcost_table,
1604 &thunderx_regmove_cost,
1605 &thunderx_vector_cost,
1606 &generic_branch_cost,
1607 &generic_approx_modes,
1608 SVE_NOT_IMPLEMENTED, /* sve_width */
1609 { 6, /* load_int. */
1610 6, /* store_int. */
1611 6, /* load_fp. */
1612 6, /* store_fp. */
1613 6, /* load_pred. */
1614 6 /* store_pred. */
1615 }, /* memmov_cost. */
1616 2, /* issue_rate */
1617 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1618 "8", /* function_align. */
1619 "8", /* jump_align. */
1620 "8", /* loop_align. */
1621 2, /* int_reassoc_width. */
1622 4, /* fp_reassoc_width. */
1623 1, /* fma_reassoc_width. */
1624 1, /* vec_reassoc_width. */
1625 2, /* min_div_recip_mul_sf. */
1626 2, /* min_div_recip_mul_df. */
1627 0, /* max_case_values. */
1628 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1629 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1630 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1631 &thunderx_prefetch_tune
1632 };
1633
1634 static const struct tune_params tsv110_tunings =
1635 {
1636 &tsv110_extra_costs,
1637 &tsv110_addrcost_table,
1638 &tsv110_regmove_cost,
1639 &tsv110_vector_cost,
1640 &generic_branch_cost,
1641 &generic_approx_modes,
1642 SVE_NOT_IMPLEMENTED, /* sve_width */
1643 { 4, /* load_int. */
1644 4, /* store_int. */
1645 4, /* load_fp. */
1646 4, /* store_fp. */
1647 4, /* load_pred. */
1648 4 /* store_pred. */
1649 }, /* memmov_cost. */
1650 4, /* issue_rate */
1651 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1652 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1653 "16", /* function_align. */
1654 "4", /* jump_align. */
1655 "8", /* loop_align. */
1656 2, /* int_reassoc_width. */
1657 4, /* fp_reassoc_width. */
1658 1, /* fma_reassoc_width. */
1659 1, /* vec_reassoc_width. */
1660 2, /* min_div_recip_mul_sf. */
1661 2, /* min_div_recip_mul_df. */
1662 0, /* max_case_values. */
1663 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1664 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1665 &tsv110_prefetch_tune
1666 };
1667
1668 static const struct tune_params xgene1_tunings =
1669 {
1670 &xgene1_extra_costs,
1671 &xgene1_addrcost_table,
1672 &xgene1_regmove_cost,
1673 &xgene1_vector_cost,
1674 &generic_branch_cost,
1675 &xgene1_approx_modes,
1676 SVE_NOT_IMPLEMENTED, /* sve_width */
1677 { 6, /* load_int. */
1678 6, /* store_int. */
1679 6, /* load_fp. */
1680 6, /* store_fp. */
1681 6, /* load_pred. */
1682 6 /* store_pred. */
1683 }, /* memmov_cost. */
1684 4, /* issue_rate */
1685 AARCH64_FUSE_NOTHING, /* fusible_ops */
1686 "16", /* function_align. */
1687 "16", /* jump_align. */
1688 "16", /* loop_align. */
1689 2, /* int_reassoc_width. */
1690 4, /* fp_reassoc_width. */
1691 1, /* fma_reassoc_width. */
1692 1, /* vec_reassoc_width. */
1693 2, /* min_div_recip_mul_sf. */
1694 2, /* min_div_recip_mul_df. */
1695 17, /* max_case_values. */
1696 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1697 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1698 &xgene1_prefetch_tune
1699 };
1700
1701 static const struct tune_params emag_tunings =
1702 {
1703 &xgene1_extra_costs,
1704 &xgene1_addrcost_table,
1705 &xgene1_regmove_cost,
1706 &xgene1_vector_cost,
1707 &generic_branch_cost,
1708 &xgene1_approx_modes,
1709 SVE_NOT_IMPLEMENTED,
1710 { 6, /* load_int. */
1711 6, /* store_int. */
1712 6, /* load_fp. */
1713 6, /* store_fp. */
1714 6, /* load_pred. */
1715 6 /* store_pred. */
1716 }, /* memmov_cost. */
1717 4, /* issue_rate */
1718 AARCH64_FUSE_NOTHING, /* fusible_ops */
1719 "16", /* function_align. */
1720 "16", /* jump_align. */
1721 "16", /* loop_align. */
1722 2, /* int_reassoc_width. */
1723 4, /* fp_reassoc_width. */
1724 1, /* fma_reassoc_width. */
1725 1, /* vec_reassoc_width. */
1726 2, /* min_div_recip_mul_sf. */
1727 2, /* min_div_recip_mul_df. */
1728 17, /* max_case_values. */
1729 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1730 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1731 &xgene1_prefetch_tune
1732 };
1733
1734 static const struct tune_params qdf24xx_tunings =
1735 {
1736 &qdf24xx_extra_costs,
1737 &qdf24xx_addrcost_table,
1738 &qdf24xx_regmove_cost,
1739 &qdf24xx_vector_cost,
1740 &generic_branch_cost,
1741 &generic_approx_modes,
1742 SVE_NOT_IMPLEMENTED, /* sve_width */
1743 { 4, /* load_int. */
1744 4, /* store_int. */
1745 4, /* load_fp. */
1746 4, /* store_fp. */
1747 4, /* load_pred. */
1748 4 /* store_pred. */
1749 }, /* memmov_cost. */
1750 4, /* issue_rate */
1751 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1752 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1753 "16", /* function_align. */
1754 "8", /* jump_align. */
1755 "16", /* loop_align. */
1756 2, /* int_reassoc_width. */
1757 4, /* fp_reassoc_width. */
1758 1, /* fma_reassoc_width. */
1759 1, /* vec_reassoc_width. */
1760 2, /* min_div_recip_mul_sf. */
1761 2, /* min_div_recip_mul_df. */
1762 0, /* max_case_values. */
1763 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1764 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1765 &qdf24xx_prefetch_tune
1766 };
1767
1768 /* Tuning structure for the Qualcomm Saphira core. Default to generic values
1769 for now. */
1770 static const struct tune_params saphira_tunings =
1771 {
1772 &generic_extra_costs,
1773 &generic_addrcost_table,
1774 &generic_regmove_cost,
1775 &generic_vector_cost,
1776 &generic_branch_cost,
1777 &generic_approx_modes,
1778 SVE_NOT_IMPLEMENTED, /* sve_width */
1779 { 4, /* load_int. */
1780 4, /* store_int. */
1781 4, /* load_fp. */
1782 4, /* store_fp. */
1783 4, /* load_pred. */
1784 4 /* store_pred. */
1785 }, /* memmov_cost. */
1786 4, /* issue_rate */
1787 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1788 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1789 "16", /* function_align. */
1790 "8", /* jump_align. */
1791 "16", /* loop_align. */
1792 2, /* int_reassoc_width. */
1793 4, /* fp_reassoc_width. */
1794 1, /* fma_reassoc_width. */
1795 1, /* vec_reassoc_width. */
1796 2, /* min_div_recip_mul_sf. */
1797 2, /* min_div_recip_mul_df. */
1798 0, /* max_case_values. */
1799 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1800 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1801 &generic_prefetch_tune
1802 };
1803
1804 static const struct tune_params thunderx2t99_tunings =
1805 {
1806 &thunderx2t99_extra_costs,
1807 &thunderx2t99_addrcost_table,
1808 &thunderx2t99_regmove_cost,
1809 &thunderx2t99_vector_cost,
1810 &generic_branch_cost,
1811 &generic_approx_modes,
1812 SVE_NOT_IMPLEMENTED, /* sve_width */
1813 { 4, /* load_int. */
1814 4, /* store_int. */
1815 4, /* load_fp. */
1816 4, /* store_fp. */
1817 4, /* load_pred. */
1818 4 /* store_pred. */
1819 }, /* memmov_cost. */
1820 4, /* issue_rate. */
1821 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1822 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1823 "16", /* function_align. */
1824 "8", /* jump_align. */
1825 "16", /* loop_align. */
1826 3, /* int_reassoc_width. */
1827 2, /* fp_reassoc_width. */
1828 1, /* fma_reassoc_width. */
1829 2, /* vec_reassoc_width. */
1830 2, /* min_div_recip_mul_sf. */
1831 2, /* min_div_recip_mul_df. */
1832 0, /* max_case_values. */
1833 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1834 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1835 &thunderx2t99_prefetch_tune
1836 };
1837
1838 static const struct tune_params thunderx3t110_tunings =
1839 {
1840 &thunderx3t110_extra_costs,
1841 &thunderx3t110_addrcost_table,
1842 &thunderx3t110_regmove_cost,
1843 &thunderx3t110_vector_cost,
1844 &generic_branch_cost,
1845 &generic_approx_modes,
1846 SVE_NOT_IMPLEMENTED, /* sve_width */
1847 { 4, /* load_int. */
1848 4, /* store_int. */
1849 4, /* load_fp. */
1850 4, /* store_fp. */
1851 4, /* load_pred. */
1852 4 /* store_pred. */
1853 }, /* memmov_cost. */
1854 6, /* issue_rate. */
1855 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1856 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1857 "16", /* function_align. */
1858 "8", /* jump_align. */
1859 "16", /* loop_align. */
1860 3, /* int_reassoc_width. */
1861 2, /* fp_reassoc_width. */
1862 1, /* fma_reassoc_width. */
1863 2, /* vec_reassoc_width. */
1864 2, /* min_div_recip_mul_sf. */
1865 2, /* min_div_recip_mul_df. */
1866 0, /* max_case_values. */
1867 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1868 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1869 &thunderx3t110_prefetch_tune
1870 };
1871
1872 static const struct tune_params neoversen1_tunings =
1873 {
1874 &cortexa76_extra_costs,
1875 &generic_addrcost_table,
1876 &generic_regmove_cost,
1877 &cortexa57_vector_cost,
1878 &generic_branch_cost,
1879 &generic_approx_modes,
1880 SVE_NOT_IMPLEMENTED, /* sve_width */
1881 { 4, /* load_int. */
1882 2, /* store_int. */
1883 5, /* load_fp. */
1884 2, /* store_fp. */
1885 4, /* load_pred. */
1886 4 /* store_pred. */
1887 }, /* memmov_cost. */
1888 3, /* issue_rate */
1889 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1890 "32:16", /* function_align. */
1891 "4", /* jump_align. */
1892 "32:16", /* loop_align. */
1893 2, /* int_reassoc_width. */
1894 4, /* fp_reassoc_width. */
1895 1, /* fma_reassoc_width. */
1896 2, /* vec_reassoc_width. */
1897 2, /* min_div_recip_mul_sf. */
1898 2, /* min_div_recip_mul_df. */
1899 0, /* max_case_values. */
1900 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1901 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1902 &generic_prefetch_tune
1903 };
1904
1905 static const struct tune_params ampere1_tunings =
1906 {
1907 &ampere1_extra_costs,
1908 &generic_addrcost_table,
1909 &generic_regmove_cost,
1910 &ampere1_vector_cost,
1911 &generic_branch_cost,
1912 &generic_approx_modes,
1913 SVE_NOT_IMPLEMENTED, /* sve_width */
1914 { 4, /* load_int. */
1915 4, /* store_int. */
1916 4, /* load_fp. */
1917 4, /* store_fp. */
1918 4, /* load_pred. */
1919 4 /* store_pred. */
1920 }, /* memmov_cost. */
1921 4, /* issue_rate */
1922 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1923 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1924 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1925 AARCH64_FUSE_CMP_BRANCH),
1926 /* fusible_ops */
1927 "32", /* function_align. */
1928 "4", /* jump_align. */
1929 "32:16", /* loop_align. */
1930 2, /* int_reassoc_width. */
1931 4, /* fp_reassoc_width. */
1932 1, /* fma_reassoc_width. */
1933 2, /* vec_reassoc_width. */
1934 2, /* min_div_recip_mul_sf. */
1935 2, /* min_div_recip_mul_df. */
1936 0, /* max_case_values. */
1937 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1938 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1939 &ampere1_prefetch_tune
1940 };
1941
1942 static const struct tune_params ampere1a_tunings =
1943 {
1944 &ampere1a_extra_costs,
1945 &generic_addrcost_table,
1946 &generic_regmove_cost,
1947 &ampere1_vector_cost,
1948 &generic_branch_cost,
1949 &generic_approx_modes,
1950 SVE_NOT_IMPLEMENTED, /* sve_width */
1951 { 4, /* load_int. */
1952 4, /* store_int. */
1953 4, /* load_fp. */
1954 4, /* store_fp. */
1955 4, /* load_pred. */
1956 4 /* store_pred. */
1957 }, /* memmov_cost. */
1958 4, /* issue_rate */
1959 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1960 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1961 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1962 AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
1963 AARCH64_FUSE_ADDSUB_2REG_CONST1),
1964 /* fusible_ops */
1965 "32", /* function_align. */
1966 "4", /* jump_align. */
1967 "32:16", /* loop_align. */
1968 2, /* int_reassoc_width. */
1969 4, /* fp_reassoc_width. */
1970 1, /* fma_reassoc_width. */
1971 2, /* vec_reassoc_width. */
1972 2, /* min_div_recip_mul_sf. */
1973 2, /* min_div_recip_mul_df. */
1974 0, /* max_case_values. */
1975 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1976 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1977 &ampere1_prefetch_tune
1978 };
1979
1980 static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
1981 {
1982 2, /* int_stmt_cost */
1983 2, /* fp_stmt_cost */
1984 4, /* ld2_st2_permute_cost */
1985 4, /* ld3_st3_permute_cost */
1986 5, /* ld4_st4_permute_cost */
1987 3, /* permute_cost */
1988 4, /* reduc_i8_cost */
1989 4, /* reduc_i16_cost */
1990 2, /* reduc_i32_cost */
1991 2, /* reduc_i64_cost */
1992 6, /* reduc_f16_cost */
1993 3, /* reduc_f32_cost */
1994 2, /* reduc_f64_cost */
1995 2, /* store_elt_extra_cost */
1996 /* This value is just inherited from the Cortex-A57 table. */
1997 8, /* vec_to_scalar_cost */
1998 /* This depends very much on what the scalar value is and
1999 where it comes from. E.g. some constants take two dependent
2000 instructions or a load, while others might be moved from a GPR.
2001 4 seems to be a reasonable compromise in practice. */
2002 4, /* scalar_to_vec_cost */
2003 4, /* align_load_cost */
2004 4, /* unalign_load_cost */
2005 /* Although stores have a latency of 2 and compete for the
2006 vector pipes, in practice it's better not to model that. */
2007 1, /* unalign_store_cost */
2008 1 /* store_cost */
2009 };
2010
2011 static const sve_vec_cost neoversev1_sve_vector_cost =
2012 {
2013 {
2014 2, /* int_stmt_cost */
2015 2, /* fp_stmt_cost */
2016 4, /* ld2_st2_permute_cost */
2017 7, /* ld3_st3_permute_cost */
2018 8, /* ld4_st4_permute_cost */
2019 3, /* permute_cost */
2020 /* Theoretically, a reduction involving 31 scalar ADDs could
2021 complete in ~9 cycles and would have a cost of 31. [SU]ADDV
2022 completes in 14 cycles, so give it a cost of 31 + 5. */
2023 36, /* reduc_i8_cost */
2024 /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
2025 22, /* reduc_i16_cost */
2026 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
2027 14, /* reduc_i32_cost */
2028 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
2029 11, /* reduc_i64_cost */
2030 /* Theoretically, a reduction involving 15 scalar FADDs could
2031 complete in ~9 cycles and would have a cost of 30. FADDV
2032 completes in 13 cycles, so give it a cost of 30 + 4. */
2033 34, /* reduc_f16_cost */
2034 /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
2035 19, /* reduc_f32_cost */
2036 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
2037 11, /* reduc_f64_cost */
2038 2, /* store_elt_extra_cost */
2039 /* This value is just inherited from the Cortex-A57 table. */
2040 8, /* vec_to_scalar_cost */
2041 /* See the comment above the Advanced SIMD versions. */
2042 4, /* scalar_to_vec_cost */
2043 4, /* align_load_cost */
2044 4, /* unalign_load_cost */
2045 /* Although stores have a latency of 2 and compete for the
2046 vector pipes, in practice it's better not to model that. */
2047 1, /* unalign_store_cost */
2048 1 /* store_cost */
2049 },
2050 3, /* clast_cost */
2051 19, /* fadda_f16_cost */
2052 11, /* fadda_f32_cost */
2053 8, /* fadda_f64_cost */
2054 32, /* gather_load_x32_cost */
2055 16, /* gather_load_x64_cost */
2056 3 /* scatter_store_elt_cost */
2057 };
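/* An informal note on the reduc_*_cost derivations above (a reading aid,
   not extra data): each comment applies the same rule of thumb

     reduction cost = number of scalar ops + (vector latency - scalar latency)

   i.e. start from the cost of the equivalent tree of scalar additions
   (one op per lane, minus one) and add however many extra cycles the
   single [SU]ADDV or FADDV is expected to need.  For reduc_i8_cost on a
   256-bit Neoverse V1 vector that gives 31 + (14 - 9) = 36.  */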
2058
2059 static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
2060 {
2061 3, /* loads_stores_per_cycle */
2062 2, /* stores_per_cycle */
2063 4, /* general_ops_per_cycle */
2064 0, /* fp_simd_load_general_ops */
2065 1 /* fp_simd_store_general_ops */
2066 };
2067
2068 static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
2069 {
2070 {
2071 3, /* loads_stores_per_cycle */
2072 2, /* stores_per_cycle */
2073 4, /* general_ops_per_cycle */
2074 0, /* fp_simd_load_general_ops */
2075 1 /* fp_simd_store_general_ops */
2076 },
2077 2, /* ld2_st2_general_ops */
2078 2, /* ld3_st3_general_ops */
2079 3 /* ld4_st4_general_ops */
2080 };
2081
2082 static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
2083 {
2084 {
2085 {
2086 2, /* loads_per_cycle */
2087 2, /* stores_per_cycle */
2088 2, /* general_ops_per_cycle */
2089 0, /* fp_simd_load_general_ops */
2090 1 /* fp_simd_store_general_ops */
2091 },
2092 2, /* ld2_st2_general_ops */
2093 2, /* ld3_st3_general_ops */
2094 3 /* ld4_st4_general_ops */
2095 },
2096 1, /* pred_ops_per_cycle */
2097 2, /* while_pred_ops */
2098 2, /* int_cmp_pred_ops */
2099 1, /* fp_cmp_pred_ops */
2100 1, /* gather_scatter_pair_general_ops */
2101 1 /* gather_scatter_pair_pred_ops */
2102 };
2103
2104 static const aarch64_vec_issue_info neoversev1_vec_issue_info =
2105 {
2106 &neoversev1_scalar_issue_info,
2107 &neoversev1_advsimd_issue_info,
2108 &neoversev1_sve_issue_info
2109 };
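/* Rough sketch of how the issue info above is consumed (an assumption
   about usage, not a specification): when the new vector cost model is
   enabled, these per-cycle limits act as throughput caps for estimating a
   lower bound on cycles per loop iteration.  With the SVE limits above
   (2 loads, 2 stores and 2 general ops per cycle, 1 predicate op per
   cycle, a WHILE counting as 2 predicate ops), an iteration containing
   4 loads, 4 arithmetic ops and 1 WHILE would be bounded by
   max (4/2, 4/2, 2/1) = 2 cycles, and the scalar, Advanced SIMD and SVE
   estimates can then be compared against each other.  */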
2110
2111 /* Neoverse V1 costs for vector insn classes. */
2112 static const struct cpu_vector_cost neoversev1_vector_cost =
2113 {
2114 1, /* scalar_int_stmt_cost */
2115 2, /* scalar_fp_stmt_cost */
2116 4, /* scalar_load_cost */
2117 1, /* scalar_store_cost */
2118 1, /* cond_taken_branch_cost */
2119 1, /* cond_not_taken_branch_cost */
2120 &neoversev1_advsimd_vector_cost, /* advsimd */
2121 &neoversev1_sve_vector_cost, /* sve */
2122 &neoversev1_vec_issue_info /* issue_info */
2123 };
2124
2125 static const struct tune_params neoversev1_tunings =
2126 {
2127 &cortexa76_extra_costs,
2128 &neoversev1_addrcost_table,
2129 &neoversev1_regmove_cost,
2130 &neoversev1_vector_cost,
2131 &generic_branch_cost,
2132 &generic_approx_modes,
2133 SVE_256, /* sve_width */
2134 { 4, /* load_int. */
2135 2, /* store_int. */
2136 6, /* load_fp. */
2137 2, /* store_fp. */
2138 6, /* load_pred. */
2139 1 /* store_pred. */
2140 }, /* memmov_cost. */
2141 3, /* issue_rate */
2142 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2143 "32:16", /* function_align. */
2144 "4", /* jump_align. */
2145 "32:16", /* loop_align. */
2146 2, /* int_reassoc_width. */
2147 4, /* fp_reassoc_width. */
2148 4, /* fma_reassoc_width. */
2149 2, /* vec_reassoc_width. */
2150 2, /* min_div_recip_mul_sf. */
2151 2, /* min_div_recip_mul_df. */
2152 0, /* max_case_values. */
2153 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2154 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2155 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2156 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
2157 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
2158 &generic_prefetch_tune
2159 };
2160
2161 static const sve_vec_cost neoverse512tvb_sve_vector_cost =
2162 {
2163 {
2164 2, /* int_stmt_cost */
2165 2, /* fp_stmt_cost */
2166 4, /* ld2_st2_permute_cost */
2167 5, /* ld3_st3_permute_cost */
2168 5, /* ld4_st4_permute_cost */
2169 3, /* permute_cost */
2170 /* Theoretically, a reduction involving 15 scalar ADDs could
2171 complete in ~5 cycles and would have a cost of 15. Assume that
2172 [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
2173 21, /* reduc_i8_cost */
2174 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2175 13, /* reduc_i16_cost */
2176 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2177 9, /* reduc_i32_cost */
2178 /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
2179 8, /* reduc_i64_cost */
2180 /* Theoretically, a reduction involving 7 scalar FADDs could
2181 complete in ~6 cycles and would have a cost of 14. Assume that
2182 FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
2183 16, /* reduc_f16_cost */
2184 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2185 8, /* reduc_f32_cost */
2186 /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
2187 4, /* reduc_f64_cost */
2188 2, /* store_elt_extra_cost */
2189 /* This value is just inherited from the Cortex-A57 table. */
2190 8, /* vec_to_scalar_cost */
2191 /* This depends very much on what the scalar value is and
2192 where it comes from. E.g. some constants take two dependent
2193 instructions or a load, while others might be moved from a GPR.
2194 4 seems to be a reasonable compromise in practice. */
2195 4, /* scalar_to_vec_cost */
2196 4, /* align_load_cost */
2197 4, /* unalign_load_cost */
2198 /* Although stores generally have a latency of 2 and compete for the
2199 vector pipes, in practice it's better not to model that. */
2200 1, /* unalign_store_cost */
2201 1 /* store_cost */
2202 },
2203 3, /* clast_cost */
2204 10, /* fadda_f16_cost */
2205 6, /* fadda_f32_cost */
2206 4, /* fadda_f64_cost */
2207 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2208 (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2209 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2210 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2211 (cost 2) to that, to avoid the difference being lost in rounding.
2212
2213 There is no easy comparison between a strided Advanced SIMD x32 load
2214 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2215 operation more than a 64-bit gather. */
2216 14, /* gather_load_x32_cost */
2217 12, /* gather_load_x64_cost */
2218 3 /* scatter_store_elt_cost */
2219 };
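/* Spelling out the gather costs above (derived from the comment, not new
   data): the Advanced SIMD x64 strided equivalent is costed as 2 scalar
   loads (2 * 4 = 8) plus a vec_construct (2), and one further full vector
   operation (2) is added so that the SVE gather stays visibly more
   expensive, giving gather_load_x64_cost = 8 + 2 + 2 = 12; the x32 case is
   costed as one vector operation more, giving 14.  */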
2220
2221 static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
2222 {
2223 {
2224 {
2225 3, /* loads_per_cycle */
2226 2, /* stores_per_cycle */
2227 4, /* general_ops_per_cycle */
2228 0, /* fp_simd_load_general_ops */
2229 1 /* fp_simd_store_general_ops */
2230 },
2231 2, /* ld2_st2_general_ops */
2232 2, /* ld3_st3_general_ops */
2233 3 /* ld4_st4_general_ops */
2234 },
2235 2, /* pred_ops_per_cycle */
2236 2, /* while_pred_ops */
2237 2, /* int_cmp_pred_ops */
2238 1, /* fp_cmp_pred_ops */
2239 1, /* gather_scatter_pair_general_ops */
2240 1 /* gather_scatter_pair_pred_ops */
2241 };
2242
2243 static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
2244 {
2245 &neoversev1_scalar_issue_info,
2246 &neoversev1_advsimd_issue_info,
2247 &neoverse512tvb_sve_issue_info
2248 };
2249
2250 static const struct cpu_vector_cost neoverse512tvb_vector_cost =
2251 {
2252 1, /* scalar_int_stmt_cost */
2253 2, /* scalar_fp_stmt_cost */
2254 4, /* scalar_load_cost */
2255 1, /* scalar_store_cost */
2256 1, /* cond_taken_branch_cost */
2257 1, /* cond_not_taken_branch_cost */
2258 &neoversev1_advsimd_vector_cost, /* advsimd */
2259 &neoverse512tvb_sve_vector_cost, /* sve */
2260 &neoverse512tvb_vec_issue_info /* issue_info */
2261 };
2262
2263 static const struct tune_params neoverse512tvb_tunings =
2264 {
2265 &cortexa76_extra_costs,
2266 &neoversev1_addrcost_table,
2267 &neoversev1_regmove_cost,
2268 &neoverse512tvb_vector_cost,
2269 &generic_branch_cost,
2270 &generic_approx_modes,
2271 SVE_128 | SVE_256, /* sve_width */
2272 { 4, /* load_int. */
2273 2, /* store_int. */
2274 6, /* load_fp. */
2275 2, /* store_fp. */
2276 6, /* load_pred. */
2277 1 /* store_pred. */
2278 }, /* memmov_cost. */
2279 3, /* issue_rate */
2280 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2281 "32:16", /* function_align. */
2282 "4", /* jump_align. */
2283 "32:16", /* loop_align. */
2284 2, /* int_reassoc_width. */
2285 4, /* fp_reassoc_width. */
2286 4, /* fma_reassoc_width. */
2287 2, /* vec_reassoc_width. */
2288 2, /* min_div_recip_mul_sf. */
2289 2, /* min_div_recip_mul_df. */
2290 0, /* max_case_values. */
2291 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2292 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2293 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2294 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2295 &generic_prefetch_tune
2296 };
2297
2298 static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
2299 {
2300 2, /* int_stmt_cost */
2301 2, /* fp_stmt_cost */
2302 2, /* ld2_st2_permute_cost */
2303 2, /* ld3_st3_permute_cost */
2304 3, /* ld4_st4_permute_cost */
2305 3, /* permute_cost */
2306 4, /* reduc_i8_cost */
2307 4, /* reduc_i16_cost */
2308 2, /* reduc_i32_cost */
2309 2, /* reduc_i64_cost */
2310 6, /* reduc_f16_cost */
2311 4, /* reduc_f32_cost */
2312 2, /* reduc_f64_cost */
2313 2, /* store_elt_extra_cost */
2314 /* This value is just inherited from the Cortex-A57 table. */
2315 8, /* vec_to_scalar_cost */
2316 /* This depends very much on what the scalar value is and
2317 where it comes from. E.g. some constants take two dependent
2318 instructions or a load, while others might be moved from a GPR.
2319 4 seems to be a reasonable compromise in practice. */
2320 4, /* scalar_to_vec_cost */
2321 4, /* align_load_cost */
2322 4, /* unalign_load_cost */
2323 /* Although stores have a latency of 2 and compete for the
2324 vector pipes, in practice it's better not to model that. */
2325 1, /* unalign_store_cost */
2326 1 /* store_cost */
2327 };
2328
2329 static const sve_vec_cost neoversen2_sve_vector_cost =
2330 {
2331 {
2332 2, /* int_stmt_cost */
2333 2, /* fp_stmt_cost */
2334 3, /* ld2_st2_permute_cost */
2335 4, /* ld3_st3_permute_cost */
2336 4, /* ld4_st4_permute_cost */
2337 3, /* permute_cost */
2338 /* Theoretically, a reduction involving 15 scalar ADDs could
2339 complete in ~5 cycles and would have a cost of 15. [SU]ADDV
2340 completes in 11 cycles, so give it a cost of 15 + 6. */
2341 21, /* reduc_i8_cost */
2342 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2343 13, /* reduc_i16_cost */
2344 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2345 9, /* reduc_i32_cost */
2346 /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2347 2, /* reduc_i64_cost */
2348 /* Theoretically, a reduction involving 7 scalar FADDs could
2349 complete in ~8 cycles and would have a cost of 14. FADDV
2350 completes in 6 cycles, so give it a cost of 14 - 2. */
2351 12, /* reduc_f16_cost */
2352 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */
2353 6, /* reduc_f32_cost */
2354 /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */
2355 2, /* reduc_f64_cost */
2356 2, /* store_elt_extra_cost */
2357 /* This value is just inherited from the Cortex-A57 table. */
2358 8, /* vec_to_scalar_cost */
2359 /* See the comment above the Advanced SIMD versions. */
2360 4, /* scalar_to_vec_cost */
2361 4, /* align_load_cost */
2362 4, /* unalign_load_cost */
2363 /* Although stores have a latency of 2 and compete for the
2364 vector pipes, in practice it's better not to model that. */
2365 1, /* unalign_store_cost */
2366 1 /* store_cost */
2367 },
2368 3, /* clast_cost */
2369 10, /* fadda_f16_cost */
2370 6, /* fadda_f32_cost */
2371 4, /* fadda_f64_cost */
2372 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2373 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2374 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2375 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2376 (cost 2) to that, to avoid the difference being lost in rounding.
2377
2378 There is no easy comparison between a strided Advanced SIMD x32 load
2379 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2380 operation more than a 64-bit gather. */
2381 14, /* gather_load_x32_cost */
2382 12, /* gather_load_x64_cost */
2383 3 /* scatter_store_elt_cost */
2384 };
2385
2386 static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
2387 {
2388 3, /* loads_stores_per_cycle */
2389 2, /* stores_per_cycle */
2390 4, /* general_ops_per_cycle */
2391 0, /* fp_simd_load_general_ops */
2392 1 /* fp_simd_store_general_ops */
2393 };
2394
2395 static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
2396 {
2397 {
2398 3, /* loads_stores_per_cycle */
2399 2, /* stores_per_cycle */
2400 2, /* general_ops_per_cycle */
2401 0, /* fp_simd_load_general_ops */
2402 1 /* fp_simd_store_general_ops */
2403 },
2404 2, /* ld2_st2_general_ops */
2405 2, /* ld3_st3_general_ops */
2406 3 /* ld4_st4_general_ops */
2407 };
2408
2409 static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
2410 {
2411 {
2412 {
2413 3, /* loads_per_cycle */
2414 2, /* stores_per_cycle */
2415 2, /* general_ops_per_cycle */
2416 0, /* fp_simd_load_general_ops */
2417 1 /* fp_simd_store_general_ops */
2418 },
2419 2, /* ld2_st2_general_ops */
2420 3, /* ld3_st3_general_ops */
2421 3 /* ld4_st4_general_ops */
2422 },
2423 2, /* pred_ops_per_cycle */
2424 2, /* while_pred_ops */
2425 2, /* int_cmp_pred_ops */
2426 1, /* fp_cmp_pred_ops */
2427 1, /* gather_scatter_pair_general_ops */
2428 1 /* gather_scatter_pair_pred_ops */
2429 };
2430
2431 static const aarch64_vec_issue_info neoversen2_vec_issue_info =
2432 {
2433 &neoversen2_scalar_issue_info,
2434 &neoversen2_advsimd_issue_info,
2435 &neoversen2_sve_issue_info
2436 };
2437
2438 /* Neoverse N2 costs for vector insn classes. */
2439 static const struct cpu_vector_cost neoversen2_vector_cost =
2440 {
2441 1, /* scalar_int_stmt_cost */
2442 2, /* scalar_fp_stmt_cost */
2443 4, /* scalar_load_cost */
2444 1, /* scalar_store_cost */
2445 1, /* cond_taken_branch_cost */
2446 1, /* cond_not_taken_branch_cost */
2447 &neoversen2_advsimd_vector_cost, /* advsimd */
2448 &neoversen2_sve_vector_cost, /* sve */
2449 &neoversen2_vec_issue_info /* issue_info */
2450 };
2451
2452 static const struct tune_params neoversen2_tunings =
2453 {
2454 &cortexa76_extra_costs,
2455 &neoversen2_addrcost_table,
2456 &neoversen2_regmove_cost,
2457 &neoversen2_vector_cost,
2458 &generic_branch_cost,
2459 &generic_approx_modes,
2460 SVE_128, /* sve_width */
2461 { 4, /* load_int. */
2462 1, /* store_int. */
2463 6, /* load_fp. */
2464 2, /* store_fp. */
2465 6, /* load_pred. */
2466 1 /* store_pred. */
2467 }, /* memmov_cost. */
2468 3, /* issue_rate */
2469 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2470 "32:16", /* function_align. */
2471 "4", /* jump_align. */
2472 "32:16", /* loop_align. */
2473 2, /* int_reassoc_width. */
2474 4, /* fp_reassoc_width. */
2475 1, /* fma_reassoc_width. */
2476 2, /* vec_reassoc_width. */
2477 2, /* min_div_recip_mul_sf. */
2478 2, /* min_div_recip_mul_df. */
2479 0, /* max_case_values. */
2480 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2481 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2482 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2483 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2484 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2485 &generic_prefetch_tune
2486 };
2487
2488 static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
2489 {
2490 2, /* int_stmt_cost */
2491 2, /* fp_stmt_cost */
2492 2, /* ld2_st2_permute_cost */
2493 2, /* ld3_st3_permute_cost */
2494 3, /* ld4_st4_permute_cost */
2495 3, /* permute_cost */
2496 4, /* reduc_i8_cost */
2497 4, /* reduc_i16_cost */
2498 2, /* reduc_i32_cost */
2499 2, /* reduc_i64_cost */
2500 6, /* reduc_f16_cost */
2501 3, /* reduc_f32_cost */
2502 2, /* reduc_f64_cost */
2503 2, /* store_elt_extra_cost */
2504 /* This value is just inherited from the Cortex-A57 table. */
2505 8, /* vec_to_scalar_cost */
2506 /* This depends very much on what the scalar value is and
2507 where it comes from. E.g. some constants take two dependent
2508 instructions or a load, while others might be moved from a GPR.
2509 4 seems to be a reasonable compromise in practice. */
2510 4, /* scalar_to_vec_cost */
2511 4, /* align_load_cost */
2512 4, /* unalign_load_cost */
2513 /* Although stores have a latency of 2 and compete for the
2514 vector pipes, in practice it's better not to model that. */
2515 1, /* unalign_store_cost */
2516 1 /* store_cost */
2517 };
2518
2519 static const sve_vec_cost neoversev2_sve_vector_cost =
2520 {
2521 {
2522 2, /* int_stmt_cost */
2523 2, /* fp_stmt_cost */
2524 3, /* ld2_st2_permute_cost */
2525 3, /* ld3_st3_permute_cost */
2526 4, /* ld4_st4_permute_cost */
2527 3, /* permute_cost */
2528 /* Theoretically, a reduction involving 15 scalar ADDs could
2529 complete in ~3 cycles and would have a cost of 15. [SU]ADDV
2530 completes in 11 cycles, so give it a cost of 15 + 8. */
2531 21, /* reduc_i8_cost */
2532 /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */
2533 14, /* reduc_i16_cost */
2534 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */
2535 7, /* reduc_i32_cost */
2536 /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2537 2, /* reduc_i64_cost */
2538 /* Theoretically, a reduction involving 7 scalar FADDs could
2539 complete in ~6 cycles and would have a cost of 14. FADDV
2540 completes in 8 cycles, so give it a cost of 14 + 2. */
2541 16, /* reduc_f16_cost */
2542 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2543 8, /* reduc_f32_cost */
2544 /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */
2545 4, /* reduc_f64_cost */
2546 2, /* store_elt_extra_cost */
2547 /* This value is just inherited from the Cortex-A57 table. */
2548 8, /* vec_to_scalar_cost */
2549 /* See the comment above the Advanced SIMD versions. */
2550 4, /* scalar_to_vec_cost */
2551 4, /* align_load_cost */
2552 4, /* unalign_load_cost */
2553 /* Although stores have a latency of 2 and compete for the
2554 vector pipes, in practice it's better not to model that. */
2555 1, /* unalign_store_cost */
2556 1 /* store_cost */
2557 },
2558 3, /* clast_cost */
2559 10, /* fadda_f16_cost */
2560 6, /* fadda_f32_cost */
2561 4, /* fadda_f64_cost */
2562 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2563 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2564 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2565 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2566 (cost 2) to that, to avoid the difference being lost in rounding.
2567
2568 There is no easy comparison between a strided Advanced SIMD x32 load
2569 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2570 operation more than a 64-bit gather. */
2571 14, /* gather_load_x32_cost */
2572 12, /* gather_load_x64_cost */
2573 3 /* scatter_store_elt_cost */
2574 };
2575
2576 static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
2577 {
2578 3, /* loads_stores_per_cycle */
2579 2, /* stores_per_cycle */
2580 6, /* general_ops_per_cycle */
2581 0, /* fp_simd_load_general_ops */
2582 1 /* fp_simd_store_general_ops */
2583 };
2584
2585 static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
2586 {
2587 {
2588 3, /* loads_stores_per_cycle */
2589 2, /* stores_per_cycle */
2590 4, /* general_ops_per_cycle */
2591 0, /* fp_simd_load_general_ops */
2592 1 /* fp_simd_store_general_ops */
2593 },
2594 2, /* ld2_st2_general_ops */
2595 2, /* ld3_st3_general_ops */
2596 3 /* ld4_st4_general_ops */
2597 };
2598
2599 static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
2600 {
2601 {
2602 {
2603 3, /* loads_per_cycle */
2604 2, /* stores_per_cycle */
2605 4, /* general_ops_per_cycle */
2606 0, /* fp_simd_load_general_ops */
2607 1 /* fp_simd_store_general_ops */
2608 },
2609 2, /* ld2_st2_general_ops */
2610 3, /* ld3_st3_general_ops */
2611 3 /* ld4_st4_general_ops */
2612 },
2613 2, /* pred_ops_per_cycle */
2614 2, /* while_pred_ops */
2615 2, /* int_cmp_pred_ops */
2616 1, /* fp_cmp_pred_ops */
2617 1, /* gather_scatter_pair_general_ops */
2618 1 /* gather_scatter_pair_pred_ops */
2619 };
2620
2621 static const aarch64_vec_issue_info neoversev2_vec_issue_info =
2622 {
2623 &neoversev2_scalar_issue_info,
2624 &neoversev2_advsimd_issue_info,
2625 &neoversev2_sve_issue_info
2626 };
2627
2628 /* Neoverse V2 costs for vector insn classes. */
2629 static const struct cpu_vector_cost neoversev2_vector_cost =
2630 {
2631 1, /* scalar_int_stmt_cost */
2632 2, /* scalar_fp_stmt_cost */
2633 4, /* scalar_load_cost */
2634 1, /* scalar_store_cost */
2635 1, /* cond_taken_branch_cost */
2636 1, /* cond_not_taken_branch_cost */
2637 &neoversev2_advsimd_vector_cost, /* advsimd */
2638 &neoversev2_sve_vector_cost, /* sve */
2639 &neoversev2_vec_issue_info /* issue_info */
2640 };
2641
2642 static const struct tune_params neoversev2_tunings =
2643 {
2644 &cortexa76_extra_costs,
2645 &neoversev2_addrcost_table,
2646 &neoversev2_regmove_cost,
2647 &neoversev2_vector_cost,
2648 &generic_branch_cost,
2649 &generic_approx_modes,
2650 SVE_128, /* sve_width */
2651 { 4, /* load_int. */
2652 2, /* store_int. */
2653 6, /* load_fp. */
2654 1, /* store_fp. */
2655 6, /* load_pred. */
2656 2 /* store_pred. */
2657 }, /* memmov_cost. */
2658 5, /* issue_rate */
2659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2660 "32:16", /* function_align. */
2661 "4", /* jump_align. */
2662 "32:16", /* loop_align. */
2663 3, /* int_reassoc_width. */
2664 6, /* fp_reassoc_width. */
2665 4, /* fma_reassoc_width. */
2666 3, /* vec_reassoc_width. */
2667 2, /* min_div_recip_mul_sf. */
2668 2, /* min_div_recip_mul_df. */
2669 0, /* max_case_values. */
2670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2671 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2672 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2673 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2674 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2675 &generic_prefetch_tune
2676 };
2677
2678 static const struct tune_params a64fx_tunings =
2679 {
2680 &a64fx_extra_costs,
2681 &a64fx_addrcost_table,
2682 &a64fx_regmove_cost,
2683 &a64fx_vector_cost,
2684 &generic_branch_cost,
2685 &generic_approx_modes,
2686 SVE_512, /* sve_width */
2687 { 4, /* load_int. */
2688 4, /* store_int. */
2689 4, /* load_fp. */
2690 4, /* store_fp. */
2691 4, /* load_pred. */
2692 4 /* store_pred. */
2693 }, /* memmov_cost. */
2694 7, /* issue_rate */
2695 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2696 "32", /* function_align. */
2697 "16", /* jump_align. */
2698 "32", /* loop_align. */
2699 4, /* int_reassoc_width. */
2700 2, /* fp_reassoc_width. */
2701 1, /* fma_reassoc_width. */
2702 2, /* vec_reassoc_width. */
2703 2, /* min_div_recip_mul_sf. */
2704 2, /* min_div_recip_mul_df. */
2705 0, /* max_case_values. */
2706 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2707 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
2708 &a64fx_prefetch_tune
2709 };
2710
2711 /* Support for fine-grained override of the tuning structures. */
2712 struct aarch64_tuning_override_function
2713 {
2714 const char* name;
2715 void (*parse_override)(const char*, struct tune_params*);
2716 };
2717
2718 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
2719 static void aarch64_parse_tune_string (const char*, struct tune_params*);
2720 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
2721
2722 static const struct aarch64_tuning_override_function
2723 aarch64_tuning_override_functions[] =
2724 {
2725 { "fuse", aarch64_parse_fuse_string },
2726 { "tune", aarch64_parse_tune_string },
2727 { "sve_width", aarch64_parse_sve_width_string },
2728 { NULL, NULL }
2729 };
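/* Usage sketch (illustrative; the exact accepted syntax is defined by the
   -moverride parsing code later in this file): these entries back the
   -moverride= option, so a command line such as

     -moverride=sve_width=256

   is routed to aarch64_parse_sve_width_string, and tune= / fuse= payloads
   are routed to aarch64_parse_tune_string and aarch64_parse_fuse_string
   respectively.  */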
2730
2731 /* A processor implementing AArch64. */
2732 struct processor
2733 {
2734 const char *name;
2735 aarch64_processor ident;
2736 aarch64_processor sched_core;
2737 aarch64_arch arch;
2738 aarch64_feature_flags flags;
2739 const tune_params *tune;
2740 };
2741
2742 /* Architectures implementing AArch64. */
2743 static CONSTEXPR const processor all_architectures[] =
2744 {
2745 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
2746 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
2747 feature_deps::ARCH_IDENT ().enable, NULL},
2748 #include "aarch64-arches.def"
2749 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2750 };
2751
2752 /* Processor cores implementing AArch64. */
2753 static const struct processor all_cores[] =
2754 {
2755 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
2756 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
2757 feature_deps::cpu_##IDENT, &COSTS##_tunings},
2758 #include "aarch64-cores.def"
2759 {"generic", generic, cortexa53, AARCH64_ARCH_V8A,
2760 feature_deps::V8A ().enable, &generic_tunings},
2761 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2762 };
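/* For illustration only (a hypothetical entry, not taken from
   aarch64-cores.def): a definition along the lines of

     AARCH64_CORE ("somecore", somecore, cortexa57, V8A, (F16), generic,
                   0x00, 0x000, -1)

   would expand in all_cores to

     {"somecore", somecore, cortexa57, AARCH64_ARCH_V8A,
      feature_deps::cpu_somecore, &generic_tunings},

   i.e. NAME, IDENT and SCHED are used directly, ARCH selects the
   AARCH64_ARCH_* value and COSTS selects one of the *_tunings structures
   defined above.  */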
2763
2764 /* The current tuning set. */
2765 struct tune_params aarch64_tune_params = generic_tunings;
2766
2767 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
2768
2769 static tree
2770 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
2771 int, bool *no_add_attrs)
2772 {
2773 /* Since we set fn_type_req to true, the caller should have checked
2774 this for us. */
2775 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
2776 switch ((arm_pcs) fntype_abi (*node).id ())
2777 {
2778 case ARM_PCS_AAPCS64:
2779 case ARM_PCS_SIMD:
2780 return NULL_TREE;
2781
2782 case ARM_PCS_SVE:
2783 error ("the %qE attribute cannot be applied to an SVE function type",
2784 name);
2785 *no_add_attrs = true;
2786 return NULL_TREE;
2787
2788 case ARM_PCS_TLSDESC:
2789 case ARM_PCS_UNKNOWN:
2790 break;
2791 }
2792 gcc_unreachable ();
2793 }
2794
2795 /* Table of machine attributes. */
2796 static const struct attribute_spec aarch64_attribute_table[] =
2797 {
2798 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
2799 affects_type_identity, handler, exclude } */
2800 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
2801 handle_aarch64_vector_pcs_attribute, NULL },
2802 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
2803 aarch64_sve::handle_arm_sve_vector_bits_attribute,
2804 NULL },
2805 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
2806 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
2807 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
2808 { NULL, 0, 0, false, false, false, false, NULL, NULL }
2809 };
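/* Usage sketch for the user-facing attributes above (illustrative only;
   the SVE example additionally assumes arm_sve.h and a fixed
   -msve-vector-bits setting):

     void f (float *x) __attribute__ ((aarch64_vector_pcs));
     typedef svint32_t fixed_int32_t
       __attribute__ ((arm_sve_vector_bits (256)));

   The first selects the Advanced SIMD vector PCS (ARM_PCS_SIMD) for f,
   the second creates a fixed-length SVE vector type via
   aarch64_sve::handle_arm_sve_vector_bits_attribute.  The "Advanced SIMD
   type", "SVE type" and "SVE sizeless type" entries are internal markers
   attached to the built-in vector types rather than attributes users are
   expected to write.  */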
2810
2811 /* An ISA extension in the co-processor and main instruction set space. */
2812 struct aarch64_option_extension
2813 {
2814 const char *const name;
2815 const unsigned long flags_on;
2816 const unsigned long flags_off;
2817 };
2818
2819 typedef enum aarch64_cond_code
2820 {
2821 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
2822 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
2823 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
2824 }
2825 aarch64_cc;
2826
2827 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
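/* For example (this relies only on the enum order above, which matches the
   architectural encoding): AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is
   AARCH64_NE and AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is
   AARCH64_LT, since AArch64 condition codes are encoded so that each
   even/odd pair are inverses of each other.  */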
2828
2829 struct aarch64_branch_protect_type
2830 {
2831 /* The type's name that the user passes to the branch-protection option
2832 string. */
2833 const char* name;
2834 /* Function to handle the protection type and set global variables.
2835 First argument is the string token corresponding with this type and the
2836 second argument is the next token in the option string.
2837 Return values:
2838 * AARCH64_PARSE_OK: Handling was successful.
2839 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and
2840 the caller should print an error.
2841 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
2842 prints its own error. */
2843 enum aarch64_parse_opt_result (*handler)(char*, char*);
2844 /* A list of types that can follow this type in the option string. */
2845 const aarch64_branch_protect_type* subtypes;
2846 unsigned int num_subtypes;
2847 };
2848
2849 static enum aarch64_parse_opt_result
2850 aarch64_handle_no_branch_protection (char* str, char* rest)
2851 {
2852 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
2853 aarch64_enable_bti = 0;
2854 if (rest)
2855 {
2856 error ("unexpected %<%s%> after %<%s%>", rest, str);
2857 return AARCH64_PARSE_INVALID_FEATURE;
2858 }
2859 return AARCH64_PARSE_OK;
2860 }
2861
2862 static enum aarch64_parse_opt_result
2863 aarch64_handle_standard_branch_protection (char* str, char* rest)
2864 {
2865 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2866 aarch64_ra_sign_key = AARCH64_KEY_A;
2867 aarch64_enable_bti = 1;
2868 if (rest)
2869 {
2870 error ("unexpected %<%s%> after %<%s%>", rest, str);
2871 return AARCH64_PARSE_INVALID_FEATURE;
2872 }
2873 return AARCH64_PARSE_OK;
2874 }
2875
2876 static enum aarch64_parse_opt_result
2877 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
2878 char* rest ATTRIBUTE_UNUSED)
2879 {
2880 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2881 aarch64_ra_sign_key = AARCH64_KEY_A;
2882 return AARCH64_PARSE_OK;
2883 }
2884
2885 static enum aarch64_parse_opt_result
2886 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
2887 char* rest ATTRIBUTE_UNUSED)
2888 {
2889 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
2890 return AARCH64_PARSE_OK;
2891 }
2892
2893 static enum aarch64_parse_opt_result
2894 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
2895 char* rest ATTRIBUTE_UNUSED)
2896 {
2897 aarch64_ra_sign_key = AARCH64_KEY_B;
2898 return AARCH64_PARSE_OK;
2899 }
2900
2901 static enum aarch64_parse_opt_result
2902 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
2903 char* rest ATTRIBUTE_UNUSED)
2904 {
2905 aarch64_enable_bti = 1;
2906 return AARCH64_PARSE_OK;
2907 }
2908
2909 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
2910 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
2911 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
2912 { NULL, NULL, NULL, 0 }
2913 };
2914
2915 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
2916 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
2917 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
2918 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
2919 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
2920 { "bti", aarch64_handle_bti_protection, NULL, 0 },
2921 { NULL, NULL, NULL, 0 }
2922 };
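/* Usage sketch (illustrative option strings rather than a full grammar):
   the tables above drive parsing of -mbranch-protection=, e.g.

     -mbranch-protection=none
     -mbranch-protection=standard            (BTI plus pac-ret with the A key)
     -mbranch-protection=pac-ret+leaf        (also sign leaf functions)
     -mbranch-protection=bti+pac-ret+b-key   (BTI, pac-ret using the B key)

   "leaf" and "b-key" are only accepted as subtypes following "pac-ret",
   as described by aarch64_pac_ret_subtypes.  */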
2923
2924 /* The condition codes of the processor, and the inverse function. */
2925 static const char * const aarch64_condition_codes[] =
2926 {
2927 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
2928 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
2929 };
2930
2931 /* The preferred condition codes for SVE conditions. */
2932 static const char *const aarch64_sve_condition_codes[] =
2933 {
2934 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
2935 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
2936 };
2937
2938 /* Return the assembly token for svpattern value VALUE. */
2939
2940 static const char *
2941 svpattern_token (enum aarch64_svpattern pattern)
2942 {
2943 switch (pattern)
2944 {
2945 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
2946 AARCH64_FOR_SVPATTERN (CASE)
2947 #undef CASE
2948 case AARCH64_NUM_SVPATTERNS:
2949 break;
2950 }
2951 gcc_unreachable ();
2952 }
2953
2954 /* Return the location of a piece that is known to be passed or returned
2955 in registers. FIRST_ZR is the first unused vector argument register
2956 and FIRST_PR is the first unused predicate argument register. */
2957
2958 rtx
2959 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
2960 unsigned int first_pr) const
2961 {
2962 gcc_assert (VECTOR_MODE_P (mode)
2963 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
2964 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
2965
2966 if (num_zr > 0 && num_pr == 0)
2967 return gen_rtx_REG (mode, first_zr);
2968
2969 if (num_zr == 0 && num_pr == 1)
2970 return gen_rtx_REG (mode, first_pr);
2971
2972 gcc_unreachable ();
2973 }
2974
2975 /* Return the total number of vector registers required by the PST. */
2976
2977 unsigned int
2978 pure_scalable_type_info::num_zr () const
2979 {
2980 unsigned int res = 0;
2981 for (unsigned int i = 0; i < pieces.length (); ++i)
2982 res += pieces[i].num_zr;
2983 return res;
2984 }
2985
2986 /* Return the total number of predicate registers required by the PST. */
2987
2988 unsigned int
2989 pure_scalable_type_info::num_pr () const
2990 {
2991 unsigned int res = 0;
2992 for (unsigned int i = 0; i < pieces.length (); ++i)
2993 res += pieces[i].num_pr;
2994 return res;
2995 }
2996
2997 /* Return the location of a PST that is known to be passed or returned
2998 in registers. FIRST_ZR is the first unused vector argument register
2999 and FIRST_PR is the first unused predicate argument register. */
3000
3001 rtx
3002 pure_scalable_type_info::get_rtx (machine_mode mode,
3003 unsigned int first_zr,
3004 unsigned int first_pr) const
3005 {
3006 /* Try to return a single REG if possible. This leads to better
3007 code generation; it isn't required for correctness. */
3008 if (mode == pieces[0].mode)
3009 {
3010 gcc_assert (pieces.length () == 1);
3011 return pieces[0].get_rtx (first_zr, first_pr);
3012 }
3013
3014 /* Build up a PARALLEL that contains the individual pieces. */
3015 rtvec rtxes = rtvec_alloc (pieces.length ());
3016 for (unsigned int i = 0; i < pieces.length (); ++i)
3017 {
3018 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
3019 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
3020 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
3021 first_zr += pieces[i].num_zr;
3022 first_pr += pieces[i].num_pr;
3023 }
3024 return gen_rtx_PARALLEL (mode, rtxes);
3025 }
3026
3027 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
3028 in the AAPCS64. */
3029
3030 pure_scalable_type_info::analysis_result
3031 pure_scalable_type_info::analyze (const_tree type)
3032 {
3033 /* Prevent accidental reuse. */
3034 gcc_assert (pieces.is_empty ());
3035
3036 /* No code will be generated for erroneous types, so we won't establish
3037 an ABI mapping. */
3038 if (type == error_mark_node)
3039 return NO_ABI_IDENTITY;
3040
3041 /* Zero-sized types disappear in the language->ABI mapping. */
3042 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3043 return NO_ABI_IDENTITY;
3044
3045 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
3046 piece p = {};
3047 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
3048 {
3049 machine_mode mode = TYPE_MODE_RAW (type);
3050 gcc_assert (VECTOR_MODE_P (mode)
3051 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
3052
3053 p.mode = p.orig_mode = mode;
3054 add_piece (p);
3055 return IS_PST;
3056 }
3057
3058 /* Check for user-defined PSTs. */
3059 if (TREE_CODE (type) == ARRAY_TYPE)
3060 return analyze_array (type);
3061 if (TREE_CODE (type) == RECORD_TYPE)
3062 return analyze_record (type);
3063
3064 return ISNT_PST;
3065 }
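/* A worked example of the analysis above (informal; the rules themselves
   come from the AAPCS64): given

     struct pst { svfloat32_t vec; svbool_t pred; };

   both fields are SVE built-in types, so the struct is a Pure Scalable
   Type with two pieces (one Z register and one P register) and is passed
   and returned in Z/P registers when enough argument registers are
   available.  Adding, say, an int field would stop the type from being a
   PST and the normal aggregate rules would apply instead.  */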
3066
3067 /* Analyze a type that is known not to be passed or returned in memory.
3068 Return true if it has an ABI identity and is a Pure Scalable Type. */
3069
3070 bool
3071 pure_scalable_type_info::analyze_registers (const_tree type)
3072 {
3073 analysis_result result = analyze (type);
3074 gcc_assert (result != DOESNT_MATTER);
3075 return result == IS_PST;
3076 }
3077
3078 /* Subroutine of analyze for handling ARRAY_TYPEs. */
3079
3080 pure_scalable_type_info::analysis_result
3081 pure_scalable_type_info::analyze_array (const_tree type)
3082 {
3083 /* Analyze the element type. */
3084 pure_scalable_type_info element_info;
3085 analysis_result result = element_info.analyze (TREE_TYPE (type));
3086 if (result != IS_PST)
3087 return result;
3088
3089 /* An array of unknown, flexible or variable length will be passed and
3090 returned by reference whatever we do. */
3091 tree nelts_minus_one = array_type_nelts (type);
3092 if (!tree_fits_uhwi_p (nelts_minus_one))
3093 return DOESNT_MATTER;
3094
3095 /* Likewise if the array is constant-sized but too big to be interesting.
3096 The double checks against MAX_PIECES are to protect against overflow. */
3097 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
3098 if (count > MAX_PIECES)
3099 return DOESNT_MATTER;
3100 count += 1;
3101 if (count * element_info.pieces.length () > MAX_PIECES)
3102 return DOESNT_MATTER;
3103
3104 /* The above checks should have weeded out elements of unknown size. */
3105 poly_uint64 element_bytes;
3106 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
3107 gcc_unreachable ();
3108
3109 /* Build up the list of individual vectors and predicates. */
3110 gcc_assert (!element_info.pieces.is_empty ());
3111 for (unsigned int i = 0; i < count; ++i)
3112 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
3113 {
3114 piece p = element_info.pieces[j];
3115 p.offset += i * element_bytes;
3116 add_piece (p);
3117 }
3118 return IS_PST;
3119 }
3120
3121 /* Subroutine of analyze for handling RECORD_TYPEs. */
3122
3123 pure_scalable_type_info::analysis_result
3124 pure_scalable_type_info::analyze_record (const_tree type)
3125 {
3126 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3127 {
3128 if (TREE_CODE (field) != FIELD_DECL)
3129 continue;
3130
3131 /* Zero-sized fields disappear in the language->ABI mapping. */
3132 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
3133 continue;
3134
3135 /* All fields with an ABI identity must be PSTs for the record as
3136 a whole to be a PST. If any individual field is too big to be
3137 interesting then the record is too. */
3138 pure_scalable_type_info field_info;
3139 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
3140 if (subresult == NO_ABI_IDENTITY)
3141 continue;
3142 if (subresult != IS_PST)
3143 return subresult;
3144
3145 /* Since all previous fields are PSTs, we ought to be able to track
3146 the field offset using poly_ints. */
3147 tree bitpos = bit_position (field);
3148 gcc_assert (poly_int_tree_p (bitpos));
3149
3150 /* For the same reason, it shouldn't be possible to create a PST field
3151 whose offset isn't byte-aligned. */
3152 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
3153 BITS_PER_UNIT);
3154
3155 /* Punt if the record is too big to be interesting. */
3156 poly_uint64 bytepos;
3157 if (!wide_bytepos.to_uhwi (&bytepos)
3158 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
3159 return DOESNT_MATTER;
3160
3161 /* Add the individual vectors and predicates in the field to the
3162 record's list. */
3163 gcc_assert (!field_info.pieces.is_empty ());
3164 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
3165 {
3166 piece p = field_info.pieces[i];
3167 p.offset += bytepos;
3168 add_piece (p);
3169 }
3170 }
3171 /* Empty structures disappear in the language->ABI mapping. */
3172 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
3173 }
3174
3175 /* Add P to the list of pieces in the type. */
3176
3177 void
3178 pure_scalable_type_info::add_piece (const piece &p)
3179 {
3180 /* Try to fold the new piece into the previous one to form a
3181 single-mode PST. For example, if we see three consecutive vectors
3182 of the same mode, we can represent them using the corresponding
3183 3-tuple mode.
3184
3185 This is purely an optimization. */
3186 if (!pieces.is_empty ())
3187 {
3188 piece &prev = pieces.last ();
3189 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
3190 unsigned int nelems1, nelems2;
3191 if (prev.orig_mode == p.orig_mode
3192 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
3193 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
3194 GET_MODE_NUNITS (p.orig_mode), &nelems1)
3195 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
3196 GET_MODE_NUNITS (p.orig_mode), &nelems2)
3197 && targetm.array_mode (p.orig_mode,
3198 nelems1 + nelems2).exists (&prev.mode))
3199 {
3200 prev.num_zr += p.num_zr;
3201 prev.num_pr += p.num_pr;
3202 return;
3203 }
3204 }
3205 pieces.quick_push (p);
3206 }
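/* Example of the folding above (a sketch; the exact tuple mode comes from
   targetm.array_mode): two adjacent svfloat32_t fields give two
   consecutive VNx4SFmode pieces whose offsets differ by one vector, and
   they are merged into a single piece using the corresponding 2-vector
   tuple mode (VNx8SFmode), with num_zr accumulated to 2.  A third
   adjacent vector extends the same piece to the 3-tuple mode rather than
   adding a new entry.  */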
3207
3208 /* Return true if at least one possible value of type TYPE includes at
3209 least one object of Pure Scalable Type, in the sense of the AAPCS64.
3210
3211 This is a relatively expensive test for some types, so it should
3212 generally be made as late as possible. */
3213
3214 static bool
3215 aarch64_some_values_include_pst_objects_p (const_tree type)
3216 {
3217 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3218 return false;
3219
3220 if (aarch64_sve::builtin_type_p (type))
3221 return true;
3222
3223 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
3224 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
3225
3226 if (RECORD_OR_UNION_TYPE_P (type))
3227 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3228 if (TREE_CODE (field) == FIELD_DECL
3229 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
3230 return true;
3231
3232 return false;
3233 }
3234
3235 /* Return the descriptor of the SIMD ABI. */
3236
3237 static const predefined_function_abi &
3238 aarch64_simd_abi (void)
3239 {
3240 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
3241 if (!simd_abi.initialized_p ())
3242 {
3243 HARD_REG_SET full_reg_clobbers
3244 = default_function_abi.full_reg_clobbers ();
3245 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
3246 if (FP_SIMD_SAVED_REGNUM_P (regno))
3247 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3248 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
3249 }
3250 return simd_abi;
3251 }
3252
3253 /* Return the descriptor of the SVE PCS. */
3254
3255 static const predefined_function_abi &
3256 aarch64_sve_abi (void)
3257 {
3258 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
3259 if (!sve_abi.initialized_p ())
3260 {
3261 HARD_REG_SET full_reg_clobbers
3262 = default_function_abi.full_reg_clobbers ();
3263 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
3264 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3265 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
3266 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3267 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
3268 }
3269 return sve_abi;
3270 }
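/* In other words (a summary of the clobber sets above, not a separate
   definition): relative to the base AAPCS64, the Advanced SIMD vector PCS
   additionally preserves the full q8-q23 registers across calls, and the
   SVE PCS preserves z8-z23 together with the predicate registers p4-p15.  */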
3271
3272 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
3273 wraps, otherwise return X itself. */
3274
3275 static rtx
3276 strip_salt (rtx x)
3277 {
3278 rtx search = x;
3279 if (GET_CODE (search) == CONST)
3280 search = XEXP (search, 0);
3281 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
3282 x = XVECEXP (search, 0, 0);
3283 return x;
3284 }
3285
3286 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
3287 expression. */
3288
3289 static rtx
3290 strip_offset_and_salt (rtx addr, poly_int64 *offset)
3291 {
3292 return strip_salt (strip_offset (addr, offset));
3293 }
3294
3295 /* Generate code to enable conditional branches in functions over 1 MiB. */
3296 const char *
3297 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
3298 const char * branch_format)
3299 {
3300 rtx_code_label * tmp_label = gen_label_rtx ();
3301 char label_buf[256];
3302 char buffer[128];
3303 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
3304 CODE_LABEL_NUMBER (tmp_label));
3305 const char *label_ptr = targetm.strip_name_encoding (label_buf);
3306 rtx dest_label = operands[pos_label];
3307 operands[pos_label] = tmp_label;
3308
3309 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
3310 output_asm_insn (buffer, operands);
3311
3312 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
3313 operands[pos_label] = dest_label;
3314 output_asm_insn (buffer, operands);
3315 return "";
3316 }
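/* Sketch of the sequence this emits (illustrative; the exact mnemonic and
   label come from BRANCH_FORMAT and DEST, and callers pass BRANCH_FORMAT
   with the condition already inverted):

       <inverted conditional branch>  .Lxx   ; skip the unconditional jump
       b       <original far destination>
     .Lxx:

   The conditional branch only needs to span two instructions, while the
   unconditional B can reach the out-of-range target.  */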
3317
3318 void
3319 aarch64_err_no_fpadvsimd (machine_mode mode)
3320 {
3321 if (TARGET_GENERAL_REGS_ONLY)
3322 if (FLOAT_MODE_P (mode))
3323 error ("%qs is incompatible with the use of floating-point types",
3324 "-mgeneral-regs-only");
3325 else
3326 error ("%qs is incompatible with the use of vector types",
3327 "-mgeneral-regs-only");
3328 else
3329 if (FLOAT_MODE_P (mode))
3330 error ("%qs feature modifier is incompatible with the use of"
3331 " floating-point types", "+nofp");
3332 else
3333 error ("%qs feature modifier is incompatible with the use of"
3334 " vector types", "+nofp");
3335 }
3336
3337 /* Report when we try to do something that requires SVE when SVE is disabled.
3338 This is an error of last resort and isn't very high-quality. It usually
3339 involves attempts to measure the vector length in some way. */
3340 static void
3341 aarch64_report_sve_required (void)
3342 {
3343 static bool reported_p = false;
3344
3345 /* Avoid reporting a slew of messages for a single oversight. */
3346 if (reported_p)
3347 return;
3348
3349 error ("this operation requires the SVE ISA extension");
3350 inform (input_location, "you can enable SVE using the command-line"
3351 " option %<-march%>, or by using the %<target%>"
3352 " attribute or pragma");
3353 reported_p = true;
3354 }
3355
3356 /* Return true if REGNO is P0-P15 or one of the special FFR-related
3357 registers. */
3358 inline bool
3359 pr_or_ffr_regnum_p (unsigned int regno)
3360 {
3361 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
3362 }
3363
3364 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
3365 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
3366 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
3367 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
3368 and GENERAL_REGS is lower than the memory cost (in this case the best class
3369 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
3370 cost results in bad allocations with many redundant int<->FP moves which
3371 are expensive on various cores.
3372 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
3373 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
3374 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
3375 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
3376 The result of this is that it is no longer inefficient to have a higher
3377 memory move cost than the register move cost.
3378 */
3379
3380 static reg_class_t
3381 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
3382 reg_class_t best_class)
3383 {
3384 machine_mode mode;
3385
3386 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
3387 || !reg_class_subset_p (FP_REGS, allocno_class))
3388 return allocno_class;
3389
3390 if (!reg_class_subset_p (GENERAL_REGS, best_class)
3391 || !reg_class_subset_p (FP_REGS, best_class))
3392 return best_class;
3393
3394 mode = PSEUDO_REGNO_MODE (regno);
3395 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
3396 }
3397
3398 static unsigned int
3399 aarch64_min_divisions_for_recip_mul (machine_mode mode)
3400 {
3401 if (GET_MODE_UNIT_SIZE (mode) == 4)
3402 return aarch64_tune_params.min_div_recip_mul_sf;
3403 return aarch64_tune_params.min_div_recip_mul_df;
3404 }
3405
3406 /* Return the reassociation width of treeop OPC with mode MODE. */
3407 static int
3408 aarch64_reassociation_width (unsigned opc, machine_mode mode)
3409 {
3410 if (VECTOR_MODE_P (mode))
3411 return aarch64_tune_params.vec_reassoc_width;
3412 if (INTEGRAL_MODE_P (mode))
3413 return aarch64_tune_params.int_reassoc_width;
3414 /* Reassociation reduces the number of FMAs which may result in worse
3415 performance. Use a per-CPU setting for FMA reassociation which allows
3416 narrow CPUs with few FP pipes to switch it off (value of 1), and wider
3417 CPUs with many FP pipes to enable reassociation.
3418 Since the reassociation pass doesn't understand FMA at all, assume
3419 that any FP addition might turn into FMA. */
3420 if (FLOAT_MODE_P (mode))
3421 return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
3422 : aarch64_tune_params.fp_reassoc_width;
3423 return 1;
3424 }
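/* For example (using the Neoverse N2 values above as an illustration):
   with fp_reassoc_width 4 and fma_reassoc_width 1, chains of FP
   multiplications can be split into up to four independent chains, but
   chains of FP additions are left alone (width 1), since any of those
   additions might later be contracted into an FMA.  */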
3425
3426 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
3427 unsigned
3428 aarch64_debugger_regno (unsigned regno)
3429 {
3430 if (GP_REGNUM_P (regno))
3431 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
3432 else if (regno == SP_REGNUM)
3433 return AARCH64_DWARF_SP;
3434 else if (FP_REGNUM_P (regno))
3435 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
3436 else if (PR_REGNUM_P (regno))
3437 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
3438 else if (regno == VG_REGNUM)
3439 return AARCH64_DWARF_VG;
3440
3441 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
3442 equivalent DWARF register. */
3443 return DWARF_FRAME_REGISTERS;
3444 }
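/* For reference (the numbers follow from the AARCH64_DWARF_* constants
   used above; treat this as a convenience summary): x0-x30 map to DWARF
   registers 0-30, sp to 31, v0-v31 to 64-95, p0-p15 to 48-63 and vg to
   46.  Everything else reports DWARF_FRAME_REGISTERS, i.e. "no DWARF
   equivalent".  */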
3445
3446 /* Implement TARGET_DWARF_FRAME_REG_MODE. */
3447 static machine_mode
3448 aarch64_dwarf_frame_reg_mode (int regno)
3449 {
3450 /* Predicate registers are call-clobbered in the EH ABI (which is
3451 ARM_PCS_AAPCS64), so they should not be described by CFI.
3452 Their size changes as VL changes, so any values computed by
3453 __builtin_init_dwarf_reg_size_table might not be valid for
3454 all frames. */
3455 if (PR_REGNUM_P (regno))
3456 return VOIDmode;
3457 return default_dwarf_frame_reg_mode (regno);
3458 }
3459
3460 /* If X is a CONST_DOUBLE, return its bit representation as a constant
3461 integer, otherwise return X unmodified. */
3462 static rtx
3463 aarch64_bit_representation (rtx x)
3464 {
3465 if (CONST_DOUBLE_P (x))
3466 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
3467 return x;
3468 }
3469
3470 /* Return an estimate for the number of quadwords in an SVE vector. This is
3471 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
3472 static unsigned int
3473 aarch64_estimated_sve_vq ()
3474 {
3475 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
3476 }
3477
3478 /* Return true if MODE is an SVE predicate mode. */
3479 static bool
3480 aarch64_sve_pred_mode_p (machine_mode mode)
3481 {
3482 return (TARGET_SVE
3483 && (mode == VNx16BImode
3484 || mode == VNx8BImode
3485 || mode == VNx4BImode
3486 || mode == VNx2BImode));
3487 }
3488
3489 /* Three mutually-exclusive flags describing a vector or predicate type. */
3490 const unsigned int VEC_ADVSIMD = 1;
3491 const unsigned int VEC_SVE_DATA = 2;
3492 const unsigned int VEC_SVE_PRED = 4;
3493 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
3494 a structure of 2, 3 or 4 vectors. */
3495 const unsigned int VEC_STRUCT = 8;
3496 /* Can be used in combination with VEC_SVE_DATA to indicate that the
3497 vector has fewer significant bytes than a full SVE vector. */
3498 const unsigned int VEC_PARTIAL = 16;
3499 /* Useful combinations of the above. */
3500 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
3501 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
3502
3503 /* Return a set of flags describing the vector properties of mode MODE.
3504 Ignore modes that are not supported by the current target. */
3505 static unsigned int
3506 aarch64_classify_vector_mode (machine_mode mode)
3507 {
3508 if (aarch64_sve_pred_mode_p (mode))
3509 return VEC_SVE_PRED;
3510
3511 /* Make the decision based on the mode's enum value rather than its
3512 properties, so that we keep the correct classification regardless
3513 of -msve-vector-bits. */
3514 switch (mode)
3515 {
3516 /* Partial SVE QI vectors. */
3517 case E_VNx2QImode:
3518 case E_VNx4QImode:
3519 case E_VNx8QImode:
3520 /* Partial SVE HI vectors. */
3521 case E_VNx2HImode:
3522 case E_VNx4HImode:
3523 /* Partial SVE SI vector. */
3524 case E_VNx2SImode:
3525 /* Partial SVE HF vectors. */
3526 case E_VNx2HFmode:
3527 case E_VNx4HFmode:
3528 /* Partial SVE BF vectors. */
3529 case E_VNx2BFmode:
3530 case E_VNx4BFmode:
3531 /* Partial SVE SF vector. */
3532 case E_VNx2SFmode:
3533 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
3534
3535 case E_VNx16QImode:
3536 case E_VNx8HImode:
3537 case E_VNx4SImode:
3538 case E_VNx2DImode:
3539 case E_VNx8BFmode:
3540 case E_VNx8HFmode:
3541 case E_VNx4SFmode:
3542 case E_VNx2DFmode:
3543 return TARGET_SVE ? VEC_SVE_DATA : 0;
3544
3545 /* x2 SVE vectors. */
3546 case E_VNx32QImode:
3547 case E_VNx16HImode:
3548 case E_VNx8SImode:
3549 case E_VNx4DImode:
3550 case E_VNx16BFmode:
3551 case E_VNx16HFmode:
3552 case E_VNx8SFmode:
3553 case E_VNx4DFmode:
3554 /* x3 SVE vectors. */
3555 case E_VNx48QImode:
3556 case E_VNx24HImode:
3557 case E_VNx12SImode:
3558 case E_VNx6DImode:
3559 case E_VNx24BFmode:
3560 case E_VNx24HFmode:
3561 case E_VNx12SFmode:
3562 case E_VNx6DFmode:
3563 /* x4 SVE vectors. */
3564 case E_VNx64QImode:
3565 case E_VNx32HImode:
3566 case E_VNx16SImode:
3567 case E_VNx8DImode:
3568 case E_VNx32BFmode:
3569 case E_VNx32HFmode:
3570 case E_VNx16SFmode:
3571 case E_VNx8DFmode:
3572 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
3573
3574 case E_OImode:
3575 case E_CImode:
3576 case E_XImode:
3577 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
3578
3579 /* Structures of 64-bit Advanced SIMD vectors. */
3580 case E_V2x8QImode:
3581 case E_V2x4HImode:
3582 case E_V2x2SImode:
3583 case E_V2x1DImode:
3584 case E_V2x4BFmode:
3585 case E_V2x4HFmode:
3586 case E_V2x2SFmode:
3587 case E_V2x1DFmode:
3588 case E_V3x8QImode:
3589 case E_V3x4HImode:
3590 case E_V3x2SImode:
3591 case E_V3x1DImode:
3592 case E_V3x4BFmode:
3593 case E_V3x4HFmode:
3594 case E_V3x2SFmode:
3595 case E_V3x1DFmode:
3596 case E_V4x8QImode:
3597 case E_V4x4HImode:
3598 case E_V4x2SImode:
3599 case E_V4x1DImode:
3600 case E_V4x4BFmode:
3601 case E_V4x4HFmode:
3602 case E_V4x2SFmode:
3603 case E_V4x1DFmode:
3604 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
3605
3606 /* Structures of 128-bit Advanced SIMD vectors. */
3607 case E_V2x16QImode:
3608 case E_V2x8HImode:
3609 case E_V2x4SImode:
3610 case E_V2x2DImode:
3611 case E_V2x8BFmode:
3612 case E_V2x8HFmode:
3613 case E_V2x4SFmode:
3614 case E_V2x2DFmode:
3615 case E_V3x16QImode:
3616 case E_V3x8HImode:
3617 case E_V3x4SImode:
3618 case E_V3x2DImode:
3619 case E_V3x8BFmode:
3620 case E_V3x8HFmode:
3621 case E_V3x4SFmode:
3622 case E_V3x2DFmode:
3623 case E_V4x16QImode:
3624 case E_V4x8HImode:
3625 case E_V4x4SImode:
3626 case E_V4x2DImode:
3627 case E_V4x8BFmode:
3628 case E_V4x8HFmode:
3629 case E_V4x4SFmode:
3630 case E_V4x2DFmode:
3631 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
3632
3633 /* 64-bit Advanced SIMD vectors. */
3634 case E_V8QImode:
3635 case E_V4HImode:
3636 case E_V2SImode:
3637 case E_V1DImode:
3638 case E_V4HFmode:
3639 case E_V4BFmode:
3640 case E_V2SFmode:
3641 case E_V1DFmode:
3642 /* 128-bit Advanced SIMD vectors. */
3643 case E_V16QImode:
3644 case E_V8HImode:
3645 case E_V4SImode:
3646 case E_V2DImode:
3647 case E_V8HFmode:
3648 case E_V8BFmode:
3649 case E_V4SFmode:
3650 case E_V2DFmode:
3651 return TARGET_FLOAT ? VEC_ADVSIMD : 0;
3652
3653 default:
3654 return 0;
3655 }
3656 }
3657
3658 /* Return true if MODE is any of the Advanced SIMD structure modes. */
3659 bool
3660 aarch64_advsimd_struct_mode_p (machine_mode mode)
3661 {
3662 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3663 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
3664 }
3665
3666 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
3667 static bool
3668 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
3669 {
3670 return (aarch64_classify_vector_mode (mode)
3671 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
3672 }
3673
3674 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
3675 static bool
3676 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
3677 {
3678 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
3679 }
3680
3681 /* Return true if MODE is any of the data vector modes, including
3682 structure modes. */
3683 static bool
3684 aarch64_vector_data_mode_p (machine_mode mode)
3685 {
3686 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
3687 }
3688
3689 /* Return true if MODE is any form of SVE mode, including predicates,
3690 vectors and structures. */
3691 bool
3692 aarch64_sve_mode_p (machine_mode mode)
3693 {
3694 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
3695 }
3696
3697 /* Return true if MODE is an SVE data vector mode; either a single vector
3698 or a structure of vectors. */
3699 static bool
3700 aarch64_sve_data_mode_p (machine_mode mode)
3701 {
3702 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
3703 }
3704
3705 /* Return the number of defined bytes in one constituent vector of
3706 SVE mode MODE, which has vector flags VEC_FLAGS. */
3707 static poly_int64
3708 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
3709 {
3710 if (vec_flags & VEC_PARTIAL)
3711 /* A single partial vector. */
3712 return GET_MODE_SIZE (mode);
3713
3714 if (vec_flags & VEC_SVE_DATA)
3715 /* A single vector or a tuple. */
3716 return BYTES_PER_SVE_VECTOR;
3717
3718 /* A single predicate. */
3719 gcc_assert (vec_flags & VEC_SVE_PRED);
3720 return BYTES_PER_SVE_PRED;
3721 }
3722
3723 /* If MODE holds an array of vectors, return the number of vectors
3724 in the array, otherwise return 1. */
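/* For example, V3x16QImode (an Advanced SIMD Q-register triple) has
   size 48 and yields 3, VNx32QImode (an SVE pair) yields 2, and a
   plain vector mode such as V16QImode yields 1.  */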
3725
3726 static unsigned int
3727 aarch64_ldn_stn_vectors (machine_mode mode)
3728 {
3729 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3730 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
3731 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
3732 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
3733 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
3734 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
3735 return exact_div (GET_MODE_SIZE (mode),
3736 BYTES_PER_SVE_VECTOR).to_constant ();
3737 return 1;
3738 }
3739
3740 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
3741 corresponding vector structure mode. */
3742 static opt_machine_mode
3743 aarch64_advsimd_vector_array_mode (machine_mode mode,
3744 unsigned HOST_WIDE_INT nelems)
3745 {
3746 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
3747 if (known_eq (GET_MODE_SIZE (mode), 8))
3748 flags |= VEC_PARTIAL;
3749
3750 machine_mode struct_mode;
3751 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
3752 if (aarch64_classify_vector_mode (struct_mode) == flags
3753 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
3754 && known_eq (GET_MODE_NUNITS (struct_mode),
3755 GET_MODE_NUNITS (mode) * nelems))
3756 return struct_mode;
3757 return opt_machine_mode ();
3758 }
3759
3760 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
3761
3762 opt_machine_mode
3763 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
3764 {
3765 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
3766 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
3767 machine_mode mode;
3768 FOR_EACH_MODE_IN_CLASS (mode, mclass)
3769 if (inner_mode == GET_MODE_INNER (mode)
3770 && known_eq (nunits, GET_MODE_NUNITS (mode))
3771 && aarch64_sve_data_mode_p (mode))
3772 return mode;
3773 return opt_machine_mode ();
3774 }
3775
3776 /* Implement target hook TARGET_ARRAY_MODE. */
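/* For example, an array of 3 V4SImode vectors maps to V3x4SImode and
   an array of 2 VNx4SImode vectors maps to VNx8SImode; other element
   counts return no mode.  */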
3777 static opt_machine_mode
3778 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
3779 {
3780 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
3781 && IN_RANGE (nelems, 2, 4))
3782 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
3783 GET_MODE_NUNITS (mode) * nelems);
3784 if (aarch64_classify_vector_mode (mode) == VEC_ADVSIMD
3785 && IN_RANGE (nelems, 2, 4))
3786 return aarch64_advsimd_vector_array_mode (mode, nelems);
3787
3788 return opt_machine_mode ();
3789 }
3790
3791 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
3792 static bool
3793 aarch64_array_mode_supported_p (machine_mode mode,
3794 unsigned HOST_WIDE_INT nelems)
3795 {
3796 if (TARGET_SIMD
3797 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
3798 || AARCH64_VALID_SIMD_DREG_MODE (mode))
3799 && (nelems >= 2 && nelems <= 4))
3800 return true;
3801
3802 return false;
3803 }
3804
3805 /* MODE is some form of SVE vector mode. For data modes, return the number
3806 of vector register bits that each element of MODE occupies, such as 64
3807 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
3808 in a 64-bit container). For predicate modes, return the number of
3809 data bits controlled by each significant predicate bit. */
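/* For example, each significant bit of a VNx4BImode predicate controls
   32 data bits, while each bit of VNx16BImode controls 8.  */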
3810
3811 static unsigned int
3812 aarch64_sve_container_bits (machine_mode mode)
3813 {
3814 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3815 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
3816 ? BITS_PER_SVE_VECTOR
3817 : GET_MODE_BITSIZE (mode));
3818 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
3819 }
3820
3821 /* Return the SVE predicate mode to use for elements that have
3822 ELEM_NBYTES bytes, if such a mode exists. */
3823
3824 opt_machine_mode
3825 aarch64_sve_pred_mode (unsigned int elem_nbytes)
3826 {
3827 if (TARGET_SVE)
3828 {
3829 if (elem_nbytes == 1)
3830 return VNx16BImode;
3831 if (elem_nbytes == 2)
3832 return VNx8BImode;
3833 if (elem_nbytes == 4)
3834 return VNx4BImode;
3835 if (elem_nbytes == 8)
3836 return VNx2BImode;
3837 }
3838 return opt_machine_mode ();
3839 }
3840
3841 /* Return the SVE predicate mode that should be used to control
3842 SVE mode MODE. */
3843
3844 machine_mode
3845 aarch64_sve_pred_mode (machine_mode mode)
3846 {
3847 unsigned int bits = aarch64_sve_container_bits (mode);
3848 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
3849 }
3850
3851 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
3852
3853 static opt_machine_mode
3854 aarch64_get_mask_mode (machine_mode mode)
3855 {
3856 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3857 if (vec_flags & VEC_SVE_DATA)
3858 return aarch64_sve_pred_mode (mode);
3859
3860 return default_get_mask_mode (mode);
3861 }
3862
3863 /* Return the integer element mode associated with SVE mode MODE. */
3864
3865 static scalar_int_mode
3866 aarch64_sve_element_int_mode (machine_mode mode)
3867 {
3868 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3869 ? BITS_PER_SVE_VECTOR
3870 : GET_MODE_BITSIZE (mode));
3871 unsigned int elt_bits = vector_element_size (vector_bits,
3872 GET_MODE_NUNITS (mode));
3873 return int_mode_for_size (elt_bits, 0).require ();
3874 }
3875
3876 /* Return an integer element mode that contains exactly
3877 aarch64_sve_container_bits (MODE) bits. This is wider than
3878 aarch64_sve_element_int_mode if MODE is a partial vector,
3879 otherwise it's the same. */
3880
3881 static scalar_int_mode
3882 aarch64_sve_container_int_mode (machine_mode mode)
3883 {
3884 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
3885 }
3886
3887 /* Return the integer vector mode associated with SVE mode MODE.
3888 Unlike related_int_vector_mode, this can handle the case in which
3889 MODE is a predicate (and thus has a different total size). */
3890
3891 machine_mode
3892 aarch64_sve_int_mode (machine_mode mode)
3893 {
3894 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
3895 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
3896 }
3897
3898 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
3899
3900 static opt_machine_mode
3901 aarch64_vectorize_related_mode (machine_mode vector_mode,
3902 scalar_mode element_mode,
3903 poly_uint64 nunits)
3904 {
3905 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
3906
3907 /* If we're operating on SVE vectors, try to return an SVE mode. */
3908 poly_uint64 sve_nunits;
3909 if ((vec_flags & VEC_SVE_DATA)
3910 && multiple_p (BYTES_PER_SVE_VECTOR,
3911 GET_MODE_SIZE (element_mode), &sve_nunits))
3912 {
3913 machine_mode sve_mode;
3914 if (maybe_ne (nunits, 0U))
3915 {
3916 /* Try to find a full or partial SVE mode with exactly
3917 NUNITS units. */
3918 if (multiple_p (sve_nunits, nunits)
3919 && aarch64_sve_data_mode (element_mode,
3920 nunits).exists (&sve_mode))
3921 return sve_mode;
3922 }
3923 else
3924 {
3925 /* Take the preferred number of units from the number of bytes
3926 that fit in VECTOR_MODE. We always start by "autodetecting"
3927 a full vector mode with preferred_simd_mode, so vectors
3928 chosen here will also be full vector modes. Then
3929 autovectorize_vector_modes tries smaller starting modes
3930 and thus smaller preferred numbers of units. */
3931 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
3932 if (aarch64_sve_data_mode (element_mode,
3933 sve_nunits).exists (&sve_mode))
3934 return sve_mode;
3935 }
3936 }
3937
3938 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
3939 if (TARGET_SIMD
3940 && (vec_flags & VEC_ADVSIMD)
3941 && known_eq (nunits, 0U)
3942 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
3943 && maybe_ge (GET_MODE_BITSIZE (element_mode)
3944 * GET_MODE_NUNITS (vector_mode), 128U))
3945 {
3946 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
3947 if (VECTOR_MODE_P (res))
3948 return res;
3949 }
3950
3951 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
3952 }
3953
3954 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
3955 prefer to use the first arithmetic operand as the else value if
3956 the else value doesn't matter, since that exactly matches the SVE
3957 destructive merging form. For ternary operations we could either
3958 pick the first operand and use FMAD-like instructions or the last
3959 operand and use FMLA-like instructions; the latter seems more
3960 natural. */
3961
3962 static tree
3963 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
3964 {
3965 return nops == 3 ? ops[2] : ops[0];
3966 }
3967
3968 /* Implement TARGET_HARD_REGNO_NREGS. */
3969
3970 static unsigned int
3971 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
3972 {
3973 /* ??? Logically we should only need to provide a value when
3974 HARD_REGNO_MODE_OK says that the combination is valid,
3975 but at the moment we need to handle all modes. Just ignore
3976 any runtime parts for registers that can't store them. */
3977 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
3978 switch (aarch64_regno_regclass (regno))
3979 {
3980 case FP_REGS:
3981 case FP_LO_REGS:
3982 case FP_LO8_REGS:
3983 {
3984 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3985 if (vec_flags & VEC_SVE_DATA)
3986 return exact_div (GET_MODE_SIZE (mode),
3987 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
3988 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
3989 return GET_MODE_SIZE (mode).to_constant () / 8;
3990 return CEIL (lowest_size, UNITS_PER_VREG);
3991 }
3992 case PR_REGS:
3993 case PR_LO_REGS:
3994 case PR_HI_REGS:
3995 case FFR_REGS:
3996 case PR_AND_FFR_REGS:
3997 return 1;
3998 default:
3999 return CEIL (lowest_size, UNITS_PER_WORD);
4000 }
4001 gcc_unreachable ();
4002 }
4003
4004 /* Implement TARGET_HARD_REGNO_MODE_OK. */
4005
4006 static bool
4007 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
4008 {
4009 if (mode == V8DImode)
4010 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
4011 && multiple_p (regno - R0_REGNUM, 2);
4012
4013 if (GET_MODE_CLASS (mode) == MODE_CC)
4014 return regno == CC_REGNUM;
4015
4016 if (regno == VG_REGNUM)
4017 /* This must have the same size as _Unwind_Word. */
4018 return mode == DImode;
4019
4020 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4021 if (vec_flags & VEC_SVE_PRED)
4022 return pr_or_ffr_regnum_p (regno);
4023
4024 if (pr_or_ffr_regnum_p (regno))
4025 return false;
4026
4027 if (regno == SP_REGNUM)
4028 /* The purpose of comparing with ptr_mode is to support the
4029 global register variable associated with the stack pointer
4030 register via the syntax of asm ("wsp") in ILP32. */
4031 return mode == Pmode || mode == ptr_mode;
4032
4033 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
4034 return mode == Pmode;
4035
4036 if (GP_REGNUM_P (regno))
4037 {
4038 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
4039 return false;
4040 if (known_le (GET_MODE_SIZE (mode), 8))
4041 return true;
4042 if (known_le (GET_MODE_SIZE (mode), 16))
4043 return (regno & 1) == 0;
4044 }
4045 else if (FP_REGNUM_P (regno))
4046 {
4047 if (vec_flags & VEC_STRUCT)
4048 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
4049 else
4050 return !VECTOR_MODE_P (mode) || vec_flags != 0;
4051 }
4052
4053 return false;
4054 }
4055
4056 /* Return true if a function with type FNTYPE returns its value in
4057 SVE vector or predicate registers. */
4058
4059 static bool
4060 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
4061 {
4062 tree return_type = TREE_TYPE (fntype);
4063
4064 pure_scalable_type_info pst_info;
4065 switch (pst_info.analyze (return_type))
4066 {
4067 case pure_scalable_type_info::IS_PST:
4068 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
4069 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
4070
4071 case pure_scalable_type_info::DOESNT_MATTER:
4072 gcc_assert (aarch64_return_in_memory_1 (return_type));
4073 return false;
4074
4075 case pure_scalable_type_info::NO_ABI_IDENTITY:
4076 case pure_scalable_type_info::ISNT_PST:
4077 return false;
4078 }
4079 gcc_unreachable ();
4080 }
4081
4082 /* Return true if a function with type FNTYPE takes arguments in
4083 SVE vector or predicate registers. */
4084
4085 static bool
4086 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
4087 {
4088 CUMULATIVE_ARGS args_so_far_v;
4089 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
4090 NULL_TREE, 0, true);
4091 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
4092
4093 for (tree chain = TYPE_ARG_TYPES (fntype);
4094 chain && chain != void_list_node;
4095 chain = TREE_CHAIN (chain))
4096 {
4097 tree arg_type = TREE_VALUE (chain);
4098 if (arg_type == error_mark_node)
4099 return false;
4100
4101 function_arg_info arg (arg_type, /*named=*/true);
4102 apply_pass_by_reference_rules (&args_so_far_v, arg);
4103 pure_scalable_type_info pst_info;
4104 if (pst_info.analyze_registers (arg.type))
4105 {
4106 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
4107 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
4108 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
4109 return true;
4110 }
4111
4112 targetm.calls.function_arg_advance (args_so_far, arg);
4113 }
4114 return false;
4115 }
4116
4117 /* Implement TARGET_FNTYPE_ABI. */
4118
4119 static const predefined_function_abi &
4120 aarch64_fntype_abi (const_tree fntype)
4121 {
4122 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
4123 return aarch64_simd_abi ();
4124
4125 if (aarch64_returns_value_in_sve_regs_p (fntype)
4126 || aarch64_takes_arguments_in_sve_regs_p (fntype))
4127 return aarch64_sve_abi ();
4128
4129 return default_function_abi;
4130 }
4131
4132 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
4133
4134 static bool
4135 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
4136 {
4137 return (aarch64_sve::builtin_type_p (type1)
4138 == aarch64_sve::builtin_type_p (type2));
4139 }
4140
4141 /* Return true if we should emit CFI for register REGNO. */
4142
4143 static bool
4144 aarch64_emit_cfi_for_reg_p (unsigned int regno)
4145 {
4146 return (GP_REGNUM_P (regno)
4147 || !default_function_abi.clobbers_full_reg_p (regno));
4148 }
4149
4150 /* Return the mode we should use to save and restore register REGNO. */
4151
4152 static machine_mode
4153 aarch64_reg_save_mode (unsigned int regno)
4154 {
4155 if (GP_REGNUM_P (regno))
4156 return DImode;
4157
4158 if (FP_REGNUM_P (regno))
4159 switch (crtl->abi->id ())
4160 {
4161 case ARM_PCS_AAPCS64:
4162 /* Only the low 64 bits are saved by the base PCS. */
4163 return DFmode;
4164
4165 case ARM_PCS_SIMD:
4166 /* The vector PCS saves the low 128 bits (which is the full
4167 register on non-SVE targets). */
4168 return TFmode;
4169
4170 case ARM_PCS_SVE:
4171 /* Use vectors of DImode for registers that need frame
4172 information, so that the first 64 bits of the save slot

4173 are always the equivalent of what storing D<n> would give. */
4174 if (aarch64_emit_cfi_for_reg_p (regno))
4175 return VNx2DImode;
4176
4177 /* Use vectors of bytes otherwise, so that the layout is
4178 endian-agnostic, and so that we can use LDR and STR for
4179 big-endian targets. */
4180 return VNx16QImode;
4181
4182 case ARM_PCS_TLSDESC:
4183 case ARM_PCS_UNKNOWN:
4184 break;
4185 }
4186
4187 if (PR_REGNUM_P (regno))
4188 /* Save the full predicate register. */
4189 return VNx16BImode;
4190
4191 gcc_unreachable ();
4192 }
4193
4194 /* Implement TARGET_INSN_CALLEE_ABI. */
4195
4196 const predefined_function_abi &
4197 aarch64_insn_callee_abi (const rtx_insn *insn)
4198 {
4199 rtx pat = PATTERN (insn);
4200 gcc_assert (GET_CODE (pat) == PARALLEL);
4201 rtx unspec = XVECEXP (pat, 0, 1);
4202 gcc_assert (GET_CODE (unspec) == UNSPEC
4203 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
4204 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
4205 }
4206
4207 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
4208 the lower 64 bits of a 128-bit register. Tell the compiler the callee
4209 clobbers the top 64 bits when restoring the bottom 64 bits. */
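/* For example, under the base PCS (ARM_PCS_AAPCS64) a V16QImode value
   in a vector register is partially clobbered by a call, since its
   per-register size of 16 bytes exceeds the 8 callee-saved bytes;
   under ARM_PCS_SIMD it is not, since 16 does not exceed 16.  */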
4210
4211 static bool
4212 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
4213 unsigned int regno,
4214 machine_mode mode)
4215 {
4216 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
4217 {
4218 poly_int64 per_register_size = GET_MODE_SIZE (mode);
4219 unsigned int nregs = hard_regno_nregs (regno, mode);
4220 if (nregs > 1)
4221 per_register_size = exact_div (per_register_size, nregs);
4222 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
4223 return maybe_gt (per_register_size, 16);
4224 return maybe_gt (per_register_size, 8);
4225 }
4226 return false;
4227 }
4228
4229 /* Implement REGMODE_NATURAL_SIZE. */
4230 poly_uint64
4231 aarch64_regmode_natural_size (machine_mode mode)
4232 {
4233 /* The natural size for SVE data modes is one SVE data vector,
4234 and similarly for predicates. We can't independently modify
4235 anything smaller than that. */
4236 /* ??? For now, only do this for variable-width SVE registers.
4237 Doing it for constant-sized registers breaks lower-subreg.cc. */
4238 /* ??? And once that's fixed, we should probably have similar
4239 code for Advanced SIMD. */
4240 if (!aarch64_sve_vg.is_constant ())
4241 {
4242 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4243 if (vec_flags & VEC_SVE_PRED)
4244 return BYTES_PER_SVE_PRED;
4245 if (vec_flags & VEC_SVE_DATA)
4246 return BYTES_PER_SVE_VECTOR;
4247 }
4248 return UNITS_PER_WORD;
4249 }
4250
4251 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
4252 machine_mode
4253 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
4254 machine_mode mode)
4255 {
4256 /* The predicate mode determines which bits are significant and
4257 which are "don't care". Decreasing the number of lanes would
4258 lose data while increasing the number of lanes would make bits
4259 unnecessarily significant. */
4260 if (PR_REGNUM_P (regno))
4261 return mode;
4262 if (known_ge (GET_MODE_SIZE (mode), 4))
4263 return mode;
4264 else
4265 return SImode;
4266 }
4267
4268 /* Return true if I's bits are consecutive ones from the MSB. */
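/* For example, I == 0xffffffffffffff00 gives -I == 0x100, a power of
   two, so the result is true; I == 0 or I == 0xff00ff0000000000 gives
   a negation that is not a power of two, so the result is false.  */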
4269 bool
4270 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
4271 {
4272 return exact_log2 (-i) != HOST_WIDE_INT_M1;
4273 }
4274
4275 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
4276 that strcpy from constants will be faster. */
4277
4278 static HOST_WIDE_INT
4279 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
4280 {
4281 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
4282 return MAX (align, BITS_PER_WORD);
4283 return align;
4284 }
4285
4286 /* Return true if calls to DECL should be treated as
4287 long-calls (i.e. called via a register). */
4288 static bool
4289 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
4290 {
4291 return false;
4292 }
4293
4294 /* Return true if calls to symbol-ref SYM should be treated as
4295 long-calls (i.e. called via a register). */
4296 bool
4297 aarch64_is_long_call_p (rtx sym)
4298 {
4299 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
4300 }
4301
4302 /* Return true if calls to symbol-ref SYM should not go through
4303 plt stubs. */
4304
4305 bool
4306 aarch64_is_noplt_call_p (rtx sym)
4307 {
4308 const_tree decl = SYMBOL_REF_DECL (sym);
4309
4310 if (flag_pic
4311 && decl
4312 && (!flag_plt
4313 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
4314 && !targetm.binds_local_p (decl))
4315 return true;
4316
4317 return false;
4318 }
4319
4320 /* Emit an insn that's a simple single-set. Both the operands must be
4321 known to be valid. */
4322 inline static rtx_insn *
4323 emit_set_insn (rtx x, rtx y)
4324 {
4325 return emit_insn (gen_rtx_SET (x, y));
4326 }
4327
4328 /* X and Y are two things to compare using CODE. Emit the compare insn and
4329 return the rtx for register 0 in the proper mode. */
4330 rtx
4331 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
4332 {
4333 machine_mode cmp_mode = GET_MODE (x);
4334 machine_mode cc_mode;
4335 rtx cc_reg;
4336
4337 if (cmp_mode == TImode)
4338 {
4339 gcc_assert (code == NE);
4340
4341 cc_mode = CCmode;
4342 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4343
4344 rtx x_lo = operand_subword (x, 0, 0, TImode);
4345 rtx y_lo = operand_subword (y, 0, 0, TImode);
4346 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
4347
4348 rtx x_hi = operand_subword (x, 1, 0, TImode);
4349 rtx y_hi = operand_subword (y, 1, 0, TImode);
4350 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
4351 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
4352 GEN_INT (AARCH64_EQ)));
4353 }
4354 else
4355 {
4356 cc_mode = SELECT_CC_MODE (code, x, y);
4357 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4358 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
4359 }
4360 return cc_reg;
4361 }
4362
4363 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
4364
4365 static rtx
4366 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
4367 machine_mode y_mode)
4368 {
4369 if (y_mode == E_QImode || y_mode == E_HImode)
4370 {
4371 if (CONST_INT_P (y))
4372 {
4373 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
4374 y_mode = SImode;
4375 }
4376 else
4377 {
4378 rtx t, cc_reg;
4379 machine_mode cc_mode;
4380
4381 t = gen_rtx_ZERO_EXTEND (SImode, y);
4382 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
4383 cc_mode = CC_SWPmode;
4384 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4385 emit_set_insn (cc_reg, t);
4386 return cc_reg;
4387 }
4388 }
4389
4390 if (!aarch64_plus_operand (y, y_mode))
4391 y = force_reg (y_mode, y);
4392
4393 return aarch64_gen_compare_reg (code, x, y);
4394 }
4395
4396 /* Consider the operation:
4397
4398 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
4399
4400 where:
4401
4402 - CODE is [SU]MAX or [SU]MIN
4403 - OPERANDS[2] and OPERANDS[3] are constant integers
4404 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
4405 - all operands have mode MODE
4406
4407 Decide whether it is possible to implement the operation using:
4408
4409 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
4410 or
4411 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
4412
4413 followed by:
4414
4415 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
4416
4417 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
4418 If GENERATE_P is true, also update OPERANDS as follows:
4419
4420 OPERANDS[4] = -OPERANDS[3]
4421 OPERANDS[5] = the rtl condition representing <cond>
4422 OPERANDS[6] = <tmp>
4423 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
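/* Illustrative example: for SMAX with OPERANDS[2] == 10 and
   OPERANDS[3] == -11 (i.e. smax (x, 10) - 11), SUB_VAL is 11 and DIFF
   is -1, which is accepted because 10 < 11.  The expansion is
   SUBS <tmp>, x, 11 followed by CSINV <dst>, <tmp>, xzr, GE, giving
   x - 11 when x >= 11 and -1 otherwise.  */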
4424 bool
4425 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
4426 {
4427 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
4428 rtx dst = operands[0];
4429 rtx maxmin_op = operands[2];
4430 rtx add_op = operands[3];
4431 machine_mode mode = GET_MODE (dst);
4432
4433 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
4434 == (x >= y ? x : y) - z
4435 == (x > y ? x : y) - z
4436 == (x > y - 1 ? x : y) - z
4437
4438 min (x, y) - z == (x <= y - 1 ? x : y) - z
4439 == (x <= y ? x : y) - z
4440 == (x < y ? x : y) - z
4441 == (x < y + 1 ? x : y) - z
4442
4443 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
4444 which x is compared with z. Set DIFF to y - z. Thus the supported
4445 combinations are as follows, with DIFF being the value after the ":":
4446
4447 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
4448 == x >= y ? x - y : 0 [z == y]
4449 == x > y ? x - y : 0 [z == y]
4450 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
4451
4452 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
4453 == x <= y ? x - y : 0 [z == y]
4454 == x < y ? x - y : 0 [z == y]
4455 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
4456 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
4457 auto add_val = rtx_mode_t (add_op, mode);
4458 auto sub_val = wi::neg (add_val);
4459 auto diff = wi::sub (maxmin_val, sub_val);
4460 if (!(diff == 0
4461 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
4462 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
4463 return false;
4464
4465 if (!generate_p)
4466 return true;
4467
4468 rtx_code cmp;
4469 switch (code)
4470 {
4471 case SMAX:
4472 cmp = diff == 1 ? GT : GE;
4473 break;
4474 case UMAX:
4475 cmp = diff == 1 ? GTU : GEU;
4476 break;
4477 case SMIN:
4478 cmp = diff == -1 ? LT : LE;
4479 break;
4480 case UMIN:
4481 cmp = diff == -1 ? LTU : LEU;
4482 break;
4483 default:
4484 gcc_unreachable ();
4485 }
4486 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
4487
4488 operands[4] = immed_wide_int_const (sub_val, mode);
4489 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
4490 if (can_create_pseudo_p ())
4491 operands[6] = gen_reg_rtx (mode);
4492 else
4493 operands[6] = dst;
4494 operands[7] = immed_wide_int_const (diff, mode);
4495
4496 return true;
4497 }
4498
4499
4500 /* Build the SYMBOL_REF for __tls_get_addr. */
4501
4502 static GTY(()) rtx tls_get_addr_libfunc;
4503
4504 rtx
4505 aarch64_tls_get_addr (void)
4506 {
4507 if (!tls_get_addr_libfunc)
4508 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
4509 return tls_get_addr_libfunc;
4510 }
4511
4512 /* Return the TLS model to use for ADDR. */
4513
4514 static enum tls_model
4515 tls_symbolic_operand_type (rtx addr)
4516 {
4517 enum tls_model tls_kind = TLS_MODEL_NONE;
4518 poly_int64 offset;
4519 addr = strip_offset_and_salt (addr, &offset);
4520 if (SYMBOL_REF_P (addr))
4521 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
4522
4523 return tls_kind;
4524 }
4525
4526 /* We allow lo_sum's in our legitimate addresses so that combine
4527 can take care of combining addresses where necessary, but for
4528 generation purposes, we generate the address
4529 as:
4530 RTL Absolute
4531 tmp = hi (symbol_ref); adrp x1, foo
4532 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
4533 nop
4534
4535 PIC TLS
4536 adrp x1, :got:foo adrp tmp, :tlsgd:foo
4537 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
4538 bl __tls_get_addr
4539 nop
4540
4541 Load TLS symbol, depending on TLS mechanism and TLS access model.
4542
4543 Global Dynamic - Traditional TLS:
4544 adrp tmp, :tlsgd:imm
4545 add dest, tmp, #:tlsgd_lo12:imm
4546 bl __tls_get_addr
4547
4548 Global Dynamic - TLS Descriptors:
4549 adrp dest, :tlsdesc:imm
4550 ldr tmp, [dest, #:tlsdesc_lo12:imm]
4551 add dest, dest, #:tlsdesc_lo12:imm
4552 blr tmp
4553 mrs tp, tpidr_el0
4554 add dest, dest, tp
4555
4556 Initial Exec:
4557 mrs tp, tpidr_el0
4558 adrp tmp, :gottprel:imm
4559 ldr dest, [tmp, #:gottprel_lo12:imm]
4560 add dest, dest, tp
4561
4562 Local Exec:
4563 mrs tp, tpidr_el0
4564 add t0, tp, #:tprel_hi12:imm, lsl #12
4565 add t0, t0, #:tprel_lo12_nc:imm
4566 */
4567
4568 static void
4569 aarch64_load_symref_appropriately (rtx dest, rtx imm,
4570 enum aarch64_symbol_type type)
4571 {
4572 switch (type)
4573 {
4574 case SYMBOL_SMALL_ABSOLUTE:
4575 {
4576 /* In ILP32, the mode of dest can be either SImode or DImode. */
4577 rtx tmp_reg = dest;
4578 machine_mode mode = GET_MODE (dest);
4579
4580 gcc_assert (mode == Pmode || mode == ptr_mode);
4581
4582 if (can_create_pseudo_p ())
4583 tmp_reg = gen_reg_rtx (mode);
4584
4585 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
4586 emit_insn (gen_add_losym (dest, tmp_reg, imm));
4587 return;
4588 }
4589
4590 case SYMBOL_TINY_ABSOLUTE:
4591 emit_insn (gen_rtx_SET (dest, imm));
4592 return;
4593
4594 case SYMBOL_SMALL_GOT_28K:
4595 {
4596 machine_mode mode = GET_MODE (dest);
4597 rtx gp_rtx = pic_offset_table_rtx;
4598 rtx insn;
4599 rtx mem;
4600
4601 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
4602 here before RTL expansion. Tree IVOPTS generates RTL patterns to
4603 decide rtx costs, in which case pic_offset_table_rtx is not
4604 initialized. In that case there is no need to generate the first
4605 adrp instruction, since the final cost for global variable access
4606 is one instruction. */
4607 if (gp_rtx != NULL)
4608 {
4609 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
4610 we use the page base as the GOT base, the first page may be wasted;
4611 in the worst case there is only 28K of space for the GOT).
4612
4613 The generated instruction sequence for accessing a global variable
4614 is:
4615
4616 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
4617
4618 Only one instruction is needed. But we must initialize
4619 pic_offset_table_rtx properly. We generate an initialization insn
4620 for every global access and allow CSE to remove the redundant ones.
4621
4622 The final instruction sequence will look like the following
4623 for multiple global variable accesses.
4624
4625 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
4626
4627 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
4628 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
4629 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
4630 ... */
4631
4632 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
4633 crtl->uses_pic_offset_table = 1;
4634 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
4635
4636 if (mode != GET_MODE (gp_rtx))
4637 gp_rtx = gen_lowpart (mode, gp_rtx);
4638
4639 }
4640
4641 if (mode == ptr_mode)
4642 {
4643 if (mode == DImode)
4644 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
4645 else
4646 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
4647
4648 mem = XVECEXP (SET_SRC (insn), 0, 0);
4649 }
4650 else
4651 {
4652 gcc_assert (mode == Pmode);
4653
4654 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
4655 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
4656 }
4657
4658 /* The operand is expected to be a MEM. Whenever the related insn
4659 pattern changes, the code above that calculates MEM should be
4660 updated. */
4661 gcc_assert (MEM_P (mem));
4662 MEM_READONLY_P (mem) = 1;
4663 MEM_NOTRAP_P (mem) = 1;
4664 emit_insn (insn);
4665 return;
4666 }
4667
4668 case SYMBOL_SMALL_GOT_4G:
4669 emit_insn (gen_rtx_SET (dest, imm));
4670 return;
4671
4672 case SYMBOL_SMALL_TLSGD:
4673 {
4674 rtx_insn *insns;
4675 /* The return type of __tls_get_addr is the C pointer type
4676 so use ptr_mode. */
4677 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
4678 rtx tmp_reg = dest;
4679
4680 if (GET_MODE (dest) != ptr_mode)
4681 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
4682
4683 start_sequence ();
4684 if (ptr_mode == SImode)
4685 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
4686 else
4687 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
4688 insns = get_insns ();
4689 end_sequence ();
4690
4691 RTL_CONST_CALL_P (insns) = 1;
4692 emit_libcall_block (insns, tmp_reg, result, imm);
4693 /* Convert back to the mode of the dest, adding a zero_extend
4694 from SImode (ptr_mode) to DImode (Pmode). */
4695 if (dest != tmp_reg)
4696 convert_move (dest, tmp_reg, true);
4697 return;
4698 }
4699
4700 case SYMBOL_SMALL_TLSDESC:
4701 {
4702 machine_mode mode = GET_MODE (dest);
4703 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
4704 rtx tp;
4705
4706 gcc_assert (mode == Pmode || mode == ptr_mode);
4707
4708 /* In ILP32, the got entry is always of SImode size. Unlike
4709 small GOT, the dest is fixed at reg 0. */
4710 if (TARGET_ILP32)
4711 emit_insn (gen_tlsdesc_small_si (imm));
4712 else
4713 emit_insn (gen_tlsdesc_small_di (imm));
4714 tp = aarch64_load_tp (NULL);
4715
4716 if (mode != Pmode)
4717 tp = gen_lowpart (mode, tp);
4718
4719 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
4720 if (REG_P (dest))
4721 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4722 return;
4723 }
4724
4725 case SYMBOL_SMALL_TLSIE:
4726 {
4727 /* In ILP32, the mode of dest can be either SImode or DImode,
4728 while the got entry is always of SImode size. The mode of
4729 dest depends on how dest is used: if dest is assigned to a
4730 pointer (e.g. in the memory), it has SImode; it may have
4731 DImode if dest is dereferenced to access the memory.
4732 This is why we have to handle three different tlsie_small
4733 patterns here (two patterns for ILP32). */
4734 machine_mode mode = GET_MODE (dest);
4735 rtx tmp_reg = gen_reg_rtx (mode);
4736 rtx tp = aarch64_load_tp (NULL);
4737
4738 if (mode == ptr_mode)
4739 {
4740 if (mode == DImode)
4741 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
4742 else
4743 {
4744 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
4745 tp = gen_lowpart (mode, tp);
4746 }
4747 }
4748 else
4749 {
4750 gcc_assert (mode == Pmode);
4751 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
4752 }
4753
4754 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
4755 if (REG_P (dest))
4756 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4757 return;
4758 }
4759
4760 case SYMBOL_TLSLE12:
4761 case SYMBOL_TLSLE24:
4762 case SYMBOL_TLSLE32:
4763 case SYMBOL_TLSLE48:
4764 {
4765 machine_mode mode = GET_MODE (dest);
4766 rtx tp = aarch64_load_tp (NULL);
4767
4768 if (mode != Pmode)
4769 tp = gen_lowpart (mode, tp);
4770
4771 switch (type)
4772 {
4773 case SYMBOL_TLSLE12:
4774 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
4775 (dest, tp, imm));
4776 break;
4777 case SYMBOL_TLSLE24:
4778 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
4779 (dest, tp, imm));
4780 break;
4781 case SYMBOL_TLSLE32:
4782 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
4783 (dest, imm));
4784 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4785 (dest, dest, tp));
4786 break;
4787 case SYMBOL_TLSLE48:
4788 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
4789 (dest, imm));
4790 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4791 (dest, dest, tp));
4792 break;
4793 default:
4794 gcc_unreachable ();
4795 }
4796
4797 if (REG_P (dest))
4798 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4799 return;
4800 }
4801
4802 case SYMBOL_TINY_GOT:
4803 {
4804 rtx insn;
4805 machine_mode mode = GET_MODE (dest);
4806
4807 if (mode == ptr_mode)
4808 insn = gen_ldr_got_tiny (mode, dest, imm);
4809 else
4810 {
4811 gcc_assert (mode == Pmode);
4812 insn = gen_ldr_got_tiny_sidi (dest, imm);
4813 }
4814
4815 emit_insn (insn);
4816 return;
4817 }
4818
4819 case SYMBOL_TINY_TLSIE:
4820 {
4821 machine_mode mode = GET_MODE (dest);
4822 rtx tp = aarch64_load_tp (NULL);
4823
4824 if (mode == ptr_mode)
4825 {
4826 if (mode == DImode)
4827 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
4828 else
4829 {
4830 tp = gen_lowpart (mode, tp);
4831 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
4832 }
4833 }
4834 else
4835 {
4836 gcc_assert (mode == Pmode);
4837 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
4838 }
4839
4840 if (REG_P (dest))
4841 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4842 return;
4843 }
4844
4845 default:
4846 gcc_unreachable ();
4847 }
4848 }
4849
4850 /* Emit a move from SRC to DEST. Assume that the move expanders can
4851 handle all moves if !can_create_pseudo_p (). The distinction is
4852 important because, unlike emit_move_insn, the move expanders know
4853 how to force Pmode objects into the constant pool even when the
4854 constant pool address is not itself legitimate. */
4855 static rtx
4856 aarch64_emit_move (rtx dest, rtx src)
4857 {
4858 return (can_create_pseudo_p ()
4859 ? emit_move_insn (dest, src)
4860 : emit_move_insn_1 (dest, src));
4861 }
4862
4863 /* Apply UNOPTAB to OP and store the result in DEST. */
4864
4865 static void
4866 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
4867 {
4868 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
4869 if (dest != tmp)
4870 emit_move_insn (dest, tmp);
4871 }
4872
4873 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
4874
4875 static void
4876 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
4877 {
4878 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
4879 OPTAB_DIRECT);
4880 if (dest != tmp)
4881 emit_move_insn (dest, tmp);
4882 }
4883
4884 /* Split a 128-bit move operation into two 64-bit move operations,
4885 taking care to handle partial overlap of register to register
4886 copies. Special cases are needed when moving between GP regs and
4887 FP regs. SRC can be a register, constant or memory; DST a register
4888 or memory. If either operand is memory it must not have any side
4889 effects. */
4890 void
4891 aarch64_split_128bit_move (rtx dst, rtx src)
4892 {
4893 rtx dst_lo, dst_hi;
4894 rtx src_lo, src_hi;
4895
4896 machine_mode mode = GET_MODE (dst);
4897
4898 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
4899 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
4900 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
4901
4902 if (REG_P (dst) && REG_P (src))
4903 {
4904 int src_regno = REGNO (src);
4905 int dst_regno = REGNO (dst);
4906
4907 /* Handle FP <-> GP regs. */
4908 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
4909 {
4910 src_lo = gen_lowpart (word_mode, src);
4911 src_hi = gen_highpart (word_mode, src);
4912
4913 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
4914 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
4915 return;
4916 }
4917 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
4918 {
4919 dst_lo = gen_lowpart (word_mode, dst);
4920 dst_hi = gen_highpart (word_mode, dst);
4921
4922 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
4923 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
4924 return;
4925 }
4926 }
4927
4928 dst_lo = gen_lowpart (word_mode, dst);
4929 dst_hi = gen_highpart (word_mode, dst);
4930 src_lo = gen_lowpart (word_mode, src);
4931 src_hi = gen_highpart_mode (word_mode, mode, src);
4932
4933 /* At most one pairing may overlap. */
4934 if (reg_overlap_mentioned_p (dst_lo, src_hi))
4935 {
4936 aarch64_emit_move (dst_hi, src_hi);
4937 aarch64_emit_move (dst_lo, src_lo);
4938 }
4939 else
4940 {
4941 aarch64_emit_move (dst_lo, src_lo);
4942 aarch64_emit_move (dst_hi, src_hi);
4943 }
4944 }
4945
4946 /* Return true if we should split a move from 128-bit value SRC
4947 to 128-bit register DEST. */
4948
4949 bool
4950 aarch64_split_128bit_move_p (rtx dst, rtx src)
4951 {
4952 if (FP_REGNUM_P (REGNO (dst)))
4953 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
4954 /* All moves to GPRs need to be split. */
4955 return true;
4956 }
4957
4958 /* Split a complex SIMD move. */
4959
4960 void
4961 aarch64_split_simd_move (rtx dst, rtx src)
4962 {
4963 machine_mode src_mode = GET_MODE (src);
4964 machine_mode dst_mode = GET_MODE (dst);
4965
4966 gcc_assert (VECTOR_MODE_P (dst_mode));
4967
4968 if (REG_P (dst) && REG_P (src))
4969 {
4970 gcc_assert (VECTOR_MODE_P (src_mode));
4971 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
4972 }
4973 }
4974
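/* Return true if the constant X, of mode XMODE, is equal to the
   constant Y, of mode YMODE, zero-extended to XMODE.  */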
4975 bool
4976 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
4977 machine_mode ymode, rtx y)
4978 {
4979 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
4980 gcc_assert (r != NULL);
4981 return rtx_equal_p (x, r);
4982 }
4983
4984 /* Return TARGET if it is nonnull and a register of mode MODE.
4985 Otherwise, return a fresh register of mode MODE if we can,
4986 or TARGET reinterpreted as MODE if we can't. */
4987
4988 static rtx
4989 aarch64_target_reg (rtx target, machine_mode mode)
4990 {
4991 if (target && REG_P (target) && GET_MODE (target) == mode)
4992 return target;
4993 if (!can_create_pseudo_p ())
4994 {
4995 gcc_assert (target);
4996 return gen_lowpart (mode, target);
4997 }
4998 return gen_reg_rtx (mode);
4999 }
5000
5001 /* Return a register that contains the constant in BUILDER, given that
5002 the constant is a legitimate move operand. Use TARGET as the register
5003 if it is nonnull and convenient. */
5004
5005 static rtx
5006 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
5007 {
5008 rtx src = builder.build ();
5009 target = aarch64_target_reg (target, GET_MODE (src));
5010 emit_insn (gen_rtx_SET (target, src));
5011 return target;
5012 }
5013
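/* Return a register that holds VALUE in mode MODE: use a fresh pseudo
   if we can create one, otherwise move VALUE into the existing
   register X and return X.  */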
5014 static rtx
5015 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
5016 {
5017 if (can_create_pseudo_p ())
5018 return force_reg (mode, value);
5019 else
5020 {
5021 gcc_assert (x);
5022 aarch64_emit_move (x, value);
5023 return x;
5024 }
5025 }
5026
5027 /* Return true if predicate value X is a constant in which every element
5028 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
5029 value, i.e. as a predicate in which all bits are significant. */
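/* For example, a VNx4BImode constant of all-true elements is described
   in BUILDER as the repeating VNx16BImode sequence { 1, 0, 0, 0, ... },
   since each VNx4BImode element corresponds to FACTOR == 4 bits of
   VNx16BImode.  */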
5030
5031 static bool
5032 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
5033 {
5034 if (!CONST_VECTOR_P (x))
5035 return false;
5036
5037 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
5038 GET_MODE_NUNITS (GET_MODE (x)));
5039 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
5040 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
5041 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
5042
5043 unsigned int nelts = const_vector_encoded_nelts (x);
5044 for (unsigned int i = 0; i < nelts; ++i)
5045 {
5046 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
5047 if (!CONST_INT_P (elt))
5048 return false;
5049
5050 builder.quick_push (elt);
5051 for (unsigned int j = 1; j < factor; ++j)
5052 builder.quick_push (const0_rtx);
5053 }
5054 builder.finalize ();
5055 return true;
5056 }
5057
5058 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
5059 widest predicate element size it can have (that is, the largest size
5060 for which each element would still be 0 or 1). */
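/* For example, a constant with four patterns whose encoding is
   { 1, 0, 0, 0 } gives MASK == 8 | 4 == 12, so the widest element size
   is 4; any set bit at an odd index forces the result to 1.  */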
5061
5062 unsigned int
5063 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
5064 {
5065 /* Start with the most optimistic assumption: that we only need
5066 one bit per pattern. This is what we will use if only the first
5067 bit in each pattern is ever set. */
5068 unsigned int mask = GET_MODE_SIZE (DImode);
5069 mask |= builder.npatterns ();
5070
5071 /* Look for set bits. */
5072 unsigned int nelts = builder.encoded_nelts ();
5073 for (unsigned int i = 1; i < nelts; ++i)
5074 if (INTVAL (builder.elt (i)) != 0)
5075 {
5076 if (i & 1)
5077 return 1;
5078 mask |= i;
5079 }
5080 return mask & -mask;
5081 }
5082
5083 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
5084 return that predicate mode, otherwise return opt_machine_mode (). */
5085
5086 opt_machine_mode
5087 aarch64_ptrue_all_mode (rtx x)
5088 {
5089 gcc_assert (GET_MODE (x) == VNx16BImode);
5090 if (!CONST_VECTOR_P (x)
5091 || !CONST_VECTOR_DUPLICATE_P (x)
5092 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
5093 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
5094 return opt_machine_mode ();
5095
5096 unsigned int nelts = const_vector_encoded_nelts (x);
5097 for (unsigned int i = 1; i < nelts; ++i)
5098 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
5099 return opt_machine_mode ();
5100
5101 return aarch64_sve_pred_mode (nelts);
5102 }
5103
5104 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
5105 that the constant would have with predicate element size ELT_SIZE
5106 (ignoring the upper bits in each element) and return:
5107
5108 * -1 if all bits are set
5109 * N if the predicate has N leading set bits followed by all clear bits
5110 * 0 if the predicate does not have any of these forms. */
5111
5112 int
5113 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
5114 unsigned int elt_size)
5115 {
5116 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
5117 followed by set bits. */
5118 if (builder.nelts_per_pattern () == 3)
5119 return 0;
5120
5121 /* Skip over leading set bits. */
5122 unsigned int nelts = builder.encoded_nelts ();
5123 unsigned int i = 0;
5124 for (; i < nelts; i += elt_size)
5125 if (INTVAL (builder.elt (i)) == 0)
5126 break;
5127 unsigned int vl = i / elt_size;
5128
5129 /* Check for the all-true case. */
5130 if (i == nelts)
5131 return -1;
5132
5133 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
5134 repeating pattern of set bits followed by clear bits. */
5135 if (builder.nelts_per_pattern () != 2)
5136 return 0;
5137
5138 /* We have a "foreground" value and a duplicated "background" value.
5139 If the background might repeat and the last set bit belongs to it,
5140 we might have set bits followed by clear bits followed by set bits. */
5141 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
5142 return 0;
5143
5144 /* Make sure that the rest are all clear. */
5145 for (; i < nelts; i += elt_size)
5146 if (INTVAL (builder.elt (i)) != 0)
5147 return 0;
5148
5149 return vl;
5150 }
5151
5152 /* See if there is an svpattern that encodes an SVE predicate of mode
5153 PRED_MODE in which the first VL bits are set and the rest are clear.
5154 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
5155 A VL of -1 indicates an all-true vector. */
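/* For example, a VL of 6 gives AARCH64_SV_VL6 and a VL of 32 gives
   AARCH64_SV_VL32; if the predicate mode has a constant 16 elements,
   a VL of 15 gives AARCH64_SV_MUL3.  */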
5156
5157 aarch64_svpattern
5158 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
5159 {
5160 if (vl < 0)
5161 return AARCH64_SV_ALL;
5162
5163 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
5164 return AARCH64_NUM_SVPATTERNS;
5165
5166 if (vl >= 1 && vl <= 8)
5167 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
5168
5169 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
5170 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
5171
5172 int max_vl;
5173 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
5174 {
5175 if (vl == (max_vl / 3) * 3)
5176 return AARCH64_SV_MUL3;
5177 /* These would only trigger for non-power-of-2 lengths. */
5178 if (vl == (max_vl & -4))
5179 return AARCH64_SV_MUL4;
5180 if (vl == (1 << floor_log2 (max_vl)))
5181 return AARCH64_SV_POW2;
5182 if (vl == max_vl)
5183 return AARCH64_SV_ALL;
5184 }
5185 return AARCH64_NUM_SVPATTERNS;
5186 }
5187
5188 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
5189 bits has the lowest bit set and the upper bits clear. This is the
5190 VNx16BImode equivalent of a PTRUE for controlling elements of
5191 ELT_SIZE bytes. However, because the constant is VNx16BImode,
5192 all bits are significant, even the upper zeros. */
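/* For example, aarch64_ptrue_all (4) builds the repeating sequence
   { 1, 0, 0, 0, ... }, the VNx16BImode equivalent of a PTRUE for
   4-byte (.S) elements.  */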
5193
5194 rtx
5195 aarch64_ptrue_all (unsigned int elt_size)
5196 {
5197 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
5198 builder.quick_push (const1_rtx);
5199 for (unsigned int i = 1; i < elt_size; ++i)
5200 builder.quick_push (const0_rtx);
5201 return builder.build ();
5202 }
5203
5204 /* Return an all-true predicate register of mode MODE. */
5205
5206 rtx
5207 aarch64_ptrue_reg (machine_mode mode)
5208 {
5209 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5210 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
5211 return gen_lowpart (mode, reg);
5212 }
5213
5214 /* Return an all-false predicate register of mode MODE. */
5215
5216 rtx
5217 aarch64_pfalse_reg (machine_mode mode)
5218 {
5219 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5220 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
5221 return gen_lowpart (mode, reg);
5222 }
5223
5224 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
5225 for it. PRED2[0] is the predicate for the instruction whose result
5226 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
5227 for it. Return true if we can prove that the two predicates are
5228 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
5229 with PRED1[0] without changing behavior. */
5230
5231 bool
5232 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
5233 {
5234 machine_mode mode = GET_MODE (pred1[0]);
5235 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
5236 && mode == GET_MODE (pred2[0])
5237 && aarch64_sve_ptrue_flag (pred1[1], SImode)
5238 && aarch64_sve_ptrue_flag (pred2[1], SImode));
5239
5240 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
5241 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
5242 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
5243 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
5244 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
5245 }
5246
5247 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
5248 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
5249 Use TARGET as the target register if nonnull and convenient. */
5250
5251 static rtx
5252 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
5253 machine_mode data_mode, rtx op1, rtx op2)
5254 {
5255 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
5256 expand_operand ops[5];
5257 create_output_operand (&ops[0], target, pred_mode);
5258 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
5259 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
5260 create_input_operand (&ops[3], op1, data_mode);
5261 create_input_operand (&ops[4], op2, data_mode);
5262 expand_insn (icode, 5, ops);
5263 return ops[0].value;
5264 }
5265
5266 /* Use a comparison to convert integer vector SRC into MODE, which is
5267 the corresponding SVE predicate mode. Use TARGET for the result
5268 if it's nonnull and convenient. */
5269
5270 rtx
5271 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
5272 {
5273 machine_mode src_mode = GET_MODE (src);
5274 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
5275 src, CONST0_RTX (src_mode));
5276 }
5277
5278 /* Return the assembly token for svprfop value PRFOP. */
5279
5280 static const char *
5281 svprfop_token (enum aarch64_svprfop prfop)
5282 {
5283 switch (prfop)
5284 {
5285 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
5286 AARCH64_FOR_SVPRFOP (CASE)
5287 #undef CASE
5288 case AARCH64_NUM_SVPRFOPS:
5289 break;
5290 }
5291 gcc_unreachable ();
5292 }
5293
5294 /* Return the assembly string for an SVE prefetch operation with
5295 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
5296 and that SUFFIX is the format for the remaining operands. */
5297
5298 char *
5299 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
5300 const char *suffix)
5301 {
5302 static char buffer[128];
5303 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
5304 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
5305 mnemonic, svprfop_token (prfop), suffix);
5306 gcc_assert (written < sizeof (buffer));
5307 return buffer;
5308 }
5309
5310 /* Check whether we can calculate the number of elements in PATTERN
5311 at compile time, given that there are NELTS_PER_VQ elements per
5312 128-bit block. Return the value if so, otherwise return -1. */
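/* For example, if the vector length is fixed at 256 bits (so
   aarch64_sve_vg is the constant 4) and NELTS_PER_VQ is 4, there are
   8 elements in total: AARCH64_SV_MUL3 folds to 6, AARCH64_SV_POW2 and
   AARCH64_SV_ALL fold to 8, and AARCH64_SV_VL16 folds to 0 because it
   requests more elements than are available.  */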
5313
5314 HOST_WIDE_INT
5315 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
5316 {
5317 unsigned int vl, const_vg;
5318 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
5319 vl = 1 + (pattern - AARCH64_SV_VL1);
5320 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
5321 vl = 16 << (pattern - AARCH64_SV_VL16);
5322 else if (aarch64_sve_vg.is_constant (&const_vg))
5323 {
5324 /* There are two vector granules per quadword. */
5325 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
5326 switch (pattern)
5327 {
5328 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
5329 case AARCH64_SV_MUL4: return nelts & -4;
5330 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
5331 case AARCH64_SV_ALL: return nelts;
5332 default: gcc_unreachable ();
5333 }
5334 }
5335 else
5336 return -1;
5337
5338 /* There are two vector granules per quadword. */
5339 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
5340 if (known_le (vl, nelts_all))
5341 return vl;
5342
5343 /* Requesting more elements than are available results in a PFALSE. */
5344 if (known_gt (vl, nelts_all))
5345 return 0;
5346
5347 return -1;
5348 }
5349
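/* Worked example, added for illustration (values checked by hand): with the
   vector length fixed at 256 bits, aarch64_sve_vg is the constant 4, so for
   .S elements (NELTS_PER_VQ == 4) there are (4 / 2) * 4 == 8 elements in
   total.  POW2 and ALL then fold to 8, MUL4 to 8, MUL3 to 6, VL7 to 7, and
   VL16 to 0 because it asks for more elements than are available.  */
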
5350 /* Return true if we can move VALUE into a register using a single
5351 CNT[BHWD] instruction. */
5352
5353 static bool
5354 aarch64_sve_cnt_immediate_p (poly_int64 value)
5355 {
5356 HOST_WIDE_INT factor = value.coeffs[0];
5357 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
5358 return (value.coeffs[1] == factor
5359 && IN_RANGE (factor, 2, 16 * 16)
5360 && (factor & 1) == 0
5361 && factor <= 16 * (factor & -factor));
5362 }
5363
5364 /* Likewise for rtx X. */
5365
5366 bool
5367 aarch64_sve_cnt_immediate_p (rtx x)
5368 {
5369 poly_int64 value;
5370 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
5371 }
5372
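/* Some hand-checked examples of the conditions above: (2, 2), (6, 6) and
   (32, 32) are accepted and correspond to CNTD, "CNTD ..., ALL, MUL #3" and
   "CNTB ..., ALL, MUL #2" respectively.  (3, 3) is rejected because the
   coefficient is odd, (136, 136) because it would need a multiplier of 17,
   and (4, 2) because the two coefficients differ.  */
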
5373 /* Return the asm string for an instruction with a CNT-like vector size
5374 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5375 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5376 first part of the operands template (the part that comes before the
5377 vector size itself). PATTERN is the pattern to use. FACTOR is the
5378 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
5379 in each quadword. If it is zero, we can use any element size. */
5380
5381 static char *
5382 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5383 aarch64_svpattern pattern,
5384 unsigned int factor,
5385 unsigned int nelts_per_vq)
5386 {
5387 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
5388
5389 if (nelts_per_vq == 0)
5390 /* There is some overlap in the ranges of the four CNT instructions.
5391 Here we always use the smallest possible element size, so that the
5392 multiplier is 1 wherever possible. */
5393 nelts_per_vq = factor & -factor;
5394 int shift = std::min (exact_log2 (nelts_per_vq), 4);
5395 gcc_assert (IN_RANGE (shift, 1, 4));
5396 char suffix = "dwhb"[shift - 1];
5397
5398 factor >>= shift;
5399 unsigned int written;
5400 if (pattern == AARCH64_SV_ALL && factor == 1)
5401 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
5402 prefix, suffix, operands);
5403 else if (factor == 1)
5404 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
5405 prefix, suffix, operands, svpattern_token (pattern));
5406 else
5407 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
5408 prefix, suffix, operands, svpattern_token (pattern),
5409 factor);
5410 gcc_assert (written < sizeof (buffer));
5411 return buffer;
5412 }
5413
5414 /* Return the asm string for an instruction with a CNT-like vector size
5415 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5416 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5417 first part of the operands template (the part that comes before the
5418 vector size itself). X is the value of the vector size operand,
5419 as a polynomial integer rtx; we need to convert this into an "all"
5420 pattern with a multiplier. */
5421
5422 char *
5423 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5424 rtx x)
5425 {
5426 poly_int64 value = rtx_to_poly_int64 (x);
5427 gcc_assert (aarch64_sve_cnt_immediate_p (value));
5428 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
5429 value.coeffs[1], 0);
5430 }
5431
5432 /* Return the asm string for an instruction with a CNT-like vector size
5433 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5434 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5435 first part of the operands template (the part that comes before the
5436 vector size itself). CNT_PAT[0..2] are the operands of the
5437 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
5438
5439 char *
5440 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
5441 const char *operands, rtx *cnt_pat)
5442 {
5443 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
5444 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
5445 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
5446 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
5447 factor, nelts_per_vq);
5448 }
5449
5450 /* Return true if we can add X using a single SVE INC or DEC instruction. */
5451
5452 bool
5453 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
5454 {
5455 poly_int64 value;
5456 return (poly_int_rtx_p (x, &value)
5457 && (aarch64_sve_cnt_immediate_p (value)
5458 || aarch64_sve_cnt_immediate_p (-value)));
5459 }
5460
5461 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
5462 operand 0. */
5463
5464 char *
5465 aarch64_output_sve_scalar_inc_dec (rtx offset)
5466 {
5467 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5468 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
5469 if (offset_value.coeffs[1] > 0)
5470 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
5471 offset_value.coeffs[1], 0);
5472 else
5473 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
5474 -offset_value.coeffs[1], 0);
5475 }
5476
5477 /* Return true if we can add VALUE to a register using a single ADDVL
5478 or ADDPL instruction. */
5479
5480 static bool
5481 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
5482 {
5483 HOST_WIDE_INT factor = value.coeffs[0];
5484 if (factor == 0 || value.coeffs[1] != factor)
5485 return false;
5486 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
5487 and a value of 16 is one vector width. */
5488 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
5489 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
5490 }
5491
5492 /* Likewise for rtx X. */
5493
5494 bool
5495 aarch64_sve_addvl_addpl_immediate_p (rtx x)
5496 {
5497 poly_int64 value;
5498 return (poly_int_rtx_p (x, &value)
5499 && aarch64_sve_addvl_addpl_immediate_p (value));
5500 }
5501
5502 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
5503 to operand 1 and storing the result in operand 0. */
5504
5505 char *
5506 aarch64_output_sve_addvl_addpl (rtx offset)
5507 {
5508 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
5509 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5510 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
5511
5512 int factor = offset_value.coeffs[1];
5513 if ((factor & 15) == 0)
5514 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
5515 else
5516 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
5517 return buffer;
5518 }
5519
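/* Worked examples, added for illustration: an offset of (32, 32) (two vector
   lengths in bytes) has FACTOR == 32 and is output as "addvl\t%x0, %x1, #2",
   while (6, 6) has FACTOR == 6 and is output as "addpl\t%x0, %x1, #3".
   (528, 528) is rejected because the ADDVL multiplier would be 33, outside
   the range [-32, 31].  */
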
5520 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5521 instruction. If it is, store the number of elements in each vector
5522 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
5523 factor in *FACTOR_OUT (if nonnull). */
5524
5525 bool
5526 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
5527 unsigned int *nelts_per_vq_out)
5528 {
5529 rtx elt;
5530 poly_int64 value;
5531
5532 if (!const_vec_duplicate_p (x, &elt)
5533 || !poly_int_rtx_p (elt, &value))
5534 return false;
5535
5536 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
5537 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
5538 /* There's no vector INCB. */
5539 return false;
5540
5541 HOST_WIDE_INT factor = value.coeffs[0];
5542 if (value.coeffs[1] != factor)
5543 return false;
5544
5545 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
5546 if ((factor % nelts_per_vq) != 0
5547 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
5548 return false;
5549
5550 if (factor_out)
5551 *factor_out = factor;
5552 if (nelts_per_vq_out)
5553 *nelts_per_vq_out = nelts_per_vq;
5554 return true;
5555 }
5556
5557 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5558 instruction. */
5559
5560 bool
5561 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
5562 {
5563 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
5564 }
5565
5566 /* Return the asm template for an SVE vector INC or DEC instruction.
5567 OPERANDS gives the operands before the vector count and X is the
5568 value of the vector count operand itself. */
5569
5570 char *
5571 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
5572 {
5573 int factor;
5574 unsigned int nelts_per_vq;
5575 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
5576 gcc_unreachable ();
5577 if (factor < 0)
5578 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
5579 -factor, nelts_per_vq);
5580 else
5581 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
5582 factor, nelts_per_vq);
5583 }
5584
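/* Worked example, added for illustration: a VNx8HI constant that duplicates
   the poly_int64 (24, 24) passes the checks above with NELTS_PER_VQ == 8 and
   FACTOR == 24, and is output as something like "inch\tz0.h, all, mul #3";
   the negated constant uses "dech" instead.  */
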
5585 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5586
5587 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5588 {
5589 0x0000000100000001ull,
5590 0x0001000100010001ull,
5591 0x0101010101010101ull,
5592 0x1111111111111111ull,
5593 0x5555555555555555ull,
5594 };
5595
5596
5597
5598 /* Return true if 64-bit VAL is a valid bitmask immediate. */
5599 static bool
5600 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
5601 {
5602 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
5603 int bits;
5604
5605 /* Check for a single sequence of one bits and return quickly if so.
5606 The special cases of all ones and all zeroes return false. */
5607 tmp = val + (val & -val);
5608
5609 if (tmp == (tmp & -tmp))
5610 return (val + 1) > 1;
5611
5612 /* Invert if the immediate doesn't start with a zero bit - this means we
5613 only need to search for sequences of one bits. */
5614 if (val & 1)
5615 val = ~val;
5616
5617 /* Find the first set bit and set tmp to val with the first sequence of one
5618 bits removed. Return success if there is a single sequence of ones. */
5619 first_one = val & -val;
5620 tmp = val & (val + first_one);
5621
5622 if (tmp == 0)
5623 return true;
5624
5625 /* Find the next set bit and compute the difference in bit position. */
5626 next_one = tmp & -tmp;
5627 bits = clz_hwi (first_one) - clz_hwi (next_one);
5628 mask = val ^ tmp;
5629
5630 /* Check the bit position difference is a power of 2, and that the first
5631 sequence of one bits fits within 'bits' bits. */
5632 if ((mask >> bits) != 0 || bits != (bits & -bits))
5633 return false;
5634
5635 /* Check the sequence of one bits is repeated 64/bits times. */
5636 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5637 }
5638
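/* Hand-checked examples for the function above: 0x0000000000fff000 is
   accepted by the single-sequence fast path, 0x00ff00ff00ff00ff is accepted
   as a 16-bit pattern repeated four times, and 0x0000000000001234 is
   rejected because its set bits do not form a single (rotated) run.
   All-zeros and all-ones are rejected, as the comment above notes.  */
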
5639
5640 /* Return true if VAL is a valid bitmask immediate for MODE. */
5641 bool
5642 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5643 {
5644 if (mode == DImode)
5645 return aarch64_bitmask_imm (val);
5646
5647 if (mode == SImode)
5648 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
5649
5650 /* Replicate small immediates to fit 64 bits. */
5651 int size = GET_MODE_UNIT_PRECISION (mode);
5652 val &= (HOST_WIDE_INT_1U << size) - 1;
5653 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
5654
5655 return aarch64_bitmask_imm (val);
5656 }
5657
5658
5659 /* Return true if the immediate VAL can be a bitmask immediate
5660 by changing the given MASK bits in VAL to zeroes, ones or bits
5661 from the other half of VAL. Return the new immediate in VAL2. */
5662 static inline bool
5663 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
5664 unsigned HOST_WIDE_INT &val2,
5665 unsigned HOST_WIDE_INT mask)
5666 {
5667 val2 = val & ~mask;
5668 if (val2 != val && aarch64_bitmask_imm (val2))
5669 return true;
5670 val2 = val | mask;
5671 if (val2 != val && aarch64_bitmask_imm (val2))
5672 return true;
5673 val = val & ~mask;
5674 val2 = val | (((val >> 32) | (val << 32)) & mask);
5675 if (val2 != val && aarch64_bitmask_imm (val2))
5676 return true;
5677 val2 = val | (((val >> 16) | (val << 48)) & mask);
5678 if (val2 != val && aarch64_bitmask_imm (val2))
5679 return true;
5680 return false;
5681 }
5682
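/* Worked example, added for illustration: for VAL == 0x0fff00000fff1234 and
   MASK == 0xffff, clearing the masked bits gives 0x0fff00000fff0000, a valid
   bitmask immediate (a 12-bit run of ones repeated in each 32-bit half), so
   the function returns true with that value in VAL2.  The caller below can
   then materialise VAL as a bitmask MOV followed by one MOVK of 0x1234.  */
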
5683
5684 /* Return true if VAL is a valid MOVZ immediate. */
5685 static inline bool
5686 aarch64_is_movz (unsigned HOST_WIDE_INT val)
5687 {
5688 return (val >> (ctz_hwi (val) & 48)) < 65536;
5689 }
5690
5691
5692 /* Return true if immediate VAL can be created by a 64-bit MOVZ, MOVN or ORR (bitmask) immediate. */
5693 bool
5694 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
5695 {
5696 return aarch64_is_movz (val) || aarch64_is_movz (~val)
5697 || aarch64_bitmask_imm (val);
5698 }
5699
5700
5701 /* Return true if VAL is an immediate that can be created by a single
5702 MOV instruction. */
5703 bool
5704 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5705 {
5706 gcc_assert (mode == SImode || mode == DImode);
5707
5708 if (val < 65536)
5709 return true;
5710
5711 unsigned HOST_WIDE_INT mask =
5712 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
5713
5714 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
5715 return true;
5716
5717 val = (val & mask) | ((val << 32) & ~mask);
5718 return aarch64_bitmask_imm (val);
5719 }
5720
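/* Hand-checked examples for DImode: 0x0000000012340000 is accepted via MOVZ
   (16 significant bits, 16-bit aligned), 0xffffffffffff1234 via MOVN (its
   inverse is 0xedcb) and 0x5555555555555555 via a bitmask move, whereas
   0x0000000012345678 is rejected and needs two instructions.  */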
5721
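/* Return the number of instructions required to load immediate IMM of mode
   MODE (SImode or DImode) into DEST, and emit them if GENERATE is true.
   DEST is only used when GENERATE is true and may be null otherwise.  */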
5722 static int
5723 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
5724 machine_mode mode)
5725 {
5726 int i;
5727 unsigned HOST_WIDE_INT val, val2, mask;
5728 int one_match, zero_match;
5729 int num_insns;
5730
5731 gcc_assert (mode == SImode || mode == DImode);
5732
5733 val = INTVAL (imm);
5734
5735 if (aarch64_move_imm (val, mode))
5736 {
5737 if (generate)
5738 emit_insn (gen_rtx_SET (dest, imm));
5739 return 1;
5740 }
5741
5742 if ((val >> 32) == 0 || mode == SImode)
5743 {
5744 if (generate)
5745 {
5746 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
5747 if (mode == SImode)
5748 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
5749 GEN_INT ((val >> 16) & 0xffff)));
5750 else
5751 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
5752 GEN_INT ((val >> 16) & 0xffff)));
5753 }
5754 return 2;
5755 }
5756
5757 /* Remaining cases are all for DImode. */
5758
5759 mask = 0xffff;
5760 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
5761 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
5762 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
5763 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
5764
5765 /* Try a bitmask immediate and a movk to generate the immediate
5766 in 2 instructions. */
5767
5768 if (zero_match < 2 && one_match < 2)
5769 {
5770 for (i = 0; i < 64; i += 16)
5771 {
5772 if (aarch64_check_bitmask (val, val2, mask << i))
5773 break;
5774
5775 val2 = val & ~(mask << i);
5776 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
5777 break;
5778 }
5779
5780 if (i != 64)
5781 {
5782 if (generate)
5783 {
5784 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5785 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5786 GEN_INT ((val >> i) & 0xffff)));
5787 }
5788 return 2;
5789 }
5790 }
5791
5792 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
5793 if (zero_match + one_match == 0)
5794 {
5795 for (i = 0; i < 48; i += 16)
5796 for (int j = i + 16; j < 64; j += 16)
5797 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
5798 {
5799 if (generate)
5800 {
5801 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5802 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5803 GEN_INT ((val >> i) & 0xffff)));
5804 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
5805 GEN_INT ((val >> j) & 0xffff)));
5806 }
5807 return 3;
5808 }
5809 }
5810
5811 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
5812 are emitted by the initial mov. If one_match > zero_match, skip set bits,
5813 otherwise skip zero bits. */
5814
5815 num_insns = 1;
5816 mask = 0xffff;
5817 val2 = one_match > zero_match ? ~val : val;
5818 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
5819
5820 if (generate)
5821 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
5822 ? (val | ~(mask << i))
5823 : (val & (mask << i)))));
5824 for (i += 16; i < 64; i += 16)
5825 {
5826 if ((val2 & (mask << i)) == 0)
5827 continue;
5828 if (generate)
5829 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5830 GEN_INT ((val >> i) & 0xffff)));
5831 num_insns ++;
5832 }
5833
5834 return num_insns;
5835 }
5836
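/* Rough cost summary, derived by hand from the code above: constants
   accepted by aarch64_move_imm take 1 instruction, constants with at most
   32 significant bits take at most 2 (MOV plus MOVK), and the worst case
   for an arbitrary 64-bit constant is 4 (a MOV of one 16-bit chunk followed
   by three MOVKs).  */
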
5837 /* Return whether imm is a 128-bit immediate which is simple enough to
5838 expand inline. */
5839 bool
5840 aarch64_mov128_immediate (rtx imm)
5841 {
5842 if (CONST_INT_P (imm))
5843 return true;
5844
5845 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
5846
5847 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
5848 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
5849
5850 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
5851 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
5852 }
5853
5854
5855 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5856 a left shift of 0 or 12 bits. */
5857 bool
5858 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
5859 {
5860 return val < 4096 || (val & 0xfff000) == val;
5861 }
5862
5863 /* Return the largest value no greater than VAL that can be encoded as a 12-bit
5864 unsigned immediate with a left shift of 0 or 12. */
5865 static HOST_WIDE_INT
5866 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
5867 {
5868 /* Check to see if the value fits in 24 bits, as that is the maximum we can
5869 handle correctly. */
5870 gcc_assert (val < 0x1000000);
5871
5872 if (val < 4096)
5873 return val;
5874
5875 return val & 0xfff000;
5876 }
5877
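/* For example: 0x800 and 0x123000 are valid as-is, 0x123456 is clamped to
   0x123000, and values of 0x1000000 or above trip the assertion because
   they cannot be handled here.  */
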
5878
5879 /* Test whether:
5880
5881 X = (X & AND_VAL) | IOR_VAL;
5882
5883 can be implemented using:
5884
5885 MOVK X, #(IOR_VAL >> shift), LSL #shift
5886
5887 Return the shift if so, otherwise return -1. */
5888 int
5889 aarch64_movk_shift (const wide_int_ref &and_val,
5890 const wide_int_ref &ior_val)
5891 {
5892 unsigned int precision = and_val.get_precision ();
5893 unsigned HOST_WIDE_INT mask = 0xffff;
5894 for (unsigned int shift = 0; shift < precision; shift += 16)
5895 {
5896 if (and_val == ~mask && (ior_val & mask) == ior_val)
5897 return shift;
5898 mask <<= 16;
5899 }
5900 return -1;
5901 }
5902
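/* Worked example, added for illustration: AND_VAL == 0xffffffff0000ffff with
   IOR_VAL == 0x12340000 describes X = (X & ~0xffff0000) | 0x12340000, which
   matches "MOVK X, #0x1234, LSL #16", so the function returns 16.  */
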
5903 /* Create a mask of ones covering the range from the lowest set bit to the
5904 highest set bit of VAL_IN. Assumed precondition: VAL_IN is not zero. */
5905
5906 unsigned HOST_WIDE_INT
5907 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5908 {
5909 int lowest_bit_set = ctz_hwi (val_in);
5910 int highest_bit_set = floor_log2 (val_in);
5911 gcc_assert (val_in != 0);
5912
5913 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5914 (HOST_WIDE_INT_1U << lowest_bit_set));
5915 }
5916
5917 /* Create a constant in which every bit outside the range from the lowest set
5918 bit to the highest set bit of VAL_IN is set to 1. */
5919
5920 unsigned HOST_WIDE_INT
5921 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5922 {
5923 return val_in | ~aarch64_and_split_imm1 (val_in);
5924 }
5925
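/* Worked example, added for illustration: for VAL_IN == 0xff0000ff00000000
   the two functions above give 0xffffffff00000000 and 0xff0000ffffffffff.
   Both are valid bitmask immediates and their AND equals VAL_IN, so an AND
   with VAL_IN can be split into two AND-immediate instructions.  */
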
5926 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5927
5928 bool
5929 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5930 {
5931 scalar_int_mode int_mode;
5932 if (!is_a <scalar_int_mode> (mode, &int_mode))
5933 return false;
5934
5935 if (aarch64_bitmask_imm (val_in, int_mode))
5936 return false;
5937
5938 if (aarch64_move_imm (val_in, int_mode))
5939 return false;
5940
5941 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5942
5943 return aarch64_bitmask_imm (imm2, int_mode);
5944 }
5945
5946 /* Return the number of temporary registers that aarch64_add_offset_1
5947 would need to add OFFSET to a register. */
5948
5949 static unsigned int
5950 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
5951 {
5952 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
5953 }
5954
5955 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
5956 a non-polynomial OFFSET. MODE is the mode of the addition.
5957 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5958 be set and CFA adjustments added to the generated instructions.
5959
5960 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5961 temporary if register allocation is already complete. This temporary
5962 register may overlap DEST but must not overlap SRC. If TEMP1 is known
5963 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
5964 the immediate again.
5965
5966 Since this function may be used to adjust the stack pointer, we must
5967 ensure that it cannot cause transient stack deallocation (for example
5968 by first incrementing SP and then decrementing when adjusting by a
5969 large immediate). */
5970
5971 static void
5972 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
5973 rtx src, HOST_WIDE_INT offset, rtx temp1,
5974 bool frame_related_p, bool emit_move_imm)
5975 {
5976 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5977 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5978
5979 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
5980 rtx_insn *insn;
5981
5982 if (!moffset)
5983 {
5984 if (!rtx_equal_p (dest, src))
5985 {
5986 insn = emit_insn (gen_rtx_SET (dest, src));
5987 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5988 }
5989 return;
5990 }
5991
5992 /* Single instruction adjustment. */
5993 if (aarch64_uimm12_shift (moffset))
5994 {
5995 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
5996 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5997 return;
5998 }
5999
6000 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
6001 and either:
6002
6003 a) the offset cannot be loaded by a 16-bit move or
6004 b) there is no spare register into which we can move it. */
6005 if (moffset < 0x1000000
6006 && ((!temp1 && !can_create_pseudo_p ())
6007 || !aarch64_move_imm (moffset, mode)))
6008 {
6009 HOST_WIDE_INT low_off = moffset & 0xfff;
6010
6011 low_off = offset < 0 ? -low_off : low_off;
6012 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
6013 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6014 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
6015 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6016 return;
6017 }
6018
6019 /* Emit a move immediate if required and an addition/subtraction. */
6020 if (emit_move_imm)
6021 {
6022 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
6023 temp1 = aarch64_force_temporary (mode, temp1,
6024 gen_int_mode (moffset, mode));
6025 }
6026 insn = emit_insn (offset < 0
6027 ? gen_sub3_insn (dest, src, temp1)
6028 : gen_add3_insn (dest, src, temp1));
6029 if (frame_related_p)
6030 {
6031 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6032 rtx adj = plus_constant (mode, src, offset);
6033 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
6034 }
6035 }
6036
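/* Worked example, added for illustration: adding 0x123456 with no spare
   temporary uses the two-addition path above ("add dest, src, #0x456" then
   "add dest, dest, #0x123000"); adding 0x800 or 0x123000 needs a single ADD;
   and an offset of 0x1000000 or more needs a temporary register (or a fresh
   pseudo before register allocation) to hold the immediate.  */
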
6037 /* Return the number of temporary registers that aarch64_add_offset
6038 would need to move OFFSET into a register or add OFFSET to a register;
6039 ADD_P is true if we want the latter rather than the former. */
6040
6041 static unsigned int
6042 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
6043 {
6044 /* This follows the same structure as aarch64_add_offset. */
6045 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
6046 return 0;
6047
6048 unsigned int count = 0;
6049 HOST_WIDE_INT factor = offset.coeffs[1];
6050 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
6051 poly_int64 poly_offset (factor, factor);
6052 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
6053 /* Need one register for the ADDVL/ADDPL result. */
6054 count += 1;
6055 else if (factor != 0)
6056 {
6057 factor = abs (factor);
6058 if (factor > 16 * (factor & -factor))
6059 /* Need one register for the CNT result and one for the multiplication
6060 factor. If necessary, the second temporary can be reused for the
6061 constant part of the offset. */
6062 return 2;
6063 /* Need one register for the CNT result (which might then
6064 be shifted). */
6065 count += 1;
6066 }
6067 return count + aarch64_add_offset_1_temporaries (constant);
6068 }
6069
6070 /* If X can be represented as a poly_int64, return the number
6071 of temporaries that are required to add it to a register.
6072 Return -1 otherwise. */
6073
6074 int
6075 aarch64_add_offset_temporaries (rtx x)
6076 {
6077 poly_int64 offset;
6078 if (!poly_int_rtx_p (x, &offset))
6079 return -1;
6080 return aarch64_offset_temporaries (true, offset);
6081 }
6082
6083 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
6084 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
6085 be set and CFA adjustments added to the generated instructions.
6086
6087 TEMP1, if nonnull, is a register of mode MODE that can be used as a
6088 temporary if register allocation is already complete. This temporary
6089 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
6090 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
6091 false to avoid emitting the immediate again.
6092
6093 TEMP2, if nonnull, is a second temporary register that doesn't
6094 overlap either DEST or SRC.
6095
6096 Since this function may be used to adjust the stack pointer, we must
6097 ensure that it cannot cause transient stack deallocation (for example
6098 by first incrementing SP and then decrementing when adjusting by a
6099 large immediate). */
6100
6101 static void
6102 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
6103 poly_int64 offset, rtx temp1, rtx temp2,
6104 bool frame_related_p, bool emit_move_imm = true)
6105 {
6106 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
6107 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
6108 gcc_assert (temp1 == NULL_RTX
6109 || !frame_related_p
6110 || !reg_overlap_mentioned_p (temp1, dest));
6111 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
6112
6113 /* Try using ADDVL or ADDPL to add the whole value. */
6114 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
6115 {
6116 rtx offset_rtx = gen_int_mode (offset, mode);
6117 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
6118 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6119 return;
6120 }
6121
6122 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
6123 SVE vector register, over and above the minimum size of 128 bits.
6124 This is equivalent to half the value returned by CNTD with a
6125 vector shape of ALL. */
6126 HOST_WIDE_INT factor = offset.coeffs[1];
6127 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
6128
6129 /* Try using ADDVL or ADDPL to add the VG-based part. */
6130 poly_int64 poly_offset (factor, factor);
6131 if (src != const0_rtx
6132 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
6133 {
6134 rtx offset_rtx = gen_int_mode (poly_offset, mode);
6135 if (frame_related_p)
6136 {
6137 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
6138 RTX_FRAME_RELATED_P (insn) = true;
6139 src = dest;
6140 }
6141 else
6142 {
6143 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
6144 src = aarch64_force_temporary (mode, temp1, addr);
6145 temp1 = temp2;
6146 temp2 = NULL_RTX;
6147 }
6148 }
6149 /* Otherwise use a CNT-based sequence. */
6150 else if (factor != 0)
6151 {
6152 /* Use a subtraction if we have a negative factor. */
6153 rtx_code code = PLUS;
6154 if (factor < 0)
6155 {
6156 factor = -factor;
6157 code = MINUS;
6158 }
6159
6160 /* Calculate CNTD * FACTOR / 2. First try to fold the division
6161 into the multiplication. */
6162 rtx val;
6163 int shift = 0;
6164 if (factor & 1)
6165 /* Use a right shift by 1. */
6166 shift = -1;
6167 else
6168 factor /= 2;
6169 HOST_WIDE_INT low_bit = factor & -factor;
6170 if (factor <= 16 * low_bit)
6171 {
6172 if (factor > 16 * 8)
6173 {
6174 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
6175 the value with the minimum multiplier and shift it into
6176 position. */
6177 int extra_shift = exact_log2 (low_bit);
6178 shift += extra_shift;
6179 factor >>= extra_shift;
6180 }
6181 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
6182 }
6183 else
6184 {
6185 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
6186 directly, since that should increase the chances of being
6187 able to use a shift and add sequence. If LOW_BIT itself
6188 is out of range, just use CNTD. */
6189 if (low_bit <= 16 * 8)
6190 factor /= low_bit;
6191 else
6192 low_bit = 1;
6193
6194 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
6195 val = aarch64_force_temporary (mode, temp1, val);
6196
6197 if (can_create_pseudo_p ())
6198 {
6199 rtx coeff1 = gen_int_mode (factor, mode);
6200 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
6201 }
6202 else
6203 {
6204 /* Go back to using a negative multiplication factor if we have
6205 no register from which to subtract. */
6206 if (code == MINUS && src == const0_rtx)
6207 {
6208 factor = -factor;
6209 code = PLUS;
6210 }
6211 rtx coeff1 = gen_int_mode (factor, mode);
6212 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
6213 val = gen_rtx_MULT (mode, val, coeff1);
6214 }
6215 }
6216
6217 if (shift > 0)
6218 {
6219 /* Multiply by 1 << SHIFT. */
6220 val = aarch64_force_temporary (mode, temp1, val);
6221 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
6222 }
6223 else if (shift == -1)
6224 {
6225 /* Divide by 2. */
6226 val = aarch64_force_temporary (mode, temp1, val);
6227 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
6228 }
6229
6230 /* Calculate SRC +/- CNTD * FACTOR / 2. */
6231 if (src != const0_rtx)
6232 {
6233 val = aarch64_force_temporary (mode, temp1, val);
6234 val = gen_rtx_fmt_ee (code, mode, src, val);
6235 }
6236 else if (code == MINUS)
6237 {
6238 val = aarch64_force_temporary (mode, temp1, val);
6239 val = gen_rtx_NEG (mode, val);
6240 }
6241
6242 if (constant == 0 || frame_related_p)
6243 {
6244 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
6245 if (frame_related_p)
6246 {
6247 RTX_FRAME_RELATED_P (insn) = true;
6248 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6249 gen_rtx_SET (dest, plus_constant (Pmode, src,
6250 poly_offset)));
6251 }
6252 src = dest;
6253 if (constant == 0)
6254 return;
6255 }
6256 else
6257 {
6258 src = aarch64_force_temporary (mode, temp1, val);
6259 temp1 = temp2;
6260 temp2 = NULL_RTX;
6261 }
6262
6263 emit_move_imm = true;
6264 }
6265
6266 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
6267 frame_related_p, emit_move_imm);
6268 }
6269
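/* Worked example, added for illustration: an offset of (80, 64) (four vector
   lengths plus 16 bytes) splits into FACTOR == 64 and CONSTANT == 16, so the
   VG-based part (64, 64) is added with "addvl dest, src, #4" and the
   remaining 16 bytes with an ordinary ADD via aarch64_add_offset_1.  */
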
6270 /* Like aarch64_add_offset, but the offset is given as an rtx rather
6271 than a poly_int64. */
6272
6273 void
6274 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
6275 rtx offset_rtx, rtx temp1, rtx temp2)
6276 {
6277 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
6278 temp1, temp2, false);
6279 }
6280
6281 /* Add DELTA to the stack pointer, marking the instructions frame-related.
6282 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
6283 if TEMP1 already contains abs (DELTA). */
6284
6285 static inline void
6286 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
6287 {
6288 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
6289 temp1, temp2, true, emit_move_imm);
6290 }
6291
6292 /* Subtract DELTA from the stack pointer, marking the instructions
6293 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
6294 if nonnull. */
6295
6296 static inline void
6297 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
6298 bool emit_move_imm = true)
6299 {
6300 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
6301 temp1, temp2, frame_related_p, emit_move_imm);
6302 }
6303
6304 /* Set DEST to (vec_series BASE STEP). */
6305
6306 static void
6307 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
6308 {
6309 machine_mode mode = GET_MODE (dest);
6310 scalar_mode inner = GET_MODE_INNER (mode);
6311
6312 /* Each operand can be a register or an immediate in the range [-16, 15]. */
6313 if (!aarch64_sve_index_immediate_p (base))
6314 base = force_reg (inner, base);
6315 if (!aarch64_sve_index_immediate_p (step))
6316 step = force_reg (inner, step);
6317
6318 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
6319 }
6320
6321 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
6322 register of mode MODE. Use TARGET for the result if it's nonnull
6323 and convenient.
6324
6325 The two vector modes must have the same element mode. The behavior
6326 is to duplicate architectural lane N of SRC into architectural lanes
6327 N + I * STEP of the result. On big-endian targets, architectural
6328 lane 0 of an Advanced SIMD vector is the last element of the vector
6329 in memory layout, so for big-endian targets this operation has the
6330 effect of reversing SRC before duplicating it. Callers need to
6331 account for this. */
6332
6333 rtx
6334 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
6335 {
6336 machine_mode src_mode = GET_MODE (src);
6337 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
6338 insn_code icode = (BYTES_BIG_ENDIAN
6339 ? code_for_aarch64_vec_duplicate_vq_be (mode)
6340 : code_for_aarch64_vec_duplicate_vq_le (mode));
6341
6342 unsigned int i = 0;
6343 expand_operand ops[3];
6344 create_output_operand (&ops[i++], target, mode);
6345 create_output_operand (&ops[i++], src, src_mode);
6346 if (BYTES_BIG_ENDIAN)
6347 {
6348 /* Create a PARALLEL describing the reversal of SRC. */
6349 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
6350 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
6351 nelts_per_vq - 1, -1);
6352 create_fixed_operand (&ops[i++], sel);
6353 }
6354 expand_insn (icode, i, ops);
6355 return ops[0].value;
6356 }
6357
6358 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
6359 the memory image into DEST. Return true on success. */
6360
6361 static bool
6362 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
6363 {
6364 src = force_const_mem (GET_MODE (src), src);
6365 if (!src)
6366 return false;
6367
6368 /* Make sure that the address is legitimate. */
6369 if (!aarch64_sve_ld1rq_operand_p (src))
6370 {
6371 rtx addr = force_reg (Pmode, XEXP (src, 0));
6372 src = replace_equiv_address (src, addr);
6373 }
6374
6375 machine_mode mode = GET_MODE (dest);
6376 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6377 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6378 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
6379 return true;
6380 }
6381
6382 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
6383 by N "background" values. Try to move it into TARGET using:
6384
6385 PTRUE PRED.<T>, VL<N>
6386 MOV TRUE.<T>, #<foreground>
6387 MOV FALSE.<T>, #<background>
6388 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
6389
6390 The PTRUE is always a single instruction but the MOVs might need a
6391 longer sequence. If the background value is zero (as it often is),
6392 the sequence can sometimes collapse to a PTRUE followed by a
6393 zero-predicated move.
6394
6395 Return the target on success, otherwise return null. */
6396
6397 static rtx
6398 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
6399 {
6400 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
6401
6402 /* Make sure that the PTRUE is valid. */
6403 machine_mode mode = GET_MODE (src);
6404 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6405 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6406 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
6407 == AARCH64_NUM_SVPATTERNS)
6408 return NULL_RTX;
6409
6410 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
6411 rtx_vector_builder true_builder (mode, npatterns, 1);
6412 rtx_vector_builder false_builder (mode, npatterns, 1);
6413 for (unsigned int i = 0; i < npatterns; ++i)
6414 {
6415 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6416 pred_builder.quick_push (CONST1_RTX (BImode));
6417 }
6418 for (unsigned int i = 0; i < npatterns; ++i)
6419 {
6420 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
6421 pred_builder.quick_push (CONST0_RTX (BImode));
6422 }
6423 expand_operand ops[4];
6424 create_output_operand (&ops[0], target, mode);
6425 create_input_operand (&ops[1], true_builder.build (), mode);
6426 create_input_operand (&ops[2], false_builder.build (), mode);
6427 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
6428 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
6429 return target;
6430 }
6431
6432 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
6433 SVE data mode and isn't a legitimate constant. Use TARGET for the
6434 result if convenient.
6435
6436 The returned register can have whatever mode seems most natural
6437 given the contents of SRC. */
6438
6439 static rtx
6440 aarch64_expand_sve_const_vector (rtx target, rtx src)
6441 {
6442 machine_mode mode = GET_MODE (src);
6443 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6444 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
6445 scalar_mode elt_mode = GET_MODE_INNER (mode);
6446 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
6447 unsigned int container_bits = aarch64_sve_container_bits (mode);
6448 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
6449
6450 if (nelts_per_pattern == 1
6451 && encoded_bits <= 128
6452 && container_bits != elt_bits)
6453 {
6454 /* We have a partial vector mode and a constant whose full-vector
6455 equivalent would occupy a repeating 128-bit sequence. Build that
6456 full-vector equivalent instead, so that we have the option of
6457 using LD1RQ and Advanced SIMD operations. */
6458 unsigned int repeat = container_bits / elt_bits;
6459 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
6460 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
6461 for (unsigned int i = 0; i < npatterns; ++i)
6462 for (unsigned int j = 0; j < repeat; ++j)
6463 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6464 target = aarch64_target_reg (target, full_mode);
6465 return aarch64_expand_sve_const_vector (target, builder.build ());
6466 }
6467
6468 if (nelts_per_pattern == 1 && encoded_bits == 128)
6469 {
6470 /* The constant is a duplicated quadword but can't be narrowed
6471 beyond a quadword. Get the memory image of the first quadword
6472 as a 128-bit vector and try using LD1RQ to load it from memory.
6473
6474 The effect for both endiannesses is to load memory lane N into
6475 architectural lanes N + I * STEP of the result. On big-endian
6476 targets, the layout of the 128-bit vector in an Advanced SIMD
6477 register would be different from its layout in an SVE register,
6478 but this 128-bit vector is a memory value only. */
6479 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6480 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
6481 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
6482 return target;
6483 }
6484
6485 if (nelts_per_pattern == 1 && encoded_bits < 128)
6486 {
6487 /* The vector is a repeating sequence of 64 bits or fewer.
6488 See if we can load them using an Advanced SIMD move and then
6489 duplicate it to fill a vector. This is better than using a GPR
6490 move because it keeps everything in the same register file. */
6491 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6492 rtx_vector_builder builder (vq_mode, npatterns, 1);
6493 for (unsigned int i = 0; i < npatterns; ++i)
6494 {
6495 /* We want memory lane N to go into architectural lane N,
6496 so reverse for big-endian targets. The DUP .Q pattern
6497 has a compensating reverse built-in. */
6498 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
6499 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
6500 }
6501 rtx vq_src = builder.build ();
6502 if (aarch64_simd_valid_immediate (vq_src, NULL))
6503 {
6504 vq_src = force_reg (vq_mode, vq_src);
6505 return aarch64_expand_sve_dupq (target, mode, vq_src);
6506 }
6507
6508 /* Get an integer representation of the repeating part of Advanced
6509 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
6510 which for big-endian targets is lane-swapped wrt a normal
6511 Advanced SIMD vector. This means that for both endiannesses,
6512 memory lane N of SVE vector SRC corresponds to architectural
6513 lane N of a register holding VQ_SRC. This in turn means that
6514 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
6515 as a single 128-bit value) and thus that memory lane 0 of SRC is
6516 in the lsb of the integer. Duplicating the integer therefore
6517 ensures that memory lane N of SRC goes into architectural lane
6518 N + I * INDEX of the SVE register. */
6519 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
6520 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
6521 if (elt_value)
6522 {
6523 /* Pretend that we had a vector of INT_MODE to start with. */
6524 elt_mode = int_mode;
6525 mode = aarch64_full_sve_mode (int_mode).require ();
6526
6527 /* If the integer can be moved into a general register by a
6528 single instruction, do that and duplicate the result. */
6529 if (CONST_INT_P (elt_value)
6530 && aarch64_move_imm (INTVAL (elt_value),
6531 encoded_bits <= 32 ? SImode : DImode))
6532 {
6533 elt_value = force_reg (elt_mode, elt_value);
6534 return expand_vector_broadcast (mode, elt_value);
6535 }
6536 }
6537 else if (npatterns == 1)
6538 /* We're duplicating a single value, but can't do better than
6539 force it to memory and load from there. This handles things
6540 like symbolic constants. */
6541 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
6542
6543 if (elt_value)
6544 {
6545 /* Load the element from memory if we can, otherwise move it into
6546 a register and use a DUP. */
6547 rtx op = force_const_mem (elt_mode, elt_value);
6548 if (!op)
6549 op = force_reg (elt_mode, elt_value);
6550 return expand_vector_broadcast (mode, op);
6551 }
6552 }
6553
6554 /* Try using INDEX. */
6555 rtx base, step;
6556 if (const_vec_series_p (src, &base, &step))
6557 {
6558 aarch64_expand_vec_series (target, base, step);
6559 return target;
6560 }
6561
6562 /* From here on, it's better to force the whole constant to memory
6563 if we can. */
6564 if (GET_MODE_NUNITS (mode).is_constant ())
6565 return NULL_RTX;
6566
6567 if (nelts_per_pattern == 2)
6568 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
6569 return res;
6570
6571 /* Expand each pattern individually. */
6572 gcc_assert (npatterns > 1);
6573 rtx_vector_builder builder;
6574 auto_vec<rtx, 16> vectors (npatterns);
6575 for (unsigned int i = 0; i < npatterns; ++i)
6576 {
6577 builder.new_vector (mode, 1, nelts_per_pattern);
6578 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
6579 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
6580 vectors.quick_push (force_reg (mode, builder.build ()));
6581 }
6582
6583 /* Use permutes to interleave the separate vectors. */
6584 while (npatterns > 1)
6585 {
6586 npatterns /= 2;
6587 for (unsigned int i = 0; i < npatterns; ++i)
6588 {
6589 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
6590 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
6591 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
6592 vectors[i] = tmp;
6593 }
6594 }
6595 gcc_assert (vectors[0] == target);
6596 return target;
6597 }
6598
6599 /* Use WHILE to set a predicate register of mode MODE in which the first
6600 VL bits are set and the rest are clear. Use TARGET for the register
6601 if it's nonnull and convenient. */
6602
6603 static rtx
6604 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
6605 unsigned int vl)
6606 {
6607 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
6608 target = aarch64_target_reg (target, mode);
6609 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
6610 target, const0_rtx, limit));
6611 return target;
6612 }
6613
6614 static rtx
6615 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
6616
6617 /* BUILDER is a constant predicate in which the index of every set bit
6618 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6619 by inverting every element at a multiple of ELT_SIZE and EORing the
6620 result with an ELT_SIZE PTRUE.
6621
6622 Return a register that contains the constant on success, otherwise
6623 return null. Use TARGET as the register if it is nonnull and
6624 convenient. */
6625
6626 static rtx
6627 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
6628 unsigned int elt_size)
6629 {
6630 /* Invert every element at a multiple of ELT_SIZE, keeping the
6631 other bits zero. */
6632 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
6633 builder.nelts_per_pattern ());
6634 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6635 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
6636 inv_builder.quick_push (const1_rtx);
6637 else
6638 inv_builder.quick_push (const0_rtx);
6639 inv_builder.finalize ();
6640
6641 /* See if we can load the constant cheaply. */
6642 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
6643 if (!inv)
6644 return NULL_RTX;
6645
6646 /* EOR the result with an ELT_SIZE PTRUE. */
6647 rtx mask = aarch64_ptrue_all (elt_size);
6648 mask = force_reg (VNx16BImode, mask);
6649 inv = gen_lowpart (VNx16BImode, inv);
6650 target = aarch64_target_reg (target, VNx16BImode);
6651 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
6652 return target;
6653 }
6654
6655 /* BUILDER is a constant predicate in which the index of every set bit
6656 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6657 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
6658 register on success, otherwise return null. Use TARGET as the register
6659 if nonnull and convenient. */
6660
6661 static rtx
6662 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
6663 unsigned int elt_size,
6664 unsigned int permute_size)
6665 {
6666 /* We're going to split the constant into two new constants A and B,
6667 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
6668 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
6669
6670 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
6671 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
6672
6673 where _ indicates elements that will be discarded by the permute.
6674
6675 First calculate the ELT_SIZEs for A and B. */
6676 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
6677 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
6678 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
6679 if (INTVAL (builder.elt (i)) != 0)
6680 {
6681 if (i & permute_size)
6682 b_elt_size |= i - permute_size;
6683 else
6684 a_elt_size |= i;
6685 }
6686 a_elt_size &= -a_elt_size;
6687 b_elt_size &= -b_elt_size;
6688
6689 /* Now construct the vectors themselves. */
6690 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
6691 builder.nelts_per_pattern ());
6692 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
6693 builder.nelts_per_pattern ());
6694 unsigned int nelts = builder.encoded_nelts ();
6695 for (unsigned int i = 0; i < nelts; ++i)
6696 if (i & (elt_size - 1))
6697 {
6698 a_builder.quick_push (const0_rtx);
6699 b_builder.quick_push (const0_rtx);
6700 }
6701 else if ((i & permute_size) == 0)
6702 {
6703 /* The A and B elements are significant. */
6704 a_builder.quick_push (builder.elt (i));
6705 b_builder.quick_push (builder.elt (i + permute_size));
6706 }
6707 else
6708 {
6709 /* The A and B elements are going to be discarded, so pick whatever
6710 is likely to give a nice constant. We are targeting element
6711 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
6712 with the aim of each being a sequence of ones followed by
6713 a sequence of zeros. So:
6714
6715 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
6716 duplicate the last X_ELT_SIZE element, to extend the
6717 current sequence of ones or zeros.
6718
6719 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
6720 zero, so that the constant really does have X_ELT_SIZE and
6721 not a smaller size. */
6722 if (a_elt_size > permute_size)
6723 a_builder.quick_push (const0_rtx);
6724 else
6725 a_builder.quick_push (a_builder.elt (i - a_elt_size));
6726 if (b_elt_size > permute_size)
6727 b_builder.quick_push (const0_rtx);
6728 else
6729 b_builder.quick_push (b_builder.elt (i - b_elt_size));
6730 }
6731 a_builder.finalize ();
6732 b_builder.finalize ();
6733
6734 /* Try loading A into a register. */
6735 rtx_insn *last = get_last_insn ();
6736 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
6737 if (!a)
6738 return NULL_RTX;
6739
6740 /* Try loading B into a register. */
6741 rtx b = a;
6742 if (a_builder != b_builder)
6743 {
6744 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
6745 if (!b)
6746 {
6747 delete_insns_since (last);
6748 return NULL_RTX;
6749 }
6750 }
6751
6752 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
6753 operands but permutes them as though they had mode MODE. */
6754 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
6755 target = aarch64_target_reg (target, GET_MODE (a));
6756 rtx type_reg = CONST0_RTX (mode);
6757 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
6758 return target;
6759 }
6760
6761 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
6762 constant in BUILDER into an SVE predicate register. Return the register
6763 on success, otherwise return null. Use TARGET for the register if
6764 nonnull and convenient.
6765
6766 ALLOW_RECURSE_P is true if we can use methods that would call this
6767 function recursively. */
6768
6769 static rtx
6770 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
6771 bool allow_recurse_p)
6772 {
6773 if (builder.encoded_nelts () == 1)
6774 /* A PFALSE or a PTRUE .B ALL. */
6775 return aarch64_emit_set_immediate (target, builder);
6776
6777 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
6778 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
6779 {
6780 /* If we can load the constant using PTRUE, use it as-is. */
6781 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
6782 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
6783 return aarch64_emit_set_immediate (target, builder);
6784
6785 /* Otherwise use WHILE to set the first VL bits. */
6786 return aarch64_sve_move_pred_via_while (target, mode, vl);
6787 }
6788
6789 if (!allow_recurse_p)
6790 return NULL_RTX;
6791
6792 /* Try inverting the vector in element size ELT_SIZE and then EORing
6793 the result with an ELT_SIZE PTRUE. */
6794 if (INTVAL (builder.elt (0)) == 0)
6795 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
6796 elt_size))
6797 return res;
6798
6799 /* Try using TRN1 to permute two simpler constants. */
6800 for (unsigned int i = elt_size; i <= 8; i *= 2)
6801 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
6802 elt_size, i))
6803 return res;
6804
6805 return NULL_RTX;
6806 }
6807
6808 /* Return an SVE predicate register that contains the VNx16BImode
6809 constant in BUILDER, without going through the move expanders.
6810
6811 The returned register can have whatever mode seems most natural
6812 given the contents of BUILDER. Use TARGET for the result if
6813 convenient. */
6814
6815 static rtx
6816 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
6817 {
6818 /* Try loading the constant using pure predicate operations. */
6819 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
6820 return res;
6821
6822 /* Try forcing the constant to memory. */
6823 if (builder.full_nelts ().is_constant ())
6824 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
6825 {
6826 target = aarch64_target_reg (target, VNx16BImode);
6827 emit_move_insn (target, mem);
6828 return target;
6829 }
6830
6831 /* The last resort is to load the constant as an integer and then
6832 compare it against zero. Use -1 for set bits in order to increase
6833 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
6834 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
6835 builder.nelts_per_pattern ());
6836 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6837 int_builder.quick_push (INTVAL (builder.elt (i))
6838 ? constm1_rtx : const0_rtx);
6839 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
6840 int_builder.build ());
6841 }
6842
6843 /* Set DEST to immediate IMM. */
6844
6845 void
6846 aarch64_expand_mov_immediate (rtx dest, rtx imm)
6847 {
6848 machine_mode mode = GET_MODE (dest);
6849
6850 /* Check what type of symbol it is. */
6851 scalar_int_mode int_mode;
6852 if ((SYMBOL_REF_P (imm)
6853 || LABEL_REF_P (imm)
6854 || GET_CODE (imm) == CONST
6855 || GET_CODE (imm) == CONST_POLY_INT)
6856 && is_a <scalar_int_mode> (mode, &int_mode))
6857 {
6858 rtx mem;
6859 poly_int64 offset;
6860 HOST_WIDE_INT const_offset;
6861 enum aarch64_symbol_type sty;
6862
6863 /* If we have (const (plus symbol offset)), separate out the offset
6864 before we start classifying the symbol. */
6865 rtx base = strip_offset (imm, &offset);
6866
6867 /* We must always add an offset involving VL separately, rather than
6868 folding it into the relocation. */
6869 if (!offset.is_constant (&const_offset))
6870 {
6871 if (!TARGET_SVE)
6872 {
6873 aarch64_report_sve_required ();
6874 return;
6875 }
6876 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
6877 emit_insn (gen_rtx_SET (dest, imm));
6878 else
6879 {
6880 /* Do arithmetic on 32-bit values if the result is smaller
6881 than that. */
6882 if (partial_subreg_p (int_mode, SImode))
6883 {
6884 /* It is invalid to do symbol calculations in modes
6885 narrower than SImode. */
6886 gcc_assert (base == const0_rtx);
6887 dest = gen_lowpart (SImode, dest);
6888 int_mode = SImode;
6889 }
6890 if (base != const0_rtx)
6891 {
6892 base = aarch64_force_temporary (int_mode, dest, base);
6893 aarch64_add_offset (int_mode, dest, base, offset,
6894 NULL_RTX, NULL_RTX, false);
6895 }
6896 else
6897 aarch64_add_offset (int_mode, dest, base, offset,
6898 dest, NULL_RTX, false);
6899 }
6900 return;
6901 }
6902
6903 sty = aarch64_classify_symbol (base, const_offset);
6904 switch (sty)
6905 {
6906 case SYMBOL_FORCE_TO_MEM:
6907 if (int_mode != ptr_mode)
6908 imm = convert_memory_address (ptr_mode, imm);
6909
6910 if (const_offset != 0
6911 && targetm.cannot_force_const_mem (ptr_mode, imm))
6912 {
6913 gcc_assert (can_create_pseudo_p ());
6914 base = aarch64_force_temporary (int_mode, dest, base);
6915 aarch64_add_offset (int_mode, dest, base, const_offset,
6916 NULL_RTX, NULL_RTX, false);
6917 return;
6918 }
6919
6920 mem = force_const_mem (ptr_mode, imm);
6921 gcc_assert (mem);
6922
6923 /* If we aren't generating PC relative literals, then
6924 we need to expand the literal pool access carefully.
6925 This is something that needs to be done in a number
6926 of places, so could well live as a separate function. */
6927 if (!aarch64_pcrelative_literal_loads)
6928 {
6929 gcc_assert (can_create_pseudo_p ());
6930 base = gen_reg_rtx (ptr_mode);
6931 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6932 if (ptr_mode != Pmode)
6933 base = convert_memory_address (Pmode, base);
6934 mem = gen_rtx_MEM (ptr_mode, base);
6935 }
6936
6937 if (int_mode != ptr_mode)
6938 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6939
6940 emit_insn (gen_rtx_SET (dest, mem));
6941
6942 return;
6943
6944 case SYMBOL_SMALL_TLSGD:
6945 case SYMBOL_SMALL_TLSDESC:
6946 case SYMBOL_SMALL_TLSIE:
6947 case SYMBOL_SMALL_GOT_28K:
6948 case SYMBOL_SMALL_GOT_4G:
6949 case SYMBOL_TINY_GOT:
6950 case SYMBOL_TINY_TLSIE:
6951 if (const_offset != 0)
6952 {
6953 gcc_assert (can_create_pseudo_p ());
6954 base = aarch64_force_temporary (int_mode, dest, base);
6955 aarch64_add_offset (int_mode, dest, base, const_offset,
6956 NULL_RTX, NULL_RTX, false);
6957 return;
6958 }
6959 /* FALLTHRU */
6960
6961 case SYMBOL_SMALL_ABSOLUTE:
6962 case SYMBOL_TINY_ABSOLUTE:
6963 case SYMBOL_TLSLE12:
6964 case SYMBOL_TLSLE24:
6965 case SYMBOL_TLSLE32:
6966 case SYMBOL_TLSLE48:
6967 aarch64_load_symref_appropriately (dest, imm, sty);
6968 return;
6969
6970 default:
6971 gcc_unreachable ();
6972 }
6973 }
6974
6975 if (!CONST_INT_P (imm))
6976 {
6977 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
6978 {
6979 /* Only the low bit of each .H, .S and .D element is defined,
6980 so we can set the upper bits to whatever we like. If the
6981 predicate is all-true in MODE, prefer to set all the undefined
6982 bits as well, so that we can share a single .B predicate for
6983 all modes. */
6984 if (imm == CONSTM1_RTX (mode))
6985 imm = CONSTM1_RTX (VNx16BImode);
6986
6987 /* All methods for constructing predicate modes wider than VNx16BI
6988 will set the upper bits of each element to zero. Expose this
6989 by moving such constants as a VNx16BI, so that all bits are
6990 significant and so that constants for different modes can be
6991 shared. The wider constant will still be available as a
6992 REG_EQUAL note. */
6993 rtx_vector_builder builder;
6994 if (aarch64_get_sve_pred_bits (builder, imm))
6995 {
6996 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6997 if (dest != res)
6998 emit_move_insn (dest, gen_lowpart (mode, res));
6999 return;
7000 }
7001 }
7002
7003 if (GET_CODE (imm) == HIGH
7004 || aarch64_simd_valid_immediate (imm, NULL))
7005 {
7006 emit_insn (gen_rtx_SET (dest, imm));
7007 return;
7008 }
7009
7010 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
7011 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
7012 {
7013 if (dest != res)
7014 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
7015 return;
7016 }
7017
7018 rtx mem = force_const_mem (mode, imm);
7019 gcc_assert (mem);
7020 emit_move_insn (dest, mem);
7021 return;
7022 }
7023
7024 aarch64_internal_mov_immediate (dest, imm, true, mode);
7025 }
7026
7027 /* Return the MEM rtx that provides the canary value that should be used
7028 for stack-smashing protection. MODE is the mode of the memory.
7029 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
7030 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
7031 indicates whether the caller is performing a SET or a TEST operation. */
7032
7033 rtx
7034 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
7035 aarch64_salt_type salt_type)
7036 {
7037 rtx addr;
7038 if (aarch64_stack_protector_guard == SSP_GLOBAL)
7039 {
7040 gcc_assert (MEM_P (decl_rtl));
7041 addr = XEXP (decl_rtl, 0);
7042 poly_int64 offset;
7043 rtx base = strip_offset_and_salt (addr, &offset);
7044 if (!SYMBOL_REF_P (base))
7045 return decl_rtl;
7046
7047 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
7048 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
7049 addr = gen_rtx_CONST (Pmode, addr);
7050 addr = plus_constant (Pmode, addr, offset);
7051 }
7052 else
7053 {
7054 /* Calculate the address from the system register. */
7055 rtx salt = GEN_INT (salt_type);
7056 addr = gen_reg_rtx (mode);
7057 if (mode == DImode)
7058 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
7059 else
7060 {
7061 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
7062 addr = convert_memory_address (Pmode, addr);
7063 }
7064 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
7065 }
7066 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
7067 }
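
/* As a rough sketch (option values are illustrative only): with
   -mstack-protector-guard=sysreg -mstack-protector-guard-reg=sp_el0
   -mstack-protector-guard-offset=8, the system-register path above
   typically ends up loading the canary address with something like:

	mrs	x0, sp_el0
	ldr	x0, [x0, 8]

   while the SSP_GLOBAL path wraps __stack_chk_guard in a salted
   UNSPEC_SALT_ADDR so that the addresses used for the SET and the TEST
   of the canary stay distinct.  */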
7068
7069 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
7070 that is known to contain PTRUE. */
7071
7072 void
7073 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
7074 {
7075 expand_operand ops[3];
7076 machine_mode mode = GET_MODE (dest);
7077 create_output_operand (&ops[0], dest, mode);
7078 create_input_operand (&ops[1], pred, GET_MODE (pred));
7079 create_input_operand (&ops[2], src, mode);
7080 temporary_volatile_ok v (true);
7081 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
7082 }
7083
7084 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
7085 operand is in memory. In this case we need to use the predicated LD1
7086 and ST1 instead of LDR and STR, both for correctness on big-endian
7087 targets and because LD1 and ST1 support a wider range of addressing modes.
7088 PRED_MODE is the mode of the predicate.
7089
7090 See the comment at the head of aarch64-sve.md for details about the
7091 big-endian handling. */
7092
7093 void
7094 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
7095 {
7096 machine_mode mode = GET_MODE (dest);
7097 rtx ptrue = aarch64_ptrue_reg (pred_mode);
7098 if (!register_operand (src, mode)
7099 && !register_operand (dest, mode))
7100 {
7101 rtx tmp = gen_reg_rtx (mode);
7102 if (MEM_P (src))
7103 aarch64_emit_sve_pred_move (tmp, ptrue, src);
7104 else
7105 emit_move_insn (tmp, src);
7106 src = tmp;
7107 }
7108 aarch64_emit_sve_pred_move (dest, ptrue, src);
7109 }
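
/* For a (hypothetical) memory-to-memory copy of a VNx16QI value this
   expands to a predicated load and store through a fresh register,
   roughly:

	ptrue	p0.b, all
	ld1b	z0.b, p0/z, [x0]
	st1b	z0.b, p0, [x1]

   rather than an LDR/STR pair, which would not in general preserve the
   expected element layout on big-endian and supports fewer addressing
   modes.  */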
7110
7111 /* Called only on big-endian targets. See whether an SVE vector move
7112 from SRC to DEST is effectively a REV[BHW] instruction, because at
7113 least one operand is a subreg of an SVE vector that has wider or
7114 narrower elements. Return true and emit the instruction if so.
7115
7116 For example:
7117
7118 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
7119
7120 represents a VIEW_CONVERT between the following vectors, viewed
7121 in memory order:
7122
7123 R2: { [0].high, [0].low, [1].high, [1].low, ... }
7124 R1: { [0], [1], [2], [3], ... }
7125
7126 The high part of lane X in R2 should therefore correspond to lane X*2
7127 of R1, but the register representations are:
7128
7129 msb lsb
7130 R2: ...... [1].high [1].low [0].high [0].low
7131 R1: ...... [3] [2] [1] [0]
7132
7133 where the low part of lane X in R2 corresponds to lane X*2 in R1.
7134 We therefore need a reverse operation to swap the high and low values
7135 around.
7136
7137 This is purely an optimization. Without it we would spill the
7138 subreg operand to the stack in one mode and reload it in the
7139 other mode, which has the same effect as the REV. */
7140
7141 bool
7142 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
7143 {
7144 gcc_assert (BYTES_BIG_ENDIAN);
7145
7146 /* Do not try to optimize subregs that LRA has created for matched
7147 reloads. These subregs only exist as a temporary measure to make
7148 the RTL well-formed, but they are exempt from the usual
7149 TARGET_CAN_CHANGE_MODE_CLASS rules.
7150
7151 For example, if we have:
7152
7153 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
7154
7155 and the constraints require R1 and R2 to be in the same register,
7156 LRA may need to create RTL such as:
7157
7158 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
7159 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
7160 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
7161
7162 which forces both the input and output of the original instruction
7163 to use the same hard register. But for this to work, the normal
7164 rules have to be suppressed on the subreg input, otherwise LRA
7165 would need to reload that input too, meaning that the process
7166 would never terminate. To compensate for this, the normal rules
7167 are also suppressed for the subreg output of the first move.
7168 Ignoring the special case and handling the first move normally
7169 would therefore generate wrong code: we would reverse the elements
7170 for the first subreg but not reverse them back for the second subreg. */
7171 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
7172 dest = SUBREG_REG (dest);
7173 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
7174 src = SUBREG_REG (src);
7175
7176 /* The optimization handles two single SVE REGs with different element
7177 sizes. */
7178 if (!REG_P (dest)
7179 || !REG_P (src)
7180 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
7181 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
7182 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
7183 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
7184 return false;
7185
7186 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
7187 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
7188 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
7189 UNSPEC_REV_SUBREG);
7190 emit_insn (gen_rtx_SET (dest, unspec));
7191 return true;
7192 }
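
/* Continuing the example in the comment above: moving a VNx16QI
   register into a VNx8HI register on big-endian is rewritten as the
   UNSPEC_REV_SUBREG pattern, which aarch64_split_sve_subreg_move below
   eventually splits into (roughly, with illustrative register numbers)
   a predicated byte reversal within each halfword container:

	ptrue	p0.b, all
	revb	z0.h, p0/m, z1.h  */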
7193
7194 /* Return a copy of X with mode MODE, without changing its other
7195 attributes. Unlike gen_lowpart, this doesn't care whether the
7196 mode change is valid. */
7197
7198 rtx
7199 aarch64_replace_reg_mode (rtx x, machine_mode mode)
7200 {
7201 if (GET_MODE (x) == mode)
7202 return x;
7203
7204 x = shallow_copy_rtx (x);
7205 set_mode_and_regno (x, mode, REGNO (x));
7206 return x;
7207 }
7208
7209 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
7210 stored in wider integer containers. */
7211
7212 static unsigned int
7213 aarch64_sve_rev_unspec (machine_mode mode)
7214 {
7215 switch (GET_MODE_UNIT_SIZE (mode))
7216 {
7217 case 1: return UNSPEC_REVB;
7218 case 2: return UNSPEC_REVH;
7219 case 4: return UNSPEC_REVW;
7220 }
7221 gcc_unreachable ();
7222 }
7223
7224 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
7225 operands. */
7226
7227 void
7228 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
7229 {
7230 /* Decide which REV operation we need. The mode with wider elements
7231 determines the mode of the operands and the mode with the narrower
7232 elements determines the reverse width. */
7233 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
7234 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
7235 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
7236 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
7237 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
7238
7239 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
7240 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
7241
7242 /* Get the operands in the appropriate modes and emit the instruction. */
7243 ptrue = gen_lowpart (pred_mode, ptrue);
7244 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
7245 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
7246 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
7247 dest, ptrue, src));
7248 }
7249
7250 static bool
7251 aarch64_function_ok_for_sibcall (tree, tree exp)
7252 {
7253 if (crtl->abi->id () != expr_callee_abi (exp).id ())
7254 return false;
7255
7256 return true;
7257 }
7258
7259 /* Subroutine of aarch64_pass_by_reference for arguments that are not
7260 passed in SVE registers. */
7261
7262 static bool
7263 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
7264 const function_arg_info &arg)
7265 {
7266 HOST_WIDE_INT size;
7267 machine_mode dummymode;
7268 int nregs;
7269
7270 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
7271 if (arg.mode == BLKmode && arg.type)
7272 size = int_size_in_bytes (arg.type);
7273 else
7274 /* No frontends can create types with variable-sized modes, so we
7275 shouldn't be asked to pass or return them. */
7276 size = GET_MODE_SIZE (arg.mode).to_constant ();
7277
7278 /* Aggregates are passed by reference based on their size. */
7279 if (arg.aggregate_type_p ())
7280 size = int_size_in_bytes (arg.type);
7281
7282 /* Variable-sized arguments are always passed by reference. */
7283 if (size < 0)
7284 return true;
7285
7286 /* Can this be a candidate to be passed in fp/simd register(s)? */
7287 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
7288 &dummymode, &nregs, NULL,
7289 !pcum || pcum->silent_p))
7290 return false;
7291
7292 /* Arguments which are variable sized or larger than 2 registers are
7293 passed by reference unless they are a homogeneous floating-point
7294 aggregate. */
7295 return size > 2 * UNITS_PER_WORD;
7296 }
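
/* As a concrete illustration of the size rule above: a 24-byte
   struct { long a, b, c; } is larger than two registers and so is
   passed by reference, a 16-byte struct { long a, b; } is passed in a
   pair of general registers, and a 32-byte homogeneous floating-point
   aggregate such as struct { double a, b, c, d; } is still an fp/simd
   candidate and is passed by value in four FP/SIMD registers.  */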
7297
7298 /* Implement TARGET_PASS_BY_REFERENCE. */
7299
7300 static bool
7301 aarch64_pass_by_reference (cumulative_args_t pcum_v,
7302 const function_arg_info &arg)
7303 {
7304 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7305
7306 if (!arg.type)
7307 return aarch64_pass_by_reference_1 (pcum, arg);
7308
7309 pure_scalable_type_info pst_info;
7310 switch (pst_info.analyze (arg.type))
7311 {
7312 case pure_scalable_type_info::IS_PST:
7313 if (pcum && !pcum->silent_p && !TARGET_SVE)
7314 /* We can't gracefully recover at this point, so make this a
7315 fatal error. */
7316 fatal_error (input_location, "arguments of type %qT require"
7317 " the SVE ISA extension", arg.type);
7318
7319 /* Variadic SVE types are passed by reference. Normal non-variadic
7320 arguments are too if we've run out of registers. */
7321 return (!arg.named
7322 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
7323 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
7324
7325 case pure_scalable_type_info::DOESNT_MATTER:
7326 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
7327 return true;
7328
7329 case pure_scalable_type_info::NO_ABI_IDENTITY:
7330 case pure_scalable_type_info::ISNT_PST:
7331 return aarch64_pass_by_reference_1 (pcum, arg);
7332 }
7333 gcc_unreachable ();
7334 }
7335
7336 /* Return TRUE if VALTYPE is padded to its least significant bits. */
7337 static bool
7338 aarch64_return_in_msb (const_tree valtype)
7339 {
7340 machine_mode dummy_mode;
7341 int dummy_int;
7342
7343 /* Never happens in little-endian mode. */
7344 if (!BYTES_BIG_ENDIAN)
7345 return false;
7346
7347 /* Only composite types smaller than or equal to 16 bytes can
7348 be potentially returned in registers. */
7349 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
7350 || int_size_in_bytes (valtype) <= 0
7351 || int_size_in_bytes (valtype) > 16)
7352 return false;
7353
7354 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
7355 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
7356 is always passed/returned in the least significant bits of fp/simd
7357 register(s). */
7358 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
7359 &dummy_mode, &dummy_int, NULL,
7360 false))
7361 return false;
7362
7363 /* Likewise pure scalable types for SVE vector and predicate registers. */
7364 pure_scalable_type_info pst_info;
7365 if (pst_info.analyze_registers (valtype))
7366 return false;
7367
7368 return true;
7369 }
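
/* For example (big-endian only, since this hook returns false
   otherwise): a 6-byte struct { short a, b, c; } is a composite that is
   neither an HFA/HVA nor a pure scalable type, so its value is returned
   in the most significant bytes of X0 rather than in the low bits.  */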
7370
7371 /* Implement TARGET_FUNCTION_VALUE.
7372 Define how to find the value returned by a function. */
7373
7374 static rtx
7375 aarch64_function_value (const_tree type, const_tree func,
7376 bool outgoing ATTRIBUTE_UNUSED)
7377 {
7378 machine_mode mode;
7379 int unsignedp;
7380
7381 mode = TYPE_MODE (type);
7382 if (INTEGRAL_TYPE_P (type))
7383 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
7384
7385 pure_scalable_type_info pst_info;
7386 if (type && pst_info.analyze_registers (type))
7387 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
7388
7389 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7390 are returned in memory, not by value. */
7391 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7392 bool sve_p = (vec_flags & VEC_ANY_SVE);
7393
7394 if (aarch64_return_in_msb (type))
7395 {
7396 HOST_WIDE_INT size = int_size_in_bytes (type);
7397
7398 if (size % UNITS_PER_WORD != 0)
7399 {
7400 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
7401 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
7402 }
7403 }
7404
7405 int count;
7406 machine_mode ag_mode;
7407 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
7408 NULL, false))
7409 {
7410 gcc_assert (!sve_p);
7411 if (!aarch64_composite_type_p (type, mode))
7412 {
7413 gcc_assert (count == 1 && mode == ag_mode);
7414 return gen_rtx_REG (mode, V0_REGNUM);
7415 }
7416 else if (aarch64_advsimd_full_struct_mode_p (mode)
7417 && known_eq (GET_MODE_SIZE (ag_mode), 16))
7418 return gen_rtx_REG (mode, V0_REGNUM);
7419 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7420 && known_eq (GET_MODE_SIZE (ag_mode), 8))
7421 return gen_rtx_REG (mode, V0_REGNUM);
7422 else
7423 {
7424 int i;
7425 rtx par;
7426
7427 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
7428 for (i = 0; i < count; i++)
7429 {
7430 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
7431 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
7432 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7433 XVECEXP (par, 0, i) = tmp;
7434 }
7435 return par;
7436 }
7437 }
7438 else
7439 {
7440 if (sve_p)
7441 {
7442 /* Vector types can acquire a partial SVE mode using things like
7443 __attribute__((vector_size(N))), and this is potentially useful.
7444 However, the choice of mode doesn't affect the type's ABI
7445 identity, so we should treat the types as though they had
7446 the associated integer mode, just like they did before SVE
7447 was introduced.
7448
7449 We know that the vector must be 128 bits or smaller,
7450 otherwise we'd have returned it in memory instead. */
7451 gcc_assert (type
7452 && (aarch64_some_values_include_pst_objects_p (type)
7453 || (vec_flags & VEC_PARTIAL)));
7454
7455 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
7456 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
7457 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
7458 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
7459 }
7460 return gen_rtx_REG (mode, R0_REGNUM);
7461 }
7462 }
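
/* Some illustrative instances of the cases above: an __int128 result is
   returned in X0 and X1; a homogeneous floating-point aggregate such as
   struct { float a, b, c; } is returned in S0-S2 via the PARALLEL built
   above; and a fixed-length vector that happens to have a partial SVE
   mode is returned as though it had the equivalent integer mode, in X0.  */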
7463
7464 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
7465 Return true if REGNO is the number of a hard register in which the values
7466 of a called function may come back. */
7467
7468 static bool
7469 aarch64_function_value_regno_p (const unsigned int regno)
7470 {
7471 /* A maximum of 16 bytes can be returned in the general registers. Examples
7472 of 16-byte return values are: 128-bit integers and 16-byte small
7473 structures (excluding homogeneous floating-point aggregates). */
7474 if (regno == R0_REGNUM || regno == R1_REGNUM)
7475 return true;
7476
7477 /* Up to four fp/simd registers can return a function value, e.g. a
7478 homogeneous floating-point aggregate having four members. */
7479 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
7480 return TARGET_FLOAT;
7481
7482 return false;
7483 }
7484
7485 /* Subroutine for aarch64_return_in_memory for types that are not returned
7486 in SVE registers. */
7487
7488 static bool
7489 aarch64_return_in_memory_1 (const_tree type)
7490 {
7491 HOST_WIDE_INT size;
7492 machine_mode ag_mode;
7493 int count;
7494
7495 if (!AGGREGATE_TYPE_P (type)
7496 && TREE_CODE (type) != COMPLEX_TYPE
7497 && TREE_CODE (type) != VECTOR_TYPE)
7498 /* Simple scalar types are always returned in registers. */
7499 return false;
7500
7501 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7502 &ag_mode, &count, NULL, false))
7503 return false;
7504
7505 /* Types larger than 2 registers are returned in memory. */
7506 size = int_size_in_bytes (type);
7507 return (size < 0 || size > 2 * UNITS_PER_WORD);
7508 }
7509
7510 /* Implement TARGET_RETURN_IN_MEMORY.
7511
7512 If the type T of the result of a function is such that
7513 void func (T arg)
7514 would require that arg be passed as a value in a register (or set of
7515 registers) according to the parameter passing rules, then the result
7516 is returned in the same registers as would be used for such an
7517 argument. */
7518
7519 static bool
7520 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
7521 {
7522 pure_scalable_type_info pst_info;
7523 switch (pst_info.analyze (type))
7524 {
7525 case pure_scalable_type_info::IS_PST:
7526 return (pst_info.num_zr () > NUM_FP_ARG_REGS
7527 || pst_info.num_pr () > NUM_PR_ARG_REGS);
7528
7529 case pure_scalable_type_info::DOESNT_MATTER:
7530 gcc_assert (aarch64_return_in_memory_1 (type));
7531 return true;
7532
7533 case pure_scalable_type_info::NO_ABI_IDENTITY:
7534 case pure_scalable_type_info::ISNT_PST:
7535 return aarch64_return_in_memory_1 (type);
7536 }
7537 gcc_unreachable ();
7538 }
7539
7540 static bool
7541 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
7542 const_tree type, int *nregs)
7543 {
7544 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7545 return aarch64_vfp_is_call_or_return_candidate (mode, type,
7546 &pcum->aapcs_vfp_rmode,
7547 nregs, NULL, pcum->silent_p);
7548 }
7549
7550 /* Given MODE and TYPE of a function argument, return the alignment in
7551 bits. The idea is to suppress any stronger alignment requested by
7552 the user and opt for the natural alignment (specified in AAPCS64 \S
7553 4.1). ABI_BREAK is set to the old alignment if the alignment was
7554 incorrectly calculated in versions of GCC prior to GCC-9.
7555 ABI_BREAK_PACKED is set to the old alignment if it was incorrectly
7556 calculated in versions between GCC-9 and GCC-13. This is a helper
7557 function for local use only. */
7558
7559 static unsigned int
7560 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
7561 unsigned int *abi_break,
7562 unsigned int *abi_break_packed)
7563 {
7564 *abi_break = 0;
7565 *abi_break_packed = 0;
7566 if (!type)
7567 return GET_MODE_ALIGNMENT (mode);
7568
7569 if (integer_zerop (TYPE_SIZE (type)))
7570 return 0;
7571
7572 gcc_assert (TYPE_MODE (type) == mode);
7573
7574 if (!AGGREGATE_TYPE_P (type))
7575 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
7576
7577 if (TREE_CODE (type) == ARRAY_TYPE)
7578 return TYPE_ALIGN (TREE_TYPE (type));
7579
7580 unsigned int alignment = 0;
7581 unsigned int bitfield_alignment_with_packed = 0;
7582 unsigned int bitfield_alignment = 0;
7583 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7584 if (TREE_CODE (field) == FIELD_DECL)
7585 {
7586 /* Note that we explicitly consider zero-sized fields here,
7587 even though they don't map to AAPCS64 machine types.
7588 For example, in:
7589
7590 struct __attribute__((aligned(8))) empty {};
7591
7592 struct s {
7593 [[no_unique_address]] empty e;
7594 int x;
7595 };
7596
7597 "s" contains only one Fundamental Data Type (the int field)
7598 but gains 8-byte alignment and size thanks to "e". */
7599 alignment = std::max (alignment, DECL_ALIGN (field));
7600 if (DECL_BIT_FIELD_TYPE (field))
7601 {
7602 /* Take the bit-field type's alignment into account only
7603 if the user didn't reduce this field's alignment with
7604 the packed attribute. */
7605 if (!DECL_PACKED (field))
7606 bitfield_alignment
7607 = std::max (bitfield_alignment,
7608 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7609
7610 /* Compute the alignment even if the bit-field is
7611 packed, so that we can emit a warning in case the
7612 alignment changed between GCC versions. */
7613 bitfield_alignment_with_packed
7614 = std::max (bitfield_alignment_with_packed,
7615 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7616 }
7617 }
7618
7619 /* Emit a warning if the alignment is different when taking the
7620 'packed' attribute into account. */
7621 if (bitfield_alignment != bitfield_alignment_with_packed
7622 && bitfield_alignment_with_packed > alignment)
7623 *abi_break_packed = bitfield_alignment_with_packed;
7624
7625 if (bitfield_alignment > alignment)
7626 {
7627 *abi_break = alignment;
7628 return bitfield_alignment;
7629 }
7630
7631 return alignment;
7632 }
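
/* A rough illustration of the bit-field handling above (the type is
   hypothetical, not taken from the ABI documents): for something along
   the lines of

     struct S { __int128 x : 64; int y; };

   the AAPCS64 alignment comes from the bit-field's underlying __int128
   type, which versions of GCC before 9.1 ignored (ABI_BREAK records
   that older, smaller alignment).  If the field is additionally marked
   with __attribute__((packed)), GCC 13.1 and later ignore the
   underlying type again, whereas GCC 9 to 12 did not (ABI_BREAK_PACKED
   records that older, larger alignment).  */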
7633
7634 /* Layout a function argument according to the AAPCS64 rules. The rule
7635 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
7636 mode that was originally given to us by the target hook, whereas the
7637 mode in ARG might be the result of replacing partial SVE modes with
7638 the equivalent integer mode. */
7639
7640 static void
7641 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7642 {
7643 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7644 tree type = arg.type;
7645 machine_mode mode = arg.mode;
7646 int ncrn, nvrn, nregs;
7647 bool allocate_ncrn, allocate_nvrn;
7648 HOST_WIDE_INT size;
7649 unsigned int abi_break;
7650 unsigned int abi_break_packed;
7651
7652 /* We need to do this once per argument. */
7653 if (pcum->aapcs_arg_processed)
7654 return;
7655
7656 bool warn_pcs_change
7657 = (warn_psabi
7658 && !pcum->silent_p
7659 && (currently_expanding_function_start
7660 || currently_expanding_gimple_stmt));
7661
7662 /* There are several things to note here:
7663
7664 - Both the C and AAPCS64 interpretations of a type's alignment should
7665 give a value that is no greater than the type's size.
7666
7667 - Types bigger than 16 bytes are passed indirectly.
7668
7669 - If an argument of type T is passed indirectly, TYPE and MODE describe
7670 a pointer to T rather than T itself.
7671
7672 It follows that the AAPCS64 alignment of TYPE must be no greater
7673 than 16 bytes.
7674
7675 Versions prior to GCC 9.1 ignored a bitfield's underlying type
7676 and so could calculate an alignment that was too small. If this
7677 happened for TYPE then ABI_BREAK is this older, too-small alignment.
7678
7679 Although GCC 9.1 fixed that bug, it introduced a different one:
7680 it would consider the alignment of a bitfield's underlying type even
7681 if the field was packed (which should have the effect of overriding
7682 the alignment of the underlying type). This was fixed in GCC 13.1.
7683
7684 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
7685 that was too big. If this happened for TYPE, ABI_BREAK_PACKED is
7686 this older, too-big alignment.
7687
7688 Also, the fact that GCC 9 to GCC 12 considered irrelevant
7689 alignments meant they could calculate type alignments that were
7690 bigger than the type's size, contrary to the assumption above.
7691 The handling of register arguments was nevertheless (and justifiably)
7692 written to follow the assumption that the alignment can never be
7693 greater than the size. The same was not true for stack arguments;
7694 their alignment was instead handled by MIN bounds in
7695 aarch64_function_arg_boundary.
7696
7697 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
7698 an alignment of more than 16 bytes for TYPE then:
7699
7700 - If the argument was passed in registers, these GCC versions
7701 would treat the alignment as though it was *less than* 16 bytes.
7702
7703 - If the argument was passed on the stack, these GCC versions
7704 would treat the alignment as though it was *equal to* 16 bytes.
7705
7706 Both behaviors were wrong, but in different cases. */
7707 unsigned int alignment
7708 = aarch64_function_arg_alignment (mode, type, &abi_break,
7709 &abi_break_packed);
7710 gcc_assert (alignment <= 16 * BITS_PER_UNIT
7711 && (!alignment || abi_break < alignment)
7712 && (!abi_break_packed || alignment < abi_break_packed));
7713
7714 pcum->aapcs_arg_processed = true;
7715
7716 pure_scalable_type_info pst_info;
7717 if (type && pst_info.analyze_registers (type))
7718 {
7719 /* aarch64_function_arg_alignment has never had an effect on
7720 this case. */
7721
7722 /* The PCS says that it is invalid to pass an SVE value to an
7723 unprototyped function. There is no ABI-defined location we
7724 can return in this case, so we have no real choice but to raise
7725 an error immediately, even though this is only a query function. */
7726 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
7727 {
7728 gcc_assert (!pcum->silent_p);
7729 error ("SVE type %qT cannot be passed to an unprototyped function",
7730 arg.type);
7731 /* Avoid repeating the message, and avoid tripping the assert
7732 below. */
7733 pcum->pcs_variant = ARM_PCS_SVE;
7734 }
7735
7736 /* We would have converted the argument into pass-by-reference
7737 form if it didn't fit in registers. */
7738 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
7739 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
7740 gcc_assert (arg.named
7741 && pcum->pcs_variant == ARM_PCS_SVE
7742 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
7743 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
7744 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
7745 P0_REGNUM + pcum->aapcs_nprn);
7746 return;
7747 }
7748
7749 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7750 are passed by reference, not by value. */
7751 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7752 bool sve_p = (vec_flags & VEC_ANY_SVE);
7753 if (sve_p)
7754 /* Vector types can acquire a partial SVE mode using things like
7755 __attribute__((vector_size(N))), and this is potentially useful.
7756 However, the choice of mode doesn't affect the type's ABI
7757 identity, so we should treat the types as though they had
7758 the associated integer mode, just like they did before SVE
7759 was introduced.
7760
7761 We know that the vector must be 128 bits or smaller,
7762 otherwise we'd have passed it in memory instead. */
7763 gcc_assert (type
7764 && (aarch64_some_values_include_pst_objects_p (type)
7765 || (vec_flags & VEC_PARTIAL)));
7766
7767 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
7768 if (type)
7769 size = int_size_in_bytes (type);
7770 else
7771 /* No frontends can create types with variable-sized modes, so we
7772 shouldn't be asked to pass or return them. */
7773 size = GET_MODE_SIZE (mode).to_constant ();
7774 size = ROUND_UP (size, UNITS_PER_WORD);
7775
7776 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
7777 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
7778 mode,
7779 type,
7780 &nregs);
7781 gcc_assert (!sve_p || !allocate_nvrn);
7782
7783 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
7784 The following code thus handles passing by SIMD/FP registers first. */
7785
7786 nvrn = pcum->aapcs_nvrn;
7787
7788 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
7789 and homogeneous short-vector aggregates (HVA). */
7790 if (allocate_nvrn)
7791 {
7792 /* aarch64_function_arg_alignment has never had an effect on
7793 this case. */
7794 if (!pcum->silent_p && !TARGET_FLOAT)
7795 aarch64_err_no_fpadvsimd (mode);
7796
7797 if (nvrn + nregs <= NUM_FP_ARG_REGS)
7798 {
7799 pcum->aapcs_nextnvrn = nvrn + nregs;
7800 if (!aarch64_composite_type_p (type, mode))
7801 {
7802 gcc_assert (nregs == 1);
7803 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7804 }
7805 else if (aarch64_advsimd_full_struct_mode_p (mode)
7806 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
7807 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7808 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7809 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
7810 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7811 else
7812 {
7813 rtx par;
7814 int i;
7815 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7816 for (i = 0; i < nregs; i++)
7817 {
7818 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
7819 V0_REGNUM + nvrn + i);
7820 rtx offset = gen_int_mode
7821 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
7822 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7823 XVECEXP (par, 0, i) = tmp;
7824 }
7825 pcum->aapcs_reg = par;
7826 }
7827 return;
7828 }
7829 else
7830 {
7831 /* C.3 NSRN is set to 8. */
7832 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7833 goto on_stack;
7834 }
7835 }
7836
7837 ncrn = pcum->aapcs_ncrn;
7838 nregs = size / UNITS_PER_WORD;
7839
7840 /* C6 - C9, though the sign and zero extension semantics are
7841 handled elsewhere. This is the case where the argument fits
7842 entirely in general registers. */
7843 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7844 {
7845 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7846
7847 /* C.8 if the argument has an alignment of 16 then the NGRN is
7848 rounded up to the next even number. */
7849 if (nregs == 2
7850 && ncrn % 2)
7851 {
7852 /* Emit a warning if the alignment changed when taking the
7853 'packed' attribute into account. */
7854 if (warn_pcs_change
7855 && abi_break_packed
7856 && ((abi_break_packed == 16 * BITS_PER_UNIT)
7857 != (alignment == 16 * BITS_PER_UNIT)))
7858 inform (input_location, "parameter passing for argument of type "
7859 "%qT changed in GCC 13.1", type);
7860
7861 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7862 comparison is there because for > 16 * BITS_PER_UNIT
7863 alignment nregs should be > 2 and therefore it should be
7864 passed by reference rather than value. */
7865 if (alignment == 16 * BITS_PER_UNIT)
7866 {
7867 if (warn_pcs_change && abi_break)
7868 inform (input_location, "parameter passing for argument of type "
7869 "%qT changed in GCC 9.1", type);
7870 ++ncrn;
7871 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
7872 }
7873 }
7874
7875 /* If an argument with an SVE mode needs to be shifted up to the
7876 high part of the register, treat it as though it had an integer mode.
7877 Using the normal (parallel [...]) would suppress the shifting. */
7878 if (sve_p
7879 && BYTES_BIG_ENDIAN
7880 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7881 && aarch64_pad_reg_upward (mode, type, false))
7882 {
7883 mode = int_mode_for_mode (mode).require ();
7884 sve_p = false;
7885 }
7886
7887 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7888 A reg is still generated for it, but the caller should be smart
7889 enough not to use it. */
7890 if (nregs == 0
7891 || (nregs == 1 && !sve_p)
7892 || GET_MODE_CLASS (mode) == MODE_INT)
7893 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7894 else
7895 {
7896 rtx par;
7897 int i;
7898
7899 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7900 for (i = 0; i < nregs; i++)
7901 {
7902 scalar_int_mode reg_mode = word_mode;
7903 if (nregs == 1)
7904 reg_mode = int_mode_for_mode (mode).require ();
7905 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7906 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7907 GEN_INT (i * UNITS_PER_WORD));
7908 XVECEXP (par, 0, i) = tmp;
7909 }
7910 pcum->aapcs_reg = par;
7911 }
7912
7913 pcum->aapcs_nextncrn = ncrn + nregs;
7914 return;
7915 }
7916
7917 /* C.11 */
7918 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7919
7920 /* The argument is passed on the stack; record the needed number of words for
7921 this argument and align the total size if necessary. */
7922 on_stack:
7923 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7924
7925 if (warn_pcs_change
7926 && abi_break_packed
7927 && ((abi_break_packed >= 16 * BITS_PER_UNIT)
7928 != (alignment >= 16 * BITS_PER_UNIT)))
7929 inform (input_location, "parameter passing for argument of type "
7930 "%qT changed in GCC 13.1", type);
7931
7932 if (alignment == 16 * BITS_PER_UNIT)
7933 {
7934 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7935 if (pcum->aapcs_stack_size != new_size)
7936 {
7937 if (warn_pcs_change && abi_break)
7938 inform (input_location, "parameter passing for argument of type "
7939 "%qT changed in GCC 9.1", type);
7940 pcum->aapcs_stack_size = new_size;
7941 }
7942 }
7943 return;
7944 }
7945
7946 /* Implement TARGET_FUNCTION_ARG. */
7947
7948 static rtx
7949 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7950 {
7951 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7952 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7953 || pcum->pcs_variant == ARM_PCS_SIMD
7954 || pcum->pcs_variant == ARM_PCS_SVE);
7955
7956 if (arg.end_marker_p ())
7957 return gen_int_mode (pcum->pcs_variant, DImode);
7958
7959 aarch64_layout_arg (pcum_v, arg);
7960 return pcum->aapcs_reg;
7961 }
7962
7963 void
7964 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7965 const_tree fntype,
7966 rtx libname ATTRIBUTE_UNUSED,
7967 const_tree fndecl ATTRIBUTE_UNUSED,
7968 unsigned n_named ATTRIBUTE_UNUSED,
7969 bool silent_p)
7970 {
7971 pcum->aapcs_ncrn = 0;
7972 pcum->aapcs_nvrn = 0;
7973 pcum->aapcs_nprn = 0;
7974 pcum->aapcs_nextncrn = 0;
7975 pcum->aapcs_nextnvrn = 0;
7976 pcum->aapcs_nextnprn = 0;
7977 if (fntype)
7978 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7979 else
7980 pcum->pcs_variant = ARM_PCS_AAPCS64;
7981 pcum->aapcs_reg = NULL_RTX;
7982 pcum->aapcs_arg_processed = false;
7983 pcum->aapcs_stack_words = 0;
7984 pcum->aapcs_stack_size = 0;
7985 pcum->silent_p = silent_p;
7986
7987 if (!silent_p
7988 && !TARGET_FLOAT
7989 && fntype && fntype != error_mark_node)
7990 {
7991 const_tree type = TREE_TYPE (fntype);
7992 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7993 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7994 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7995 &mode, &nregs, NULL, false))
7996 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7997 }
7998
7999 if (!silent_p
8000 && !TARGET_SVE
8001 && pcum->pcs_variant == ARM_PCS_SVE)
8002 {
8003 /* We can't gracefully recover at this point, so make this a
8004 fatal error. */
8005 if (fndecl)
8006 fatal_error (input_location, "%qE requires the SVE ISA extension",
8007 fndecl);
8008 else
8009 fatal_error (input_location, "calls to functions of type %qT require"
8010 " the SVE ISA extension", fntype);
8011 }
8012 }
8013
8014 static void
8015 aarch64_function_arg_advance (cumulative_args_t pcum_v,
8016 const function_arg_info &arg)
8017 {
8018 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
8019 if (pcum->pcs_variant == ARM_PCS_AAPCS64
8020 || pcum->pcs_variant == ARM_PCS_SIMD
8021 || pcum->pcs_variant == ARM_PCS_SVE)
8022 {
8023 aarch64_layout_arg (pcum_v, arg);
8024 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
8025 != (pcum->aapcs_stack_words != 0));
8026 pcum->aapcs_arg_processed = false;
8027 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
8028 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
8029 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
8030 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
8031 pcum->aapcs_stack_words = 0;
8032 pcum->aapcs_reg = NULL_RTX;
8033 }
8034 }
8035
8036 bool
8037 aarch64_function_arg_regno_p (unsigned regno)
8038 {
8039 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
8040 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
8041 }
8042
8043 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
8044 PARM_BOUNDARY bits of alignment, but will be given anything up
8045 to STACK_BOUNDARY bits if the type requires it. This makes sure
8046 that both before and after the layout of each argument, the Next
8047 Stacked Argument Address (NSAA) will have a minimum alignment of
8048 8 bytes. */
8049
8050 static unsigned int
8051 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
8052 {
8053 unsigned int abi_break;
8054 unsigned int abi_break_packed;
8055 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
8056 &abi_break,
8057 &abi_break_packed);
8058 /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
8059 to emit warnings about ABI incompatibility. */
8060 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
8061 return alignment;
8062 }
8063
8064 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
8065
8066 static fixed_size_mode
8067 aarch64_get_reg_raw_mode (int regno)
8068 {
8069 if (TARGET_SVE && FP_REGNUM_P (regno))
8070 /* Don't use the SVE part of the register for __builtin_apply and
8071 __builtin_return. The SVE registers aren't used by the normal PCS,
8072 so using them there would be a waste of time. The PCS extensions
8073 for SVE types are fundamentally incompatible with the
8074 __builtin_return/__builtin_apply interface. */
8075 return as_a <fixed_size_mode> (V16QImode);
8076 return default_get_reg_raw_mode (regno);
8077 }
8078
8079 /* Implement TARGET_FUNCTION_ARG_PADDING.
8080
8081 Small aggregate types are placed at the lowest memory address.
8082
8083 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
8084
8085 static pad_direction
8086 aarch64_function_arg_padding (machine_mode mode, const_tree type)
8087 {
8088 /* On little-endian targets, the least significant byte of every stack
8089 argument is passed at the lowest byte address of the stack slot. */
8090 if (!BYTES_BIG_ENDIAN)
8091 return PAD_UPWARD;
8092
8093 /* Otherwise, integral, floating-point and pointer types are padded downward:
8094 the least significant byte of a stack argument is passed at the highest
8095 byte address of the stack slot. */
8096 if (type
8097 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
8098 || POINTER_TYPE_P (type))
8099 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
8100 return PAD_DOWNWARD;
8101
8102 /* Everything else padded upward, i.e. data in first byte of stack slot. */
8103 return PAD_UPWARD;
8104 }
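
/* As a concrete example of the rules above: on a big-endian target a
   4-byte int passed on the stack is padded downward, so its least
   significant byte ends up at the highest address of its slot, whereas
   a small structure falls through to PAD_UPWARD and starts at the
   lowest address of its slot.  */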
8105
8106 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
8107
8108 It specifies the padding for the last (and possibly the only)
8109 element of a block move between registers and memory. Assuming
8110 the block is in memory, padding upward means that the last
8111 element is padded after its most significant byte, while with
8112 downward padding the last element is padded at its least
8113 significant byte side.
8114
8115 Small aggregates and small complex types are always padded
8116 upwards.
8117
8118 We don't need to worry about homogeneous floating-point or
8119 short-vector aggregates; their move is not affected by the
8120 padding direction determined here. Regardless of endianness,
8121 each element of such an aggregate is put in the least
8122 significant bits of a fp/simd register.
8123
8124 Return !BYTES_BIG_ENDIAN if the least significant byte of the
8125 register has useful data, and return the opposite if the most
8126 significant byte does. */
8127
8128 bool
8129 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
8130 bool first ATTRIBUTE_UNUSED)
8131 {
8132
8133 /* Aside from pure scalable types, small composite types are always
8134 padded upward. */
8135 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
8136 {
8137 HOST_WIDE_INT size;
8138 if (type)
8139 size = int_size_in_bytes (type);
8140 else
8141 /* No frontends can create types with variable-sized modes, so we
8142 shouldn't be asked to pass or return them. */
8143 size = GET_MODE_SIZE (mode).to_constant ();
8144 if (size < 2 * UNITS_PER_WORD)
8145 {
8146 pure_scalable_type_info pst_info;
8147 if (pst_info.analyze_registers (type))
8148 return false;
8149 return true;
8150 }
8151 }
8152
8153 /* Otherwise, use the default padding. */
8154 return !BYTES_BIG_ENDIAN;
8155 }
8156
8157 static scalar_int_mode
8158 aarch64_libgcc_cmp_return_mode (void)
8159 {
8160 return SImode;
8161 }
8162
8163 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
8164
8165 /* We use the 12-bit shifted immediate arithmetic instructions so values
8166 must be a multiple of (1 << 12), i.e. 4096. */
8167 #define ARITH_FACTOR 4096
8168
8169 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
8170 #error Cannot use simple address calculation for stack probing
8171 #endif
8172
8173 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
8174 inclusive. These are offsets from the current stack pointer. */
8175
8176 static void
8177 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
8178 {
8179 HOST_WIDE_INT size;
8180 if (!poly_size.is_constant (&size))
8181 {
8182 sorry ("stack probes for SVE frames");
8183 return;
8184 }
8185
8186 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
8187
8188 /* See the same assertion on PROBE_INTERVAL above. */
8189 gcc_assert ((first % ARITH_FACTOR) == 0);
8190
8191 /* See if we have a constant small number of probes to generate. If so,
8192 that's the easy case. */
8193 if (size <= PROBE_INTERVAL)
8194 {
8195 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
8196
8197 emit_set_insn (reg1,
8198 plus_constant (Pmode,
8199 stack_pointer_rtx, -(first + base)));
8200 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
8201 }
8202
8203 /* The run-time loop is made up of 8 insns in the generic case while the
8204 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
8205 else if (size <= 4 * PROBE_INTERVAL)
8206 {
8207 HOST_WIDE_INT i, rem;
8208
8209 emit_set_insn (reg1,
8210 plus_constant (Pmode,
8211 stack_pointer_rtx,
8212 -(first + PROBE_INTERVAL)));
8213 emit_stack_probe (reg1);
8214
8215 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
8216 it exceeds SIZE. If only two probes are needed, this will not
8217 generate any code. Then probe at FIRST + SIZE. */
8218 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
8219 {
8220 emit_set_insn (reg1,
8221 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
8222 emit_stack_probe (reg1);
8223 }
8224
8225 rem = size - (i - PROBE_INTERVAL);
8226 if (rem > 256)
8227 {
8228 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
8229
8230 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
8231 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
8232 }
8233 else
8234 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
8235 }
8236
8237 /* Otherwise, do the same as above, but in a loop. Note that we must be
8238 extra careful with variables wrapping around because we might be at
8239 the very top (or the very bottom) of the address space and we have
8240 to be able to handle this case properly; in particular, we use an
8241 equality test for the loop condition. */
8242 else
8243 {
8244 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
8245
8246 /* Step 1: round SIZE to the previous multiple of the interval. */
8247
8248 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
8249
8250
8251 /* Step 2: compute initial and final value of the loop counter. */
8252
8253 /* TEST_ADDR = SP + FIRST. */
8254 emit_set_insn (reg1,
8255 plus_constant (Pmode, stack_pointer_rtx, -first));
8256
8257 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
8258 HOST_WIDE_INT adjustment = - (first + rounded_size);
8259 if (! aarch64_uimm12_shift (adjustment))
8260 {
8261 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
8262 true, Pmode);
8263 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
8264 }
8265 else
8266 emit_set_insn (reg2,
8267 plus_constant (Pmode, stack_pointer_rtx, adjustment));
8268
8269 /* Step 3: the loop
8270
8271 do
8272 {
8273 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
8274 probe at TEST_ADDR
8275 }
8276 while (TEST_ADDR != LAST_ADDR)
8277
8278 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
8279 until it is equal to ROUNDED_SIZE. */
8280
8281 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
8282
8283
8284 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
8285 that SIZE is equal to ROUNDED_SIZE. */
8286
8287 if (size != rounded_size)
8288 {
8289 HOST_WIDE_INT rem = size - rounded_size;
8290
8291 if (rem > 256)
8292 {
8293 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
8294
8295 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
8296 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
8297 }
8298 else
8299 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
8300 }
8301 }
8302
8303 /* Make sure nothing is scheduled before we are done. */
8304 emit_insn (gen_blockage ());
8305 }
8306
8307 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
8308 absolute addresses. */
8309
8310 const char *
8311 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
8312 {
8313 static int labelno = 0;
8314 char loop_lab[32];
8315 rtx xops[2];
8316
8317 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
8318
8319 /* Loop. */
8320 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
8321
8322 HOST_WIDE_INT stack_clash_probe_interval
8323 = 1 << param_stack_clash_protection_guard_size;
8324
8325 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
8326 xops[0] = reg1;
8327 HOST_WIDE_INT interval;
8328 if (flag_stack_clash_protection)
8329 interval = stack_clash_probe_interval;
8330 else
8331 interval = PROBE_INTERVAL;
8332
8333 gcc_assert (aarch64_uimm12_shift (interval));
8334 xops[1] = GEN_INT (interval);
8335
8336 output_asm_insn ("sub\t%0, %0, %1", xops);
8337
8338 /* If doing stack clash protection then we probe up by the ABI-specified
8339 amount. We do this because we're dropping full pages at a time in the
8340 loop. But if we're doing non-stack clash probing, probe at SP 0. */
8341 if (flag_stack_clash_protection)
8342 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
8343 else
8344 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
8345
8346 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
8347 by this amount for each iteration. */
8348 output_asm_insn ("str\txzr, [%0, %1]", xops);
8349
8350 /* Test if TEST_ADDR == LAST_ADDR. */
8351 xops[1] = reg2;
8352 output_asm_insn ("cmp\t%0, %1", xops);
8353
8354 /* Branch. */
8355 fputs ("\tb.ne\t", asm_out_file);
8356 assemble_name_raw (asm_out_file, loop_lab);
8357 fputc ('\n', asm_out_file);
8358
8359 return "";
8360 }
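
/* With the default 4 KiB probe interval and x9/x10 standing in for the
   two probe registers (illustrative choices), the loop printed above
   looks roughly like:

   .LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0

   with the store offset becoming STACK_CLASH_CALLER_GUARD when stack
   clash protection is enabled.  */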
8361
8362 /* Emit the probe loop for doing stack clash probes and stack adjustments for
8363 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
8364 of GUARD_SIZE. When a probe is emitted it is done at most
8365 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
8366 at most MIN_PROBE_THRESHOLD. By the end of this function
8367 BASE = BASE - ADJUSTMENT. */
8368
8369 const char *
8370 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
8371 rtx min_probe_threshold, rtx guard_size)
8372 {
8373 /* This function is not allowed to use any instruction generation function
8374 like gen_ and friends. If you do you'll likely ICE during CFG validation,
8375 so instead emit the code you want using output_asm_insn. */
8376 gcc_assert (flag_stack_clash_protection);
8377 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
8378 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
8379
8380 /* The minimum required allocation before the residual requires probing. */
8381 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
8382
8383 /* Clamp the value down to the nearest value that can be used with a cmp. */
8384 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
8385 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
8386
8387 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
8388 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
8389
8390 static int labelno = 0;
8391 char loop_start_lab[32];
8392 char loop_end_lab[32];
8393 rtx xops[2];
8394
8395 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
8396 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
8397
8398 /* Emit loop start label. */
8399 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
8400
8401 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
8402 xops[0] = adjustment;
8403 xops[1] = probe_offset_value_rtx;
8404 output_asm_insn ("cmp\t%0, %1", xops);
8405
8406 /* Branch to end if not enough adjustment to probe. */
8407 fputs ("\tb.lt\t", asm_out_file);
8408 assemble_name_raw (asm_out_file, loop_end_lab);
8409 fputc ('\n', asm_out_file);
8410
8411 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
8412 xops[0] = base;
8413 xops[1] = probe_offset_value_rtx;
8414 output_asm_insn ("sub\t%0, %0, %1", xops);
8415
8416 /* Probe at BASE. */
8417 xops[1] = const0_rtx;
8418 output_asm_insn ("str\txzr, [%0, %1]", xops);
8419
8420 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
8421 xops[0] = adjustment;
8422 xops[1] = probe_offset_value_rtx;
8423 output_asm_insn ("sub\t%0, %0, %1", xops);
8424
8425 /* Branch to start if still more bytes to allocate. */
8426 fputs ("\tb\t", asm_out_file);
8427 assemble_name_raw (asm_out_file, loop_start_lab);
8428 fputc ('\n', asm_out_file);
8429
8430 /* No probe is needed for the remaining adjustment: leave the loop. */
8431 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
8432
8433 /* BASE = BASE - ADJUSTMENT. */
8434 xops[0] = base;
8435 xops[1] = adjustment;
8436 output_asm_insn ("sub\t%0, %0, %1", xops);
8437 return "";
8438 }
8439
8440 /* Determine whether a frame chain needs to be generated. */
8441 static bool
8442 aarch64_needs_frame_chain (void)
8443 {
8444 /* Force a frame chain for EH returns so the return address is at FP+8. */
8445 if (frame_pointer_needed || crtl->calls_eh_return)
8446 return true;
8447
8448 /* A leaf function cannot have calls or write LR. */
8449 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
8450
8451 /* Don't use a frame chain in leaf functions if leaf frame pointers
8452 are disabled. */
8453 if (flag_omit_leaf_frame_pointer && is_leaf)
8454 return false;
8455
8456 return aarch64_use_frame_pointer;
8457 }
8458
8459 /* Mark the registers that need to be saved by the callee and calculate
8460 the size of the callee-saved registers area and frame record (both FP
8461 and LR may be omitted). */
8462 static void
8463 aarch64_layout_frame (void)
8464 {
8465 poly_int64 offset = 0;
8466 int regno, last_fp_reg = INVALID_REGNUM;
8467 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
8468 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
8469 bool frame_related_fp_reg_p = false;
8470 aarch64_frame &frame = cfun->machine->frame;
8471
8472 frame.emit_frame_chain = aarch64_needs_frame_chain ();
8473
8474 /* Adjust the outgoing arguments size if required. Keep it in sync with what
8475 the mid-end is doing. */
8476 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
8477
8478 #define SLOT_NOT_REQUIRED (-2)
8479 #define SLOT_REQUIRED (-1)
8480
8481 frame.wb_push_candidate1 = INVALID_REGNUM;
8482 frame.wb_push_candidate2 = INVALID_REGNUM;
8483 frame.spare_pred_reg = INVALID_REGNUM;
8484
8485 /* First mark all the registers that really need to be saved... */
8486 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8487 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
8488
8489 /* ... that includes the eh data registers (if needed)... */
8490 if (crtl->calls_eh_return)
8491 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
8492 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
8493
8494 /* ... and any callee saved register that dataflow says is live. */
8495 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8496 if (df_regs_ever_live_p (regno)
8497 && !fixed_regs[regno]
8498 && (regno == R30_REGNUM
8499 || !crtl->abi->clobbers_full_reg_p (regno)))
8500 frame.reg_offset[regno] = SLOT_REQUIRED;
8501
8502 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8503 if (df_regs_ever_live_p (regno)
8504 && !fixed_regs[regno]
8505 && !crtl->abi->clobbers_full_reg_p (regno))
8506 {
8507 frame.reg_offset[regno] = SLOT_REQUIRED;
8508 last_fp_reg = regno;
8509 if (aarch64_emit_cfi_for_reg_p (regno))
8510 frame_related_fp_reg_p = true;
8511 }
8512
8513 /* Big-endian SVE frames need a spare predicate register in order
8514 to save Z8-Z15. Decide which register they should use. Prefer
8515 an unused argument register if possible, so that we don't force P4
8516 to be saved unnecessarily. */
8517 if (frame_related_fp_reg_p
8518 && crtl->abi->id () == ARM_PCS_SVE
8519 && BYTES_BIG_ENDIAN)
8520 {
8521 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8522 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
8523 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
8524 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
8525 break;
8526 gcc_assert (regno <= P7_REGNUM);
8527 frame.spare_pred_reg = regno;
8528 df_set_regs_ever_live (regno, true);
8529 }
8530
8531 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8532 if (df_regs_ever_live_p (regno)
8533 && !fixed_regs[regno]
8534 && !crtl->abi->clobbers_full_reg_p (regno))
8535 frame.reg_offset[regno] = SLOT_REQUIRED;
8536
8537 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
8538 LR counts as an implicit probe which allows us to maintain the invariant
8539 described in the comment at expand_prologue. */
8540 gcc_assert (crtl->is_leaf
8541 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
8542
8543 /* Now assign stack slots for the registers. Start with the predicate
8544 registers, since predicate LDR and STR have a relatively small
8545 offset range. These saves happen below the hard frame pointer. */
8546 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8547 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8548 {
8549 frame.reg_offset[regno] = offset;
8550 offset += BYTES_PER_SVE_PRED;
8551 }
8552
8553 if (maybe_ne (offset, 0))
8554 {
8555 /* If we have any vector registers to save above the predicate registers,
8556 the offset of the vector register save slots needs to be a multiple
8557 of the vector size. This lets us use the immediate forms of LDR/STR
8558 (or LD1/ST1 for big-endian).
8559
8560 A vector register is 8 times the size of a predicate register,
8561 and we need to save a maximum of 12 predicate registers, so the
8562 first vector register will be at either #1, MUL VL or #2, MUL VL.
8563
8564 If we don't have any vector registers to save, and we know how
8565 big the predicate save area is, we can just round it up to the
8566 next 16-byte boundary. */
8567 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
8568 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8569 else
8570 {
8571 if (known_le (offset, vector_save_size))
8572 offset = vector_save_size;
8573 else if (known_le (offset, vector_save_size * 2))
8574 offset = vector_save_size * 2;
8575 else
8576 gcc_unreachable ();
8577 }
8578 }
8579
8580 /* If we need to save any SVE vector registers, add them next. */
8581 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
8582 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8583 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8584 {
8585 frame.reg_offset[regno] = offset;
8586 offset += vector_save_size;
8587 }
8588
8589 /* OFFSET is now the offset of the hard frame pointer from the bottom
8590 of the callee save area. */
8591 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
8592 frame.below_hard_fp_saved_regs_size = offset;
8593 if (frame.emit_frame_chain)
8594 {
8595 /* FP and LR are placed in the linkage record. */
8596 frame.reg_offset[R29_REGNUM] = offset;
8597 frame.wb_push_candidate1 = R29_REGNUM;
8598 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
8599 frame.wb_push_candidate2 = R30_REGNUM;
8600 offset += 2 * UNITS_PER_WORD;
8601 }
8602
8603 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8604 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8605 {
8606 frame.reg_offset[regno] = offset;
8607 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8608 frame.wb_push_candidate1 = regno;
8609 else if (frame.wb_push_candidate2 == INVALID_REGNUM)
8610 frame.wb_push_candidate2 = regno;
8611 offset += UNITS_PER_WORD;
8612 }
8613
8614 poly_int64 max_int_offset = offset;
8615 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8616 bool has_align_gap = maybe_ne (offset, max_int_offset);
8617
8618 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8619 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8620 {
8621 /* If there is an alignment gap between integer and fp callee-saves,
8622 allocate the last fp register to it if possible. */
8623 if (regno == last_fp_reg
8624 && has_align_gap
8625 && known_eq (vector_save_size, 8)
8626 && multiple_p (offset, 16))
8627 {
8628 frame.reg_offset[regno] = max_int_offset;
8629 break;
8630 }
8631
8632 frame.reg_offset[regno] = offset;
8633 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8634 frame.wb_push_candidate1 = regno;
8635 else if (frame.wb_push_candidate2 == INVALID_REGNUM
8636 && frame.wb_push_candidate1 >= V0_REGNUM)
8637 frame.wb_push_candidate2 = regno;
8638 offset += vector_save_size;
8639 }
8640
8641 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8642
8643 frame.saved_regs_size = offset;
8644
8645 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
8646
8647 poly_int64 above_outgoing_args
8648 = aligned_upper_bound (varargs_and_saved_regs_size
8649 + get_frame_size (),
8650 STACK_BOUNDARY / BITS_PER_UNIT);
8651
8652 frame.hard_fp_offset
8653 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
8654
8655 /* Both these values are already aligned. */
8656 gcc_assert (multiple_p (crtl->outgoing_args_size,
8657 STACK_BOUNDARY / BITS_PER_UNIT));
8658 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
8659
8660 frame.locals_offset = frame.saved_varargs_size;
8661
8662 frame.initial_adjust = 0;
8663 frame.final_adjust = 0;
8664 frame.callee_adjust = 0;
8665 frame.sve_callee_adjust = 0;
8666 frame.callee_offset = 0;
8667
8668 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8669 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8670
8671 /* Shadow call stack is only used for functions that push LR onto the
8672 stack and that do not specify the "no_sanitize" attribute with the
8673 argument "shadow-call-stack". */
8674 frame.is_scs_enabled
8675 = (!crtl->calls_eh_return
8676 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8677 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));
8678
8679 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8680 restore x30, and we don't need to pop x30 again in the traditional
8681 way. Pop candidates record the registers that need to be popped
8682 eventually. */
8683 if (frame.is_scs_enabled)
8684 {
8685 if (frame.wb_pop_candidate2 == R30_REGNUM)
8686 frame.wb_pop_candidate2 = INVALID_REGNUM;
8687 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8688 frame.wb_pop_candidate1 = INVALID_REGNUM;
8689 }
8690
8691 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8692 256 to ensure that the offset meets the requirements of emit_move_insn.
8693 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8694 max_push_offset to 0, because no registers are popped at this time,
8695 so callee_adjust cannot be adjusted. */
8696 HOST_WIDE_INT max_push_offset = 0;
8697 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8698 max_push_offset = 512;
8699 else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8700 max_push_offset = 256;
8701
8702 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
8703 HOST_WIDE_INT const_saved_regs_size;
8704 if (frame.frame_size.is_constant (&const_size)
8705 && const_size < max_push_offset
8706 && known_eq (frame.hard_fp_offset, const_size))
8707 {
8708 /* Simple, small frame with no outgoing arguments:
8709
8710 stp reg1, reg2, [sp, -frame_size]!
8711 stp reg3, reg4, [sp, 16] */
8712 frame.callee_adjust = const_size;
8713 }
8714 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
8715 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
8716 && const_outgoing_args_size + const_saved_regs_size < 512
8717 /* We could handle this case even with outgoing args, provided
8718 that the number of args left us with valid offsets for all
8719 predicate and vector save slots. It's such a rare case that
8720 it hardly seems worth the effort though. */
8721 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
8722 && !(cfun->calls_alloca
8723 && frame.hard_fp_offset.is_constant (&const_fp_offset)
8724 && const_fp_offset < max_push_offset))
8725 {
8726 /* Frame with small outgoing arguments:
8727
8728 sub sp, sp, frame_size
8729 stp reg1, reg2, [sp, outgoing_args_size]
8730 stp reg3, reg4, [sp, outgoing_args_size + 16] */
8731 frame.initial_adjust = frame.frame_size;
8732 frame.callee_offset = const_outgoing_args_size;
8733 }
8734 else if (saves_below_hard_fp_p
8735 && known_eq (frame.saved_regs_size,
8736 frame.below_hard_fp_saved_regs_size))
8737 {
8738 /* Frame in which all saves are SVE saves:
8739
8740 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
8741 save SVE registers relative to SP
8742 sub sp, sp, outgoing_args_size */
8743 frame.initial_adjust = (frame.hard_fp_offset
8744 + frame.below_hard_fp_saved_regs_size);
8745 frame.final_adjust = crtl->outgoing_args_size;
8746 }
8747 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
8748 && const_fp_offset < max_push_offset)
8749 {
8750 /* Frame with large outgoing arguments or SVE saves, but with
8751 a small local area:
8752
8753 stp reg1, reg2, [sp, -hard_fp_offset]!
8754 stp reg3, reg4, [sp, 16]
8755 [sub sp, sp, below_hard_fp_saved_regs_size]
8756 [save SVE registers relative to SP]
8757 sub sp, sp, outgoing_args_size */
8758 frame.callee_adjust = const_fp_offset;
8759 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8760 frame.final_adjust = crtl->outgoing_args_size;
8761 }
8762 else
8763 {
8764 /* Frame with large local area and outgoing arguments or SVE saves,
8765 using frame pointer:
8766
8767 sub sp, sp, hard_fp_offset
8768 stp x29, x30, [sp, 0]
8769 add x29, sp, 0
8770 stp reg3, reg4, [sp, 16]
8771 [sub sp, sp, below_hard_fp_saved_regs_size]
8772 [save SVE registers relative to SP]
8773 sub sp, sp, outgoing_args_size */
8774 frame.initial_adjust = frame.hard_fp_offset;
8775 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8776 frame.final_adjust = crtl->outgoing_args_size;
8777 }
8778
8779 /* Make sure the individual adjustments add up to the full frame size. */
8780 gcc_assert (known_eq (frame.initial_adjust
8781 + frame.callee_adjust
8782 + frame.sve_callee_adjust
8783 + frame.final_adjust, frame.frame_size));
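/* An illustrative check of the invariant above (values are hypothetical):
   in the "frame with small outgoing arguments" case with frame_size == 112,
   saved_regs_size == 48 and outgoing_args_size == 32, we get
   initial_adjust == 112 and callee_adjust == sve_callee_adjust
   == final_adjust == 0, so the four adjustments sum to frame_size.  */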
8784
8785 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
8786 {
8787 /* We've decided not to associate any register saves with the initial
8788 stack allocation. */
8789 frame.wb_pop_candidate1 = frame.wb_push_candidate1 = INVALID_REGNUM;
8790 frame.wb_pop_candidate2 = frame.wb_push_candidate2 = INVALID_REGNUM;
8791 }
8792
8793 frame.laid_out = true;
8794 }
8795
8796 /* Return true if the register REGNO is saved on entry to
8797 the current function. */
8798
8799 static bool
8800 aarch64_register_saved_on_entry (int regno)
8801 {
8802 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8803 }
8804
8805 /* Return the next register from REGNO up to and including LIMIT that the
8806 callee needs to save. */
8807
8808 static unsigned
8809 aarch64_next_callee_save (unsigned regno, unsigned limit)
8810 {
8811 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
8812 regno ++;
8813 return regno;
8814 }
8815
8816 /* Push the register numbered REGNO of mode MODE to the stack with write-back
8817 adjusting the stack by ADJUSTMENT. */
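/* A minimal illustration (register and adjustment are hypothetical): for
   DImode, REGNO == x30 and ADJUSTMENT == 16, the PRE_MODIFY address built
   below corresponds to a single pre-indexed push, roughly
   "str x30, [sp, -16]!", which both stores the register and decrements
   the stack pointer.  */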
8818
8819 static void
8820 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8821 HOST_WIDE_INT adjustment)
8822 {
8823 rtx base_rtx = stack_pointer_rtx;
8824 rtx insn, reg, mem;
8825
8826 reg = gen_rtx_REG (mode, regno);
8827 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8828 plus_constant (Pmode, base_rtx, -adjustment));
8829 mem = gen_frame_mem (mode, mem);
8830
8831 insn = emit_move_insn (mem, reg);
8832 RTX_FRAME_RELATED_P (insn) = 1;
8833 }
8834
8835 /* Generate and return an instruction to store the pair of registers
8836 REG and REG2 of mode MODE to location BASE with write-back adjusting
8837 the stack location BASE by ADJUSTMENT. */
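/* Illustrative example (hypothetical operands): for E_DImode with
   REG == x29, REG2 == x30 and ADJUSTMENT == 96, the pattern generated
   below corresponds to "stp x29, x30, [sp, -96]!": REG is stored at
   BASE - ADJUSTMENT, REG2 at BASE - ADJUSTMENT + UNITS_PER_WORD, and
   BASE is decremented by ADJUSTMENT.  */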
8838
8839 static rtx
8840 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8841 HOST_WIDE_INT adjustment)
8842 {
8843 switch (mode)
8844 {
8845 case E_DImode:
8846 return gen_storewb_pairdi_di (base, base, reg, reg2,
8847 GEN_INT (-adjustment),
8848 GEN_INT (UNITS_PER_WORD - adjustment));
8849 case E_DFmode:
8850 return gen_storewb_pairdf_di (base, base, reg, reg2,
8851 GEN_INT (-adjustment),
8852 GEN_INT (UNITS_PER_WORD - adjustment));
8853 case E_TFmode:
8854 return gen_storewb_pairtf_di (base, base, reg, reg2,
8855 GEN_INT (-adjustment),
8856 GEN_INT (UNITS_PER_VREG - adjustment));
8857 default:
8858 gcc_unreachable ();
8859 }
8860 }
8861
8862 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8863 stack pointer by ADJUSTMENT. */
8864
8865 static void
8866 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8867 {
8868 rtx_insn *insn;
8869 machine_mode mode = aarch64_reg_save_mode (regno1);
8870
8871 if (regno2 == INVALID_REGNUM)
8872 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8873
8874 rtx reg1 = gen_rtx_REG (mode, regno1);
8875 rtx reg2 = gen_rtx_REG (mode, regno2);
8876
8877 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8878 reg2, adjustment));
8879 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8880 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8881 RTX_FRAME_RELATED_P (insn) = 1;
8882 }
8883
8884 /* Generate and return an instruction to load the pair of registers REG and
8885 REG2 of mode MODE from stack location BASE, adjusting BASE by ADJUSTMENT
8886 afterwards. */
8886
8887 static rtx
8888 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8889 HOST_WIDE_INT adjustment)
8890 {
8891 switch (mode)
8892 {
8893 case E_DImode:
8894 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
8895 GEN_INT (UNITS_PER_WORD));
8896 case E_DFmode:
8897 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
8898 GEN_INT (UNITS_PER_WORD));
8899 case E_TFmode:
8900 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
8901 GEN_INT (UNITS_PER_VREG));
8902 default:
8903 gcc_unreachable ();
8904 }
8905 }
8906
8907 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8908 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8909 into CFI_OPS. */
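/* A sketch of the two cases handled below (register numbers are
   illustrative): a single pop becomes a post-indexed load such as
   "ldr x19, [sp], 16", while a pair becomes "ldp x19, x20, [sp], 32";
   in both cases a REG_CFA_RESTORE note is queued in CFI_OPS for each
   restored register.  */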
8910
8911 static void
8912 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8913 rtx *cfi_ops)
8914 {
8915 machine_mode mode = aarch64_reg_save_mode (regno1);
8916 rtx reg1 = gen_rtx_REG (mode, regno1);
8917
8918 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8919
8920 if (regno2 == INVALID_REGNUM)
8921 {
8922 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8923 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8924 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8925 }
8926 else
8927 {
8928 rtx reg2 = gen_rtx_REG (mode, regno2);
8929 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8930 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8931 reg2, adjustment));
8932 }
8933 }
8934
8935 /* Generate and return a store pair instruction of mode MODE to store
8936 register REG1 to MEM1 and register REG2 to MEM2. */
8937
8938 static rtx
8939 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
8940 rtx reg2)
8941 {
8942 switch (mode)
8943 {
8944 case E_DImode:
8945 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
8946
8947 case E_DFmode:
8948 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
8949
8950 case E_TFmode:
8951 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
8952
8953 case E_V4SImode:
8954 return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
8955
8956 case E_V16QImode:
8957 return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
8958
8959 default:
8960 gcc_unreachable ();
8961 }
8962 }
8963
8964 /* Generate and return a load pair instruction of mode MODE to load register
8965 REG1 from MEM1 and register REG2 from MEM2. */
8966
8967 static rtx
8968 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
8969 rtx mem2)
8970 {
8971 switch (mode)
8972 {
8973 case E_DImode:
8974 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
8975
8976 case E_DFmode:
8977 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
8978
8979 case E_TFmode:
8980 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
8981
8982 case E_V4SImode:
8983 return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
8984
8985 default:
8986 gcc_unreachable ();
8987 }
8988 }
8989
8990 /* Return TRUE if return address signing should be enabled for the current
8991 function, otherwise return FALSE. */
8992
8993 bool
8994 aarch64_return_address_signing_enabled (void)
8995 {
8996 /* This function should only be called after the frame has been laid out. */
8997 gcc_assert (cfun->machine->frame.laid_out);
8998
8999 /* Turn return address signing off in any function that uses
9000 __builtin_eh_return. The address passed to __builtin_eh_return
9001 is not signed so either it has to be signed (with original sp)
9002 or the code path that uses it has to avoid authenticating it.
9003 Currently eh return introduces a return to anywhere gadget, no
9004 matter what we do here since it uses ret with user provided
9005 address. An ideal fix for that is to use indirect branch which
9006 can be protected with BTI j (to some extent). */
9007 if (crtl->calls_eh_return)
9008 return false;
9009
9010 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
9011 if its LR is pushed onto stack. */
9012 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
9013 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
9014 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
9015 }
9016
9017 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
9018 bool
9019 aarch64_bti_enabled (void)
9020 {
9021 return (aarch64_enable_bti == 1);
9022 }
9023
9024 /* The caller is going to use ST1D or LD1D to save or restore an SVE
9025 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
9026 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
9027
9028 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
9029 or LD1D address
9030
9031 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
9032 if the variable isn't already nonnull
9033
9034 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
9035 Handle this case using a temporary base register that is suitable for
9036 all offsets in that range. Use ANCHOR_REG as this base register if it
9037 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
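/* Worked example (values illustrative): ST1D/LD1D accept immediate offsets
   in the signed 4-bit scaled range [-8, 7] * GET_MODE_SIZE (MODE) (see
   offset_4bit_signed_scaled_p below).  If OFFSET is 10 * GET_MODE_SIZE (MODE),
   the code below sets ANCHOR_REG = BASE_RTX + 16 * GET_MODE_SIZE (MODE) and
   rewrites the access as ANCHOR_REG - 6 * GET_MODE_SIZE (MODE), which is
   back in range.  */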
9038
9039 static inline void
9040 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
9041 rtx &anchor_reg, poly_int64 &offset,
9042 rtx &ptrue)
9043 {
9044 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
9045 {
9046 /* This is the maximum valid offset of the anchor from the base.
9047 Lower values would be valid too. */
9048 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
9049 if (!anchor_reg)
9050 {
9051 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9052 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
9053 gen_int_mode (anchor_offset, Pmode)));
9054 }
9055 base_rtx = anchor_reg;
9056 offset -= anchor_offset;
9057 }
9058 if (!ptrue)
9059 {
9060 int pred_reg = cfun->machine->frame.spare_pred_reg;
9061 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
9062 CONSTM1_RTX (VNx16BImode));
9063 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
9064 }
9065 }
9066
9067 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
9068 is saved at BASE + OFFSET. */
9069
9070 static void
9071 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
9072 rtx base, poly_int64 offset)
9073 {
9074 rtx mem = gen_frame_mem (GET_MODE (reg),
9075 plus_constant (Pmode, base, offset));
9076 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
9077 }
9078
9079 /* Emit code to save the callee-saved registers from register number START
9080 to LIMIT to the stack at the location starting at offset START_OFFSET,
9081 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
9082 is true if the hard frame pointer has been set up. */
9083
9084 static void
9085 aarch64_save_callee_saves (poly_int64 start_offset,
9086 unsigned start, unsigned limit, bool skip_wb,
9087 bool hard_fp_valid_p)
9088 {
9089 rtx_insn *insn;
9090 unsigned regno;
9091 unsigned regno2;
9092 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
9093
9094 for (regno = aarch64_next_callee_save (start, limit);
9095 regno <= limit;
9096 regno = aarch64_next_callee_save (regno + 1, limit))
9097 {
9098 rtx reg, mem;
9099 poly_int64 offset;
9100 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9101
9102 if (skip_wb
9103 && (regno == cfun->machine->frame.wb_push_candidate1
9104 || regno == cfun->machine->frame.wb_push_candidate2))
9105 continue;
9106
9107 if (cfun->machine->reg_is_wrapped_separately[regno])
9108 continue;
9109
9110 machine_mode mode = aarch64_reg_save_mode (regno);
9111 reg = gen_rtx_REG (mode, regno);
9112 offset = start_offset + cfun->machine->frame.reg_offset[regno];
9113 rtx base_rtx = stack_pointer_rtx;
9114 poly_int64 sp_offset = offset;
9115
9116 HOST_WIDE_INT const_offset;
9117 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9118 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
9119 offset, ptrue);
9120 else if (GP_REGNUM_P (regno)
9121 && (!offset.is_constant (&const_offset) || const_offset >= 512))
9122 {
9123 gcc_assert (known_eq (start_offset, 0));
9124 poly_int64 fp_offset
9125 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9126 if (hard_fp_valid_p)
9127 base_rtx = hard_frame_pointer_rtx;
9128 else
9129 {
9130 if (!anchor_reg)
9131 {
9132 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9133 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
9134 gen_int_mode (fp_offset, Pmode)));
9135 }
9136 base_rtx = anchor_reg;
9137 }
9138 offset -= fp_offset;
9139 }
9140 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9141 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
9142
9143 if (!aarch64_sve_mode_p (mode)
9144 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
9145 && !cfun->machine->reg_is_wrapped_separately[regno2]
9146 && known_eq (GET_MODE_SIZE (mode),
9147 cfun->machine->frame.reg_offset[regno2]
9148 - cfun->machine->frame.reg_offset[regno]))
9149 {
9150 rtx reg2 = gen_rtx_REG (mode, regno2);
9151 rtx mem2;
9152
9153 offset += GET_MODE_SIZE (mode);
9154 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9155 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
9156 reg2));
9157
9158 /* The first part of a frame-related parallel insn is
9159 always assumed to be relevant to the frame
9160 calculations; subsequent parts are only
9161 frame-related if explicitly marked. */
9162 if (aarch64_emit_cfi_for_reg_p (regno2))
9163 {
9164 if (need_cfa_note_p)
9165 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
9166 sp_offset + GET_MODE_SIZE (mode));
9167 else
9168 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
9169 }
9170
9171 regno = regno2;
9172 }
9173 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9174 {
9175 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
9176 need_cfa_note_p = true;
9177 }
9178 else if (aarch64_sve_mode_p (mode))
9179 insn = emit_insn (gen_rtx_SET (mem, reg));
9180 else
9181 insn = emit_move_insn (mem, reg);
9182
9183 RTX_FRAME_RELATED_P (insn) = frame_related_p;
9184 if (frame_related_p && need_cfa_note_p)
9185 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
9186 }
9187 }
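/* As an illustration of the pairing logic in the function above
   (hypothetical registers and offsets): if x19 and x20 are both saved and
   their slots are 8 bytes apart, the two stores are merged into a single
   "stp x19, x20, [sp, <offset>]"; otherwise each register gets its own
   store.  */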
9188
9189 /* Emit code to restore the callee registers from register number START
9190 up to and including LIMIT. Restore from the stack offset START_OFFSET,
9191 skipping any write-back candidates if SKIP_WB is true. Write the
9192 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
9193
9194 static void
9195 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
9196 unsigned limit, bool skip_wb, rtx *cfi_ops)
9197 {
9198 unsigned regno;
9199 unsigned regno2;
9200 poly_int64 offset;
9201 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
9202
9203 for (regno = aarch64_next_callee_save (start, limit);
9204 regno <= limit;
9205 regno = aarch64_next_callee_save (regno + 1, limit))
9206 {
9207 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9208 if (cfun->machine->reg_is_wrapped_separately[regno])
9209 continue;
9210
9211 rtx reg, mem;
9212
9213 if (skip_wb
9214 && (regno == cfun->machine->frame.wb_pop_candidate1
9215 || regno == cfun->machine->frame.wb_pop_candidate2))
9216 continue;
9217
9218 machine_mode mode = aarch64_reg_save_mode (regno);
9219 reg = gen_rtx_REG (mode, regno);
9220 offset = start_offset + cfun->machine->frame.reg_offset[regno];
9221 rtx base_rtx = stack_pointer_rtx;
9222 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9223 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
9224 offset, ptrue);
9225 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9226
9227 if (!aarch64_sve_mode_p (mode)
9228 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
9229 && !cfun->machine->reg_is_wrapped_separately[regno2]
9230 && known_eq (GET_MODE_SIZE (mode),
9231 cfun->machine->frame.reg_offset[regno2]
9232 - cfun->machine->frame.reg_offset[regno]))
9233 {
9234 rtx reg2 = gen_rtx_REG (mode, regno2);
9235 rtx mem2;
9236
9237 offset += GET_MODE_SIZE (mode);
9238 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9239 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9240
9241 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
9242 regno = regno2;
9243 }
9244 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9245 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
9246 else if (aarch64_sve_mode_p (mode))
9247 emit_insn (gen_rtx_SET (reg, mem));
9248 else
9249 emit_move_insn (reg, mem);
9250 if (frame_related_p)
9251 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
9252 }
9253 }
9254
9255 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
9256 of MODE. */
9257
9258 static inline bool
9259 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9260 {
9261 HOST_WIDE_INT multiple;
9262 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9263 && IN_RANGE (multiple, -8, 7));
9264 }
9265
9266 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
9267 of MODE. */
9268
9269 static inline bool
9270 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9271 {
9272 HOST_WIDE_INT multiple;
9273 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9274 && IN_RANGE (multiple, -32, 31));
9275 }
9276
9277 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
9278 of MODE. */
9279
9280 static inline bool
9281 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9282 {
9283 HOST_WIDE_INT multiple;
9284 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9285 && IN_RANGE (multiple, 0, 63));
9286 }
9287
9288 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
9289 of MODE. */
9290
9291 bool
9292 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9293 {
9294 HOST_WIDE_INT multiple;
9295 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9296 && IN_RANGE (multiple, -64, 63));
9297 }
9298
9299 /* Return true if OFFSET is a signed 9-bit value. */
9300
9301 bool
9302 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
9303 poly_int64 offset)
9304 {
9305 HOST_WIDE_INT const_offset;
9306 return (offset.is_constant (&const_offset)
9307 && IN_RANGE (const_offset, -256, 255));
9308 }
9309
9310 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
9311 of MODE. */
9312
9313 static inline bool
9314 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9315 {
9316 HOST_WIDE_INT multiple;
9317 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9318 && IN_RANGE (multiple, -256, 255));
9319 }
9320
9321 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
9322 of MODE. */
9323
9324 static inline bool
9325 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9326 {
9327 HOST_WIDE_INT multiple;
9328 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9329 && IN_RANGE (multiple, 0, 4095));
9330 }
9331
9332 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
9333
9334 static sbitmap
9335 aarch64_get_separate_components (void)
9336 {
9337 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9338 bitmap_clear (components);
9339
9340 /* The registers we need saved to the frame. */
9341 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9342 if (aarch64_register_saved_on_entry (regno))
9343 {
9344 /* Punt on saves and restores that use ST1D and LD1D. We could
9345 try to be smarter, but it would involve making sure that the
9346 spare predicate register itself is safe to use at the save
9347 and restore points. Also, when a frame pointer is being used,
9348 the slots are often out of reach of ST1D and LD1D anyway. */
9349 machine_mode mode = aarch64_reg_save_mode (regno);
9350 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9351 continue;
9352
9353 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9354
9355 /* If the register is saved in the first SVE save slot, we use
9356 it as a stack probe for -fstack-clash-protection. */
9357 if (flag_stack_clash_protection
9358 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
9359 && known_eq (offset, 0))
9360 continue;
9361
9362 /* Get the offset relative to the register we'll use. */
9363 if (frame_pointer_needed)
9364 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9365 else
9366 offset += crtl->outgoing_args_size;
9367
9368 /* Check that we can access the stack slot of the register with one
9369 direct load with no adjustments needed. */
9370 if (aarch64_sve_mode_p (mode)
9371 ? offset_9bit_signed_scaled_p (mode, offset)
9372 : offset_12bit_unsigned_scaled_p (mode, offset))
9373 bitmap_set_bit (components, regno);
9374 }
9375
9376 /* Don't mess with the hard frame pointer. */
9377 if (frame_pointer_needed)
9378 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
9379
9380 /* If the spare predicate register used by big-endian SVE code
9381 is call-preserved, it must be saved in the main prologue
9382 before any saves that use it. */
9383 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
9384 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
9385
9386 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9387 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
9388 /* If registers have been chosen to be stored/restored with
9389 writeback don't interfere with them to avoid having to output explicit
9390 stack adjustment instructions. */
9391 if (reg2 != INVALID_REGNUM)
9392 bitmap_clear_bit (components, reg2);
9393 if (reg1 != INVALID_REGNUM)
9394 bitmap_clear_bit (components, reg1);
9395
9396 bitmap_clear_bit (components, LR_REGNUM);
9397 bitmap_clear_bit (components, SP_REGNUM);
9398
9399 return components;
9400 }
9401
9402 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
9403
9404 static sbitmap
9405 aarch64_components_for_bb (basic_block bb)
9406 {
9407 bitmap in = DF_LIVE_IN (bb);
9408 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
9409 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
9410
9411 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9412 bitmap_clear (components);
9413
9414 /* Clobbered registers don't generate values in any meaningful sense,
9415 since nothing after the clobber can rely on their value. And we can't
9416 say that partially-clobbered registers are unconditionally killed,
9417 because whether they're killed or not depends on the mode of the
9418 value they're holding. Thus partially call-clobbered registers
9419 appear in neither the kill set nor the gen set.
9420
9421 Check manually for any calls that clobber more of a register than the
9422 current function can. */
9423 function_abi_aggregator callee_abis;
9424 rtx_insn *insn;
9425 FOR_BB_INSNS (bb, insn)
9426 if (CALL_P (insn))
9427 callee_abis.note_callee_abi (insn_callee_abi (insn));
9428 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
9429
9430 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
9431 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9432 if (!fixed_regs[regno]
9433 && !crtl->abi->clobbers_full_reg_p (regno)
9434 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
9435 || bitmap_bit_p (in, regno)
9436 || bitmap_bit_p (gen, regno)
9437 || bitmap_bit_p (kill, regno)))
9438 {
9439 bitmap_set_bit (components, regno);
9440
9441 /* If there is a callee-save at an adjacent offset, add it too
9442 to increase the use of LDP/STP. */
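/* A sketch with hypothetical numbers: a slot at a 16-byte-aligned
   offset pairs with the next register at offset + 8, while a slot at
   offset + 8 pairs with the previous register at the aligned offset,
   so that either save can be combined into one LDP/STP.  */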
9443 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9444 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
9445
9446 if (regno2 <= LAST_SAVED_REGNUM)
9447 {
9448 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9449 if (regno < regno2
9450 ? known_eq (offset + 8, offset2)
9451 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
9452 bitmap_set_bit (components, regno2);
9453 }
9454 }
9455
9456 return components;
9457 }
9458
9459 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
9460 Nothing to do for aarch64. */
9461
9462 static void
9463 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
9464 {
9465 }
9466
9467 /* Return the next set bit in BMP from START onwards. Return the total number
9468 of bits in BMP if no set bit is found at or after START. */
9469
9470 static unsigned int
9471 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
9472 {
9473 unsigned int nbits = SBITMAP_SIZE (bmp);
9474 if (start == nbits)
9475 return start;
9476
9477 gcc_assert (start < nbits);
9478 for (unsigned int i = start; i < nbits; i++)
9479 if (bitmap_bit_p (bmp, i))
9480 return i;
9481
9482 return nbits;
9483 }
9484
9485 /* Do the work for aarch64_emit_prologue_components and
9486 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
9487 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
9488 for these components or the epilogue sequence. That is, it determines
9489 whether we should emit stores or loads and what kind of CFA notes to attach
9490 to the insns. Otherwise the logic for the two sequences is very
9491 similar. */
9492
9493 static void
9494 aarch64_process_components (sbitmap components, bool prologue_p)
9495 {
9496 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
9497 ? HARD_FRAME_POINTER_REGNUM
9498 : STACK_POINTER_REGNUM);
9499
9500 unsigned last_regno = SBITMAP_SIZE (components);
9501 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
9502 rtx_insn *insn = NULL;
9503
9504 while (regno != last_regno)
9505 {
9506 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9507 machine_mode mode = aarch64_reg_save_mode (regno);
9508
9509 rtx reg = gen_rtx_REG (mode, regno);
9510 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9511 if (frame_pointer_needed)
9512 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9513 else
9514 offset += crtl->outgoing_args_size;
9515
9516 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9517 rtx mem = gen_frame_mem (mode, addr);
9518
9519 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9520 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9521 /* No more registers to handle after REGNO.
9522 Emit a single save/restore and exit. */
9523 if (regno2 == last_regno)
9524 {
9525 insn = emit_insn (set);
9526 if (frame_related_p)
9527 {
9528 RTX_FRAME_RELATED_P (insn) = 1;
9529 if (prologue_p)
9530 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9531 else
9532 add_reg_note (insn, REG_CFA_RESTORE, reg);
9533 }
9534 break;
9535 }
9536
9537 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9538 /* The next register is not of the same class or its offset is not
9539 mergeable with the current one into a pair. */
9540 if (aarch64_sve_mode_p (mode)
9541 || !satisfies_constraint_Ump (mem)
9542 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9543 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9544 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
9545 GET_MODE_SIZE (mode)))
9546 {
9547 insn = emit_insn (set);
9548 if (frame_related_p)
9549 {
9550 RTX_FRAME_RELATED_P (insn) = 1;
9551 if (prologue_p)
9552 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9553 else
9554 add_reg_note (insn, REG_CFA_RESTORE, reg);
9555 }
9556
9557 regno = regno2;
9558 continue;
9559 }
9560
9561 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9562
9563 /* REGNO2 can be saved/restored in a pair with REGNO. */
9564 rtx reg2 = gen_rtx_REG (mode, regno2);
9565 if (frame_pointer_needed)
9566 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9567 else
9568 offset2 += crtl->outgoing_args_size;
9569 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9570 rtx mem2 = gen_frame_mem (mode, addr2);
9571 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9572 : gen_rtx_SET (reg2, mem2);
9573
9574 if (prologue_p)
9575 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
9576 else
9577 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9578
9579 if (frame_related_p || frame_related2_p)
9580 {
9581 RTX_FRAME_RELATED_P (insn) = 1;
9582 if (prologue_p)
9583 {
9584 if (frame_related_p)
9585 add_reg_note (insn, REG_CFA_OFFSET, set);
9586 if (frame_related2_p)
9587 add_reg_note (insn, REG_CFA_OFFSET, set2);
9588 }
9589 else
9590 {
9591 if (frame_related_p)
9592 add_reg_note (insn, REG_CFA_RESTORE, reg);
9593 if (frame_related2_p)
9594 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9595 }
9596 }
9597
9598 regno = aarch64_get_next_set_bit (components, regno2 + 1);
9599 }
9600 }
9601
9602 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9603
9604 static void
9605 aarch64_emit_prologue_components (sbitmap components)
9606 {
9607 aarch64_process_components (components, true);
9608 }
9609
9610 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9611
9612 static void
9613 aarch64_emit_epilogue_components (sbitmap components)
9614 {
9615 aarch64_process_components (components, false);
9616 }
9617
9618 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9619
9620 static void
9621 aarch64_set_handled_components (sbitmap components)
9622 {
9623 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9624 if (bitmap_bit_p (components, regno))
9625 cfun->machine->reg_is_wrapped_separately[regno] = true;
9626 }
9627
9628 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
9629 determine the probe offset for alloca. */
9630
9631 static HOST_WIDE_INT
9632 aarch64_stack_clash_protection_alloca_probe_range (void)
9633 {
9634 return STACK_CLASH_CALLER_GUARD;
9635 }
9636
9637
9638 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9639 registers. If POLY_SIZE is not large enough to require a probe this function
9640 will only adjust the stack. When allocating the stack space
9641 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9642 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
9643 arguments. If so, we ensure that any allocation larger than the ABI
9644 defined buffer needs a probe so that the invariant of having a 1KB buffer is
9645 maintained.
9646
9647 We emit barriers after each stack adjustment to prevent optimizations from
9648 breaking the invariant that we never drop the stack more than a page. This
9649 invariant is needed to make it easier to correctly handle asynchronous
9650 events, e.g. if we were to allow the stack to be dropped by more than a page
9651 and only probe the skipped pages afterwards: if a signal arrived somewhere
9652 in between, the signal handler would not know the state of the stack and
9653 could make no assumptions about which pages had been probed. */
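/* As a concrete illustration (using the default parameters described in the
   frame layout comment further below): with a 64KB guard and a 1KB
   caller-reserved buffer, MIN_PROBE_THRESHOLD is 63KB for the initial and
   SVE adjustments and 1KB for the final (outgoing-argument) adjustment, so
   smaller allocations are emitted as plain stack adjustments with no
   probes.  */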
9654
9655 static void
9656 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9657 poly_int64 poly_size,
9658 bool frame_related_p,
9659 bool final_adjustment_p)
9660 {
9661 HOST_WIDE_INT guard_size
9662 = 1 << param_stack_clash_protection_guard_size;
9663 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9664 HOST_WIDE_INT min_probe_threshold
9665 = (final_adjustment_p
9666 ? guard_used_by_caller
9667 : guard_size - guard_used_by_caller);
9668 /* When doing the final adjustment for the outgoing arguments, take into
9669 account any unprobed space there is above the current SP. There are
9670 two cases:
9671
9672 - When saving SVE registers below the hard frame pointer, we force
9673 the lowest save to take place in the prologue before doing the final
9674 adjustment (i.e. we don't allow the save to be shrink-wrapped).
9675 This acts as a probe at SP, so there is no unprobed space.
9676
9677 - When there are no SVE register saves, we use the store of the link
9678 register as a probe. We can't assume that LR was saved at position 0
9679 though, so treat any space below it as unprobed. */
9680 if (final_adjustment_p
9681 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
9682 {
9683 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
9684 if (known_ge (lr_offset, 0))
9685 min_probe_threshold -= lr_offset.to_constant ();
9686 else
9687 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
9688 }
9689
9690 poly_int64 frame_size = cfun->machine->frame.frame_size;
9691
9692 /* We should always have a positive probe threshold. */
9693 gcc_assert (min_probe_threshold > 0);
9694
9695 if (flag_stack_clash_protection && !final_adjustment_p)
9696 {
9697 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9698 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9699 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9700
9701 if (known_eq (frame_size, 0))
9702 {
9703 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9704 }
9705 else if (known_lt (initial_adjust + sve_callee_adjust,
9706 guard_size - guard_used_by_caller)
9707 && known_lt (final_adjust, guard_used_by_caller))
9708 {
9709 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9710 }
9711 }
9712
9713 /* If SIZE is not large enough to require probing, just adjust the stack and
9714 exit. */
9715 if (known_lt (poly_size, min_probe_threshold)
9716 || !flag_stack_clash_protection)
9717 {
9718 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
9719 return;
9720 }
9721
9722 HOST_WIDE_INT size;
9723 /* Handle the SVE non-constant case first. */
9724 if (!poly_size.is_constant (&size))
9725 {
9726 if (dump_file)
9727 {
9728 fprintf (dump_file, "Stack clash SVE prologue: ");
9729 print_dec (poly_size, dump_file);
9730 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9731 }
9732
9733 /* First calculate the number of bytes we're actually spilling. */
9734 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9735 poly_size, temp1, temp2, false, true);
9736
9737 rtx_insn *insn = get_last_insn ();
9738
9739 if (frame_related_p)
9740 {
9741 /* This is done to provide unwinding information for the stack
9742 adjustments we're about to do. However, to prevent the optimizers
9743 from removing the R11 move and leaving the CFA note (which would be
9744 very wrong) we tie the old and new stack pointer together.
9745 The tie will expand to nothing but the optimizers will not touch
9746 the instruction. */
9747 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9748 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9749 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
9750
9751 /* We want the CFA independent of the stack pointer for the
9752 duration of the loop. */
9753 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9754 RTX_FRAME_RELATED_P (insn) = 1;
9755 }
9756
9757 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9758 rtx guard_const = gen_int_mode (guard_size, Pmode);
9759
9760 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9761 stack_pointer_rtx, temp1,
9762 probe_const, guard_const));
9763
9764 /* Now reset the CFA register if needed. */
9765 if (frame_related_p)
9766 {
9767 add_reg_note (insn, REG_CFA_DEF_CFA,
9768 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9769 gen_int_mode (poly_size, Pmode)));
9770 RTX_FRAME_RELATED_P (insn) = 1;
9771 }
9772
9773 return;
9774 }
9775
9776 if (dump_file)
9777 fprintf (dump_file,
9778 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9779 " bytes, probing will be required.\n", size);
9780
9781 /* Round size to the nearest multiple of guard_size, and calculate the
9782 residual as the difference between the original size and the rounded
9783 size. */
9784 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9785 HOST_WIDE_INT residual = size - rounded_size;
9786
9787 /* We can handle a small number of allocations/probes inline. Otherwise
9788 punt to a loop. */
9789 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9790 {
9791 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9792 {
9793 aarch64_sub_sp (NULL, temp2, guard_size, true);
9794 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9795 guard_used_by_caller));
9796 emit_insn (gen_blockage ());
9797 }
9798 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9799 }
9800 else
9801 {
9802 /* Compute the ending address. */
9803 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9804 temp1, NULL, false, true);
9805 rtx_insn *insn = get_last_insn ();
9806
9807 /* For the initial allocation, we don't have a frame pointer
9808 set up, so we always need CFI notes. If we're doing the
9809 final allocation, then we may have a frame pointer, in which
9810 case it is the CFA, otherwise we need CFI notes.
9811
9812 We can determine which allocation we are doing by looking at
9813 the value of FRAME_RELATED_P since the final allocations are not
9814 frame related. */
9815 if (frame_related_p)
9816 {
9817 /* We want the CFA independent of the stack pointer for the
9818 duration of the loop. */
9819 add_reg_note (insn, REG_CFA_DEF_CFA,
9820 plus_constant (Pmode, temp1, rounded_size));
9821 RTX_FRAME_RELATED_P (insn) = 1;
9822 }
9823
9824 /* This allocates and probes the stack. Note that this re-uses some of
9825 the existing Ada stack protection code. However we are guaranteed not
9826 to enter the non-loop or residual branches of that code.
9827
9828 The non-loop part won't be entered because if our allocation amount
9829 doesn't require a loop, the case above would handle it.
9830
9831 The residual amount won't be entered because TEMP1 is a multiple of
9832 the allocation size. The residual will always be 0. As such, the only
9833 part we are actually using from that code is the loop setup. The
9834 actual probing is done in aarch64_output_probe_stack_range. */
9835 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9836 stack_pointer_rtx, temp1));
9837
9838 /* Now reset the CFA register if needed. */
9839 if (frame_related_p)
9840 {
9841 add_reg_note (insn, REG_CFA_DEF_CFA,
9842 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9843 RTX_FRAME_RELATED_P (insn) = 1;
9844 }
9845
9846 emit_insn (gen_blockage ());
9847 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9848 }
9849
9850 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9851 be probed. This maintains the requirement that each page is probed at
9852 least once. For initial probing we probe only if the allocation is
9853 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
9854 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9855 GUARD_SIZE. This works because any allocation that is large enough to
9856 trigger a probe here gets at least one, while any allocation that is not
9857 large enough for this code to emit anything will already have had its page
9858 probed by the saving of FP/LR, either by this function or by a callee. If
9859 we don't have any callees then we won't have more stack adjustments and so
9860 are still safe. */
9861 if (residual)
9862 {
9863 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
9864 /* If we're doing final adjustments, and we've done any full page
9865 allocations then any residual needs to be probed. */
9866 if (final_adjustment_p && rounded_size != 0)
9867 min_probe_threshold = 0;
9868 /* If doing a small final adjustment, we always probe at offset 0.
9869 This is done to avoid issues when LR is not at position 0 or when
9870 the final adjustment is smaller than the probing offset. */
9871 else if (final_adjustment_p && rounded_size == 0)
9872 residual_probe_offset = 0;
9873
9874 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
9875 if (residual >= min_probe_threshold)
9876 {
9877 if (dump_file)
9878 fprintf (dump_file,
9879 "Stack clash AArch64 prologue residuals: "
9880 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9881 "\n", residual);
9882
9883 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9884 residual_probe_offset));
9885 emit_insn (gen_blockage ());
9886 }
9887 }
9888 }
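/* To make the constant-sized path of the function above concrete (sizes
   are hypothetical, assuming a 64KB guard, a 1KB caller buffer and the
   unrolled path being taken): allocating 128KB + 512 bytes as the initial
   adjustment emits two unrolled iterations of roughly

       sub	sp, sp, #65536
       str	xzr, [sp, #1024]

   followed by "sub sp, sp, #512" for the residual, which needs no probe
   of its own because it is below the 63KB threshold.  */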
9889
9890 /* Return 1 if the register is used by the epilogue. We need to say the
9891 return register is used, but only after epilogue generation is complete.
9892 Note that in the case of sibcalls, the values "used by the epilogue" are
9893 considered live at the start of the called function.
9894
9895 For SIMD functions we need to return 1 for FP registers that are saved and
9896 restored by a function but are not zero in call_used_regs. If we do not do
9897 this, optimizations may remove the restore of the register. */
9898
9899 int
9900 aarch64_epilogue_uses (int regno)
9901 {
9902 if (epilogue_completed)
9903 {
9904 if (regno == LR_REGNUM)
9905 return 1;
9906 }
9907 return 0;
9908 }
9909
9910 /* AArch64 stack frames generated by this compiler look like:
9911
9912 +-------------------------------+
9913 | |
9914 | incoming stack arguments |
9915 | |
9916 +-------------------------------+
9917 | | <-- incoming stack pointer (aligned)
9918 | callee-allocated save area |
9919 | for register varargs |
9920 | |
9921 +-------------------------------+
9922 | local variables | <-- frame_pointer_rtx
9923 | |
9924 +-------------------------------+
9925 | padding | \
9926 +-------------------------------+ |
9927 | callee-saved registers | | frame.saved_regs_size
9928 +-------------------------------+ |
9929 | LR' | |
9930 +-------------------------------+ |
9931 | FP' | |
9932 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
9933 | SVE vector registers | | \
9934 +-------------------------------+ | | below_hard_fp_saved_regs_size
9935 | SVE predicate registers | / /
9936 +-------------------------------+
9937 | dynamic allocation |
9938 +-------------------------------+
9939 | padding |
9940 +-------------------------------+
9941 | outgoing stack arguments | <-- arg_pointer
9942 | |
9943 +-------------------------------+
9944 | | <-- stack_pointer_rtx (aligned)
9945
9946 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9947 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9948 unchanged.
9949
9950 By default for stack-clash we assume the guard is at least 64KB, but this
9951 value is configurable to either 4KB or 64KB. We also force the guard size to
9952 be the same as the probing interval and both values are kept in sync.
9953
9954 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9955 on the guard size) of stack space without probing.
9956
9957 When probing is needed, we emit a probe at the start of the prologue
9958 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9959
9960 We have to track how much space has been allocated and the only stores
9961 to the stack we track as implicit probes are the FP/LR stores.
9962
9963 For outgoing arguments we probe if the size is larger than 1KB, such that
9964 the ABI specified buffer is maintained for the next callee.
9965
9966 The following registers are reserved during frame layout and should not be
9967 used for any other purpose:
9968
9969 - r11: Used by stack clash protection when SVE is enabled, and also
9970 as an anchor register when saving and restoring registers
9971 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9972 - r14 and r15: Used for speculation tracking.
9973 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9974 - r30(LR), r29(FP): Used by standard frame layout.
9975
9976 These registers must be avoided in frame layout related code unless the
9977 explicit intention is to interact with one of the features listed above. */
9978
9979 /* Generate the prologue instructions for entry into a function.
9980 Establish the stack frame by decreasing the stack pointer with a
9981 properly calculated size and, if necessary, create a frame record
9982 filled with the values of LR and previous frame pointer. The
9983 current FP is also set up if it is in use. */
9984
9985 void
9986 aarch64_expand_prologue (void)
9987 {
9988 poly_int64 frame_size = cfun->machine->frame.frame_size;
9989 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9990 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
9991 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9992 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
9993 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9994 poly_int64 below_hard_fp_saved_regs_size
9995 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9996 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9997 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
9998 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
9999 rtx_insn *insn;
10000
10001 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
10002 {
10003 /* Fold the SVE allocation into the initial allocation.
10004 We don't do this during frame layout to avoid pessimizing
10005 the epilogue code. */
10006 initial_adjust += sve_callee_adjust;
10007 sve_callee_adjust = 0;
10008 }
10009
10010 /* Sign return address for functions. */
10011 if (aarch64_return_address_signing_enabled ())
10012 {
10013 switch (aarch64_ra_sign_key)
10014 {
10015 case AARCH64_KEY_A:
10016 insn = emit_insn (gen_paciasp ());
10017 break;
10018 case AARCH64_KEY_B:
10019 insn = emit_insn (gen_pacibsp ());
10020 break;
10021 default:
10022 gcc_unreachable ();
10023 }
10024 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10025 RTX_FRAME_RELATED_P (insn) = 1;
10026 }
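/* For reference (a behaviour sketch, not additional code): with
   -mbranch-protection=pac-ret the block above emits PACIASP here, and the
   matching AUTIASP (or a combined authenticating return) is emitted in the
   epilogue; both hints execute as NOPs on cores without pointer
   authentication.  */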
10027
10028 /* Push return address to shadow call stack. */
10029 if (cfun->machine->frame.is_scs_enabled)
10030 emit_insn (gen_scs_push ());
10031
10032 if (flag_stack_usage_info)
10033 current_function_static_stack_size = constant_lower_bound (frame_size);
10034
10035 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10036 {
10037 if (crtl->is_leaf && !cfun->calls_alloca)
10038 {
10039 if (maybe_gt (frame_size, PROBE_INTERVAL)
10040 && maybe_gt (frame_size, get_stack_check_protect ()))
10041 aarch64_emit_probe_stack_range (get_stack_check_protect (),
10042 (frame_size
10043 - get_stack_check_protect ()));
10044 }
10045 else if (maybe_gt (frame_size, 0))
10046 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
10047 }
10048
10049 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
10050 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
10051
10052 /* In theory we should never have both an initial adjustment
10053 and a callee save adjustment. Verify that is the case since the
10054 code below does not handle it for -fstack-clash-protection. */
10055 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
10056
10057 /* Will only probe if the initial adjustment is larger than the guard
10058 less the amount of the guard reserved for use by the caller's
10059 outgoing args. */
10060 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
10061 true, false);
10062
10063 if (callee_adjust != 0)
10064 aarch64_push_regs (reg1, reg2, callee_adjust);
10065
10066 /* The offset of the frame chain record (if any) from the current SP. */
10067 poly_int64 chain_offset = (initial_adjust + callee_adjust
10068 - cfun->machine->frame.hard_fp_offset);
10069 gcc_assert (known_ge (chain_offset, 0));
10070
10071 /* The offset of the bottom of the save area from the current SP. */
10072 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
10073
10074 if (emit_frame_chain)
10075 {
10076 if (callee_adjust == 0)
10077 {
10078 reg1 = R29_REGNUM;
10079 reg2 = R30_REGNUM;
10080 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
10081 false, false);
10082 }
10083 else
10084 gcc_assert (known_eq (chain_offset, 0));
10085 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
10086 stack_pointer_rtx, chain_offset,
10087 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
10088 if (frame_pointer_needed && !frame_size.is_constant ())
10089 {
10090 /* Variable-sized frames need to describe the save slot
10091 address using DW_CFA_expression rather than DW_CFA_offset.
10092 This means that, without taking further action, the
10093 locations of the registers that we've already saved would
10094 remain based on the stack pointer even after we redefine
10095 the CFA based on the frame pointer. We therefore need new
10096 DW_CFA_expressions to re-express the save slots with addresses
10097 based on the frame pointer. */
10098 rtx_insn *insn = get_last_insn ();
10099 gcc_assert (RTX_FRAME_RELATED_P (insn));
10100
10101 /* Add an explicit CFA definition if this was previously
10102 implicit. */
10103 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
10104 {
10105 rtx src = plus_constant (Pmode, stack_pointer_rtx,
10106 callee_offset);
10107 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10108 gen_rtx_SET (hard_frame_pointer_rtx, src));
10109 }
10110
10111 /* Change the save slot expressions for the registers that
10112 we've already saved. */
10113 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
10114 hard_frame_pointer_rtx, UNITS_PER_WORD);
10115 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
10116 hard_frame_pointer_rtx, 0);
10117 }
10118 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
10119 }
10120
10121 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
10122 callee_adjust != 0 || emit_frame_chain,
10123 emit_frame_chain);
10124 if (maybe_ne (sve_callee_adjust, 0))
10125 {
10126 gcc_assert (!flag_stack_clash_protection
10127 || known_eq (initial_adjust, 0));
10128 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
10129 sve_callee_adjust,
10130 !frame_pointer_needed, false);
10131 saved_regs_offset += sve_callee_adjust;
10132 }
10133 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
10134 false, emit_frame_chain);
10135 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
10136 callee_adjust != 0 || emit_frame_chain,
10137 emit_frame_chain);
10138
10139 /* We may need to probe the final adjustment if it is larger than the guard
10140 that is assumed by the callee. */
10141 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
10142 !frame_pointer_needed, true);
10143 }
10144
10145 /* Return TRUE if we can use a simple_return insn.
10146
10147 This function checks whether the callee saved stack is empty, which
10148 means no restore actions are needed. The pro_and_epilogue pass uses
10149 this to check whether the shrink-wrapping optimization is feasible. */
10150
10151 bool
10152 aarch64_use_return_insn_p (void)
10153 {
10154 if (!reload_completed)
10155 return false;
10156
10157 if (crtl->profile)
10158 return false;
10159
10160 return known_eq (cfun->machine->frame.frame_size, 0);
10161 }
10162
10163 /* Generate the epilogue instructions for returning from a function.
10164 This is almost exactly the reverse of the prologue sequence, except
10165 that we need to insert barriers to avoid scheduling loads that read
10166 from a deallocated stack, and we optimize the unwind records by
10167 emitting them all together if possible. */
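/* As a rough, illustrative sketch only (the real sequence depends on the
   frame layout chosen by aarch64_layout_frame and the options in effect),
   a small constant-sized frame with a frame chain and no SVE state is
   torn down along these lines:

       add   sp, sp, #final_adjust            // free the outgoing-args area
       ldp   x29, x30, [sp], #callee_adjust   // restore FP/LR and pop
       add   sp, sp, #initial_adjust          // free the rest of the frame
       ret  */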
10168 void
10169 aarch64_expand_epilogue (bool for_sibcall)
10170 {
10171 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
10172 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
10173 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
10174 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
10175 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
10176 poly_int64 below_hard_fp_saved_regs_size
10177 = cfun->machine->frame.below_hard_fp_saved_regs_size;
10178 unsigned reg1 = cfun->machine->frame.wb_pop_candidate1;
10179 unsigned reg2 = cfun->machine->frame.wb_pop_candidate2;
10180 unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled
10181 ? R29_REGNUM : R30_REGNUM);
10182 rtx cfi_ops = NULL;
10183 rtx_insn *insn;
10184 /* A stack clash protection prologue may not have left EP0_REGNUM or
10185 EP1_REGNUM in a usable state. The same is true for allocations
10186 with an SVE component, since we then need both temporary registers
10187 for each allocation. For stack clash we are in a usable state if
10188 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
10189 HOST_WIDE_INT guard_size
10190 = 1 << param_stack_clash_protection_guard_size;
10191 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
10192
10193 /* We can re-use the registers when:
10194
10195 (a) the deallocation amount is the same as the corresponding
10196 allocation amount (which is false if we combine the initial
10197 and SVE callee save allocations in the prologue); and
10198
10199 (b) the allocation amount doesn't need a probe (which is false
10200 if the amount is guard_size - guard_used_by_caller or greater).
10201
10202 In such situations the register should remain live with the correct
10203 value. */
10204 bool can_inherit_p = (initial_adjust.is_constant ()
10205 && final_adjust.is_constant ()
10206 && (!flag_stack_clash_protection
10207 || (known_lt (initial_adjust,
10208 guard_size - guard_used_by_caller)
10209 && known_eq (sve_callee_adjust, 0))));
10210
10211 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
10212 bool need_barrier_p
10213 = maybe_ne (get_frame_size ()
10214 + cfun->machine->frame.saved_varargs_size, 0);
10215
10216 /* Emit a barrier to prevent loads from a deallocated stack. */
10217 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
10218 || cfun->calls_alloca
10219 || crtl->calls_eh_return)
10220 {
10221 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
10222 need_barrier_p = false;
10223 }
10224
10225 /* Restore the stack pointer from the frame pointer if it may not
10226 be the same as the stack pointer. */
10227 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
10228 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
10229 if (frame_pointer_needed
10230 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
10231 /* If writeback is used when restoring callee-saves, the CFA
10232 is restored on the instruction doing the writeback. */
10233 aarch64_add_offset (Pmode, stack_pointer_rtx,
10234 hard_frame_pointer_rtx,
10235 -callee_offset - below_hard_fp_saved_regs_size,
10236 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
10237 else
10238 /* The case where we need to re-use the register here is very rare, so
10239 avoid the complicated condition and just always emit a move if the
10240 immediate doesn't fit. */
10241 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
10242
10243 /* Restore the vector registers before the predicate registers,
10244 so that we can use P4 as a temporary for big-endian SVE frames. */
10245 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
10246 callee_adjust != 0, &cfi_ops);
10247 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
10248 false, &cfi_ops);
10249 if (maybe_ne (sve_callee_adjust, 0))
10250 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
10251
10252 /* When the shadow call stack is enabled, the scs_pop in the epilogue
10253 restores x30, so there is no need to restore x30 again in the
10254 traditional way. */
10255 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
10256 R0_REGNUM, last_gpr,
10257 callee_adjust != 0, &cfi_ops);
10258
10259 if (need_barrier_p)
10260 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
10261
10262 if (callee_adjust != 0)
10263 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
10264
10265 /* If we have no register restore information, the CFA must have been
10266 defined in terms of the stack pointer since the end of the prologue. */
10267 gcc_assert (cfi_ops || !frame_pointer_needed);
10268
10269 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
10270 {
10271 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
10272 insn = get_last_insn ();
10273 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
10274 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
10275 RTX_FRAME_RELATED_P (insn) = 1;
10276 cfi_ops = NULL;
10277 }
10278
10279 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
10280 restrict the emit_move optimization to leaf functions. */
10281 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
10282 (!can_inherit_p || !crtl->is_leaf
10283 || df_regs_ever_live_p (EP0_REGNUM)));
10284
10285 if (cfi_ops)
10286 {
10287 /* Emit delayed restores and reset the CFA to be SP. */
10288 insn = get_last_insn ();
10289 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
10290 REG_NOTES (insn) = cfi_ops;
10291 RTX_FRAME_RELATED_P (insn) = 1;
10292 }
10293
10294 /* Pop return address from shadow call stack. */
10295 if (cfun->machine->frame.is_scs_enabled)
10296 {
10297 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
10298 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
10299
10300 insn = emit_insn (gen_scs_pop ());
10301 add_reg_note (insn, REG_CFA_RESTORE, reg);
10302 RTX_FRAME_RELATED_P (insn) = 1;
10303 }
10304
10305 /* We prefer to emit the combined return/authenticate instruction RETAA,
10306 however there are two cases in which we must instead emit an explicit
10307 authentication instruction.
10308
10309 1) Sibcalls don't return in a normal way, so if we're about to call one
10310 we must authenticate.
10311
10312 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
10313 generating code for !TARGET_ARMV8_3 we can't use it and must
10314 explicitly authenticate.
10315 */
10316 if (aarch64_return_address_signing_enabled ()
10317 && (for_sibcall || !TARGET_ARMV8_3))
10318 {
10319 switch (aarch64_ra_sign_key)
10320 {
10321 case AARCH64_KEY_A:
10322 insn = emit_insn (gen_autiasp ());
10323 break;
10324 case AARCH64_KEY_B:
10325 insn = emit_insn (gen_autibsp ());
10326 break;
10327 default:
10328 gcc_unreachable ();
10329 }
10330 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10331 RTX_FRAME_RELATED_P (insn) = 1;
10332 }
10333
10334 /* Stack adjustment for exception handler. */
10335 if (crtl->calls_eh_return && !for_sibcall)
10336 {
10337 /* We need to unwind the stack by the offset computed by
10338 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
10339 to be SP; letting the CFA move during this adjustment
10340 is just as correct as retaining the CFA from the body
10341 of the function. Therefore, do nothing special. */
10342 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
10343 }
10344
10345 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
10346 if (!for_sibcall)
10347 emit_jump_insn (ret_rtx);
10348 }
10349
10350 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
10351 normally or return to a previous frame after unwinding.
10352
10353 An EH return uses a single shared return sequence. The epilogue is
10354 exactly like a normal epilogue except that it has an extra input
10355 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
10356 that must be applied after the frame has been destroyed. An extra label
10357 is inserted before the epilogue which initializes this register to zero,
10358 and this is the entry point for a normal return.
10359
10360 An actual EH return updates the return address, initializes the stack
10361 adjustment and jumps directly into the epilogue (bypassing the zeroing
10362 of the adjustment). Since the return address is typically saved on the
10363 stack when a function makes a call, the saved LR must be updated outside
10364 the epilogue.
10365
10366 This poses problems as the store is generated well before the epilogue,
10367 so the offset of LR is not known yet. Also optimizations will remove the
10368 store as it appears dead, even after the epilogue is generated (as the
10369 base or offset for loading LR is different in many cases).
10370
10371 To avoid these problems this implementation forces the frame pointer
10372 in eh_return functions so that the location of LR is fixed and known early.
10373 It also marks the store volatile, so no optimization is permitted to
10374 remove the store. */
10375 rtx
10376 aarch64_eh_return_handler_rtx (void)
10377 {
10378 rtx tmp = gen_frame_mem (Pmode,
10379 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
10380
10381 /* Mark the store volatile, so no optimization is permitted to remove it. */
10382 MEM_VOLATILE_P (tmp) = true;
10383 return tmp;
10384 }
10385
10386 /* Output code to add DELTA to the first argument, and then jump
10387 to FUNCTION. Used for C++ multiple inheritance. */
10388 static void
10389 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
10390 HOST_WIDE_INT delta,
10391 HOST_WIDE_INT vcall_offset,
10392 tree function)
10393 {
10394 /* The this pointer is always in x0. Note that this differs from
10395 Arm where the this pointer may be bumped to r1 if r0 is required
10396 to return a pointer to an aggregate. On AArch64 a result value
10397 pointer will be in x8. */
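/* As an illustration only (assuming VCALL_OFFSET is zero, DELTA fits the
   immediate range and ignoring any BTI landing pad), the emitted thunk
   amounts to:

       add   x0, x0, #delta
       b     <function>

   Larger deltas and non-zero vcall offsets use EP0/EP1 as scratch
   registers, as the code below shows. */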
10398 int this_regno = R0_REGNUM;
10399 rtx this_rtx, temp0, temp1, addr, funexp;
10400 rtx_insn *insn;
10401 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10402
10403 if (aarch64_bti_enabled ())
10404 emit_insn (gen_bti_c ());
10405
10406 reload_completed = 1;
10407 emit_note (NOTE_INSN_PROLOGUE_END);
10408
10409 this_rtx = gen_rtx_REG (Pmode, this_regno);
10410 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10411 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10412
10413 if (vcall_offset == 0)
10414 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
10415 else
10416 {
10417 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10418
10419 addr = this_rtx;
10420 if (delta != 0)
10421 {
10422 if (delta >= -256 && delta < 256)
10423 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10424 plus_constant (Pmode, this_rtx, delta));
10425 else
10426 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10427 temp1, temp0, false);
10428 }
10429
10430 if (Pmode == ptr_mode)
10431 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10432 else
10433 aarch64_emit_move (temp0,
10434 gen_rtx_ZERO_EXTEND (Pmode,
10435 gen_rtx_MEM (ptr_mode, addr)));
10436
10437 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10438 addr = plus_constant (Pmode, temp0, vcall_offset);
10439 else
10440 {
10441 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10442 Pmode);
10443 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10444 }
10445
10446 if (Pmode == ptr_mode)
10447 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
10448 else
10449 aarch64_emit_move (temp1,
10450 gen_rtx_SIGN_EXTEND (Pmode,
10451 gen_rtx_MEM (ptr_mode, addr)));
10452
10453 emit_insn (gen_add2_insn (this_rtx, temp1));
10454 }
10455
10456 /* Generate a tail call to the target function. */
10457 if (!TREE_USED (function))
10458 {
10459 assemble_external (function);
10460 TREE_USED (function) = 1;
10461 }
10462 funexp = XEXP (DECL_RTL (function), 0);
10463 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10464 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
10465 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10466 SIBLING_CALL_P (insn) = 1;
10467
10468 insn = get_insns ();
10469 shorten_branches (insn);
10470
10471 assemble_start_function (thunk, fnname);
10472 final_start_function (insn, file, 1);
10473 final (insn, file, 1);
10474 final_end_function ();
10475 assemble_end_function (thunk, fnname);
10476
10477 /* Stop pretending to be a post-reload pass. */
10478 reload_completed = 0;
10479 }
10480
10481 static bool
10482 aarch64_tls_referenced_p (rtx x)
10483 {
10484 if (!TARGET_HAVE_TLS)
10485 return false;
10486 subrtx_iterator::array_type array;
10487 FOR_EACH_SUBRTX (iter, array, x, ALL)
10488 {
10489 const_rtx x = *iter;
10490 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10491 return true;
10492 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10493 TLS offsets, not real symbol references. */
10494 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10495 iter.skip_subrtxes ();
10496 }
10497 return false;
10498 }
10499
10500
10501 static bool
10502 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10503 {
10504 if (GET_CODE (x) == HIGH)
10505 return true;
10506
10507 /* There's no way to calculate VL-based values using relocations. */
10508 subrtx_iterator::array_type array;
10509 FOR_EACH_SUBRTX (iter, array, x, ALL)
10510 if (GET_CODE (*iter) == CONST_POLY_INT)
10511 return true;
10512
10513 poly_int64 offset;
10514 rtx base = strip_offset_and_salt (x, &offset);
10515 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10516 {
10517 /* We checked for POLY_INT_CST offsets above. */
10518 if (aarch64_classify_symbol (base, offset.to_constant ())
10519 != SYMBOL_FORCE_TO_MEM)
10520 return true;
10521 else
10522 /* Avoid generating a 64-bit relocation in ILP32; leave
10523 to aarch64_expand_mov_immediate to handle it properly. */
10524 return mode != ptr_mode;
10525 }
10526
10527 return aarch64_tls_referenced_p (x);
10528 }
10529
10530 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10531 The expansion for a table switch is quite expensive due to the number
10532 of instructions, the table lookup and the hard-to-predict indirect jump.
10533 When optimizing for speed at -O3 or above, use the per-core tuning if
10534 set; otherwise use tables for >= 11 cases as a tradeoff between size and
10535 performance. When optimizing for size, use 8 for the smallest code size. */
10536
10537 static unsigned int
10538 aarch64_case_values_threshold (void)
10539 {
10540 /* Use the specified limit for the number of cases before using jump
10541 tables at higher optimization levels. */
10542 if (optimize > 2
10543 && aarch64_tune_params.max_case_values != 0)
10544 return aarch64_tune_params.max_case_values;
10545 else
10546 return optimize_size ? 8 : 11;
10547 }
10548
10549 /* Return true if register REGNO is a valid index register.
10550 STRICT_P is true if REG_OK_STRICT is in effect. */
10551
10552 bool
10553 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10554 {
10555 if (!HARD_REGISTER_NUM_P (regno))
10556 {
10557 if (!strict_p)
10558 return true;
10559
10560 if (!reg_renumber)
10561 return false;
10562
10563 regno = reg_renumber[regno];
10564 }
10565 return GP_REGNUM_P (regno);
10566 }
10567
10568 /* Return true if register REGNO is a valid base register.
10569 STRICT_P is true if REG_OK_STRICT is in effect. */
10570
10571 bool
10572 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10573 {
10574 if (!HARD_REGISTER_NUM_P (regno))
10575 {
10576 if (!strict_p)
10577 return true;
10578
10579 if (!reg_renumber)
10580 return false;
10581
10582 regno = reg_renumber[regno];
10583 }
10584
10585 /* The fake registers will be eliminated to either the stack or
10586 hard frame pointer, both of which are usually valid base registers.
10587 Reload deals with the cases where the eliminated form isn't valid. */
10588 return (GP_REGNUM_P (regno)
10589 || regno == SP_REGNUM
10590 || regno == FRAME_POINTER_REGNUM
10591 || regno == ARG_POINTER_REGNUM);
10592 }
10593
10594 /* Return true if X is a valid base register.
10595 STRICT_P is true if REG_OK_STRICT is in effect. */
10596
10597 static bool
10598 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10599 {
10600 if (!strict_p
10601 && SUBREG_P (x)
10602 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10603 x = SUBREG_REG (x);
10604
10605 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10606 }
10607
10608 /* Return true if address offset is a valid index. If it is, fill in INFO
10609 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
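/* For illustration, in the usual A64 syntax the forms recognized below
   correspond to index operands such as:

     (reg:DI x1)                                  -> [x0, x1]
     (ashift:DI (reg:DI x1) (const_int 3))        -> [x0, x1, lsl #3]
     (mult:DI (sign_extend:DI (reg:SI w1))
              (const_int 4))                      -> [x0, w1, sxtw #2]

   where x0 stands for whatever base register accompanies the index. */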
10610
10611 static bool
10612 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10613 machine_mode mode, bool strict_p)
10614 {
10615 enum aarch64_address_type type;
10616 rtx index;
10617 int shift;
10618
10619 /* (reg:P) */
10620 if ((REG_P (x) || SUBREG_P (x))
10621 && GET_MODE (x) == Pmode)
10622 {
10623 type = ADDRESS_REG_REG;
10624 index = x;
10625 shift = 0;
10626 }
10627 /* (sign_extend:DI (reg:SI)) */
10628 else if ((GET_CODE (x) == SIGN_EXTEND
10629 || GET_CODE (x) == ZERO_EXTEND)
10630 && GET_MODE (x) == DImode
10631 && GET_MODE (XEXP (x, 0)) == SImode)
10632 {
10633 type = (GET_CODE (x) == SIGN_EXTEND)
10634 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10635 index = XEXP (x, 0);
10636 shift = 0;
10637 }
10638 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10639 else if (GET_CODE (x) == MULT
10640 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10641 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10642 && GET_MODE (XEXP (x, 0)) == DImode
10643 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10644 && CONST_INT_P (XEXP (x, 1)))
10645 {
10646 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10647 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10648 index = XEXP (XEXP (x, 0), 0);
10649 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10650 }
10651 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10652 else if (GET_CODE (x) == ASHIFT
10653 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10654 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10655 && GET_MODE (XEXP (x, 0)) == DImode
10656 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10657 && CONST_INT_P (XEXP (x, 1)))
10658 {
10659 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10660 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10661 index = XEXP (XEXP (x, 0), 0);
10662 shift = INTVAL (XEXP (x, 1));
10663 }
10664 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10665 (const_int 0xffffffff<<shift)) */
10666 else if (GET_CODE (x) == AND
10667 && GET_MODE (x) == DImode
10668 && GET_CODE (XEXP (x, 0)) == MULT
10669 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10670 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10671 && CONST_INT_P (XEXP (x, 1)))
10672 {
10673 type = ADDRESS_REG_UXTW;
10674 index = XEXP (XEXP (x, 0), 0);
10675 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10676 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10677 shift = -1;
10678 }
10679 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10680 (const_int 0xffffffff<<shift)) */
10681 else if (GET_CODE (x) == AND
10682 && GET_MODE (x) == DImode
10683 && GET_CODE (XEXP (x, 0)) == ASHIFT
10684 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10685 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10686 && CONST_INT_P (XEXP (x, 1)))
10687 {
10688 type = ADDRESS_REG_UXTW;
10689 index = XEXP (XEXP (x, 0), 0);
10690 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10691 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10692 shift = -1;
10693 }
10694 /* (mult:P (reg:P) (const_int scale)) */
10695 else if (GET_CODE (x) == MULT
10696 && GET_MODE (x) == Pmode
10697 && GET_MODE (XEXP (x, 0)) == Pmode
10698 && CONST_INT_P (XEXP (x, 1)))
10699 {
10700 type = ADDRESS_REG_REG;
10701 index = XEXP (x, 0);
10702 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10703 }
10704 /* (ashift:P (reg:P) (const_int shift)) */
10705 else if (GET_CODE (x) == ASHIFT
10706 && GET_MODE (x) == Pmode
10707 && GET_MODE (XEXP (x, 0)) == Pmode
10708 && CONST_INT_P (XEXP (x, 1)))
10709 {
10710 type = ADDRESS_REG_REG;
10711 index = XEXP (x, 0);
10712 shift = INTVAL (XEXP (x, 1));
10713 }
10714 else
10715 return false;
10716
10717 if (!strict_p
10718 && SUBREG_P (index)
10719 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10720 index = SUBREG_REG (index);
10721
10722 if (aarch64_sve_data_mode_p (mode))
10723 {
10724 if (type != ADDRESS_REG_REG
10725 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10726 return false;
10727 }
10728 else
10729 {
10730 if (shift != 0
10731 && !(IN_RANGE (shift, 1, 3)
10732 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10733 return false;
10734 }
10735
10736 if (REG_P (index)
10737 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10738 {
10739 info->type = type;
10740 info->offset = index;
10741 info->shift = shift;
10742 return true;
10743 }
10744
10745 return false;
10746 }
10747
10748 /* Return true if MODE is one of the modes for which we
10749 support LDP/STP operations. */
10750
10751 static bool
10752 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10753 {
10754 return mode == SImode || mode == DImode
10755 || mode == SFmode || mode == DFmode
10756 || mode == SDmode || mode == DDmode
10757 || (aarch64_vector_mode_supported_p (mode)
10758 && (known_eq (GET_MODE_SIZE (mode), 8)
10759 || (known_eq (GET_MODE_SIZE (mode), 16)
10760 && (aarch64_tune_params.extra_tuning_flags
10761 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
10762 }
10763
10764 /* Return true if REGNO is a virtual pointer register, or an eliminable
10765 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10766 include stack_pointer or hard_frame_pointer. */
10767 static bool
10768 virt_or_elim_regno_p (unsigned regno)
10769 {
10770 return ((regno >= FIRST_VIRTUAL_REGISTER
10771 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10772 || regno == FRAME_POINTER_REGNUM
10773 || regno == ARG_POINTER_REGNUM);
10774 }
10775
10776 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10777 If it is, fill in INFO appropriately. STRICT_P is true if
10778 REG_OK_STRICT is in effect. */
10779
10780 bool
10781 aarch64_classify_address (struct aarch64_address_info *info,
10782 rtx x, machine_mode mode, bool strict_p,
10783 aarch64_addr_query_type type)
10784 {
10785 enum rtx_code code = GET_CODE (x);
10786 rtx op0, op1;
10787 poly_int64 offset;
10788
10789 HOST_WIDE_INT const_size;
10790
10791 /* Whether a vector mode is partial doesn't affect address legitimacy.
10792 Partial vectors like VNx8QImode allow the same indexed addressing
10793 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10794 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10795 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10796 vec_flags &= ~VEC_PARTIAL;
10797
10798 /* On BE, we use load/store pair for all large int mode load/stores.
10799 TI/TF/TDmode may also use a load/store pair. */
10800 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10801 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10802 || type == ADDR_QUERY_LDP_STP_N
10803 || mode == TImode
10804 || mode == TFmode
10805 || mode == TDmode
10806 || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
10807 && advsimd_struct_p));
10808 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
10809 corresponds to the full size of the memory being loaded/stored, while
10810 the mode used to validate the addressing mode is half of that size. */
10811 if (type == ADDR_QUERY_LDP_STP_N)
10812 {
10813 if (known_eq (GET_MODE_SIZE (mode), 16))
10814 mode = DFmode;
10815 else if (known_eq (GET_MODE_SIZE (mode), 8))
10816 mode = SFmode;
10817 else
10818 return false;
10819 }
10820
10821 bool allow_reg_index_p = (!load_store_pair_p
10822 && ((vec_flags == 0
10823 && known_lt (GET_MODE_SIZE (mode), 16))
10824 || vec_flags == VEC_ADVSIMD
10825 || vec_flags & VEC_SVE_DATA));
10826
10827 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10828 The latter is not valid for SVE predicates, and that's rejected through
10829 allow_reg_index_p above. */
10830 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10831 && (code != REG && code != PLUS))
10832 return false;
10833
10834 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10835 REG addressing. */
10836 if (advsimd_struct_p
10837 && TARGET_SIMD
10838 && !BYTES_BIG_ENDIAN
10839 && (code != POST_INC && code != REG))
10840 return false;
10841
10842 gcc_checking_assert (GET_MODE (x) == VOIDmode
10843 || SCALAR_INT_MODE_P (GET_MODE (x)));
10844
10845 switch (code)
10846 {
10847 case REG:
10848 case SUBREG:
10849 info->type = ADDRESS_REG_IMM;
10850 info->base = x;
10851 info->offset = const0_rtx;
10852 info->const_offset = 0;
10853 return aarch64_base_register_rtx_p (x, strict_p);
10854
10855 case PLUS:
10856 op0 = XEXP (x, 0);
10857 op1 = XEXP (x, 1);
10858
10859 if (! strict_p
10860 && REG_P (op0)
10861 && virt_or_elim_regno_p (REGNO (op0))
10862 && poly_int_rtx_p (op1, &offset))
10863 {
10864 info->type = ADDRESS_REG_IMM;
10865 info->base = op0;
10866 info->offset = op1;
10867 info->const_offset = offset;
10868
10869 return true;
10870 }
10871
10872 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10873 && aarch64_base_register_rtx_p (op0, strict_p)
10874 && poly_int_rtx_p (op1, &offset))
10875 {
10876 info->type = ADDRESS_REG_IMM;
10877 info->base = op0;
10878 info->offset = op1;
10879 info->const_offset = offset;
10880
10881 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10882 registers and individual Q registers. The available
10883 address modes are:
10884 X,X: 7-bit signed scaled offset
10885 Q: 9-bit signed offset
10886 We conservatively require an offset representable in either mode.
10887 When performing the check for pairs of X registers i.e. LDP/STP
10888 pass down DImode since that is the natural size of the LDP/STP
10889 instruction memory accesses. */
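/* As a worked example (offsets in bytes): a TImode offset of 256 is
   accepted, since 256 is 32 * 8 (within the signed scaled 7-bit LDP
   range) and is also a valid 12-bit unsigned scaled Q-register offset,
   whereas 512 is rejected because 512 / 8 = 64 exceeds the signed
   7-bit range. */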
10890 if (mode == TImode || mode == TFmode || mode == TDmode)
10891 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10892 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10893 || offset_12bit_unsigned_scaled_p (mode, offset)));
10894
10895 if (mode == V8DImode)
10896 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10897 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10898
10899 /* A 7-bit offset check because OImode will emit an ldp/stp
10900 instruction (only !TARGET_SIMD or big-endian will get here).
10901 For ldp/stp instructions, the offset is scaled for the size of a
10902 single element of the pair. */
10903 if (aarch64_advsimd_partial_struct_mode_p (mode)
10904 && known_eq (GET_MODE_SIZE (mode), 16))
10905 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10906 if (aarch64_advsimd_full_struct_mode_p (mode)
10907 && known_eq (GET_MODE_SIZE (mode), 32))
10908 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10909
10910 /* Three 9/12-bit offset checks because CImode will emit three
10911 ldr/str instructions (only !TARGET_SIMD or big endian will
10912 get here). */
10913 if (aarch64_advsimd_partial_struct_mode_p (mode)
10914 && known_eq (GET_MODE_SIZE (mode), 24))
10915 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10916 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10917 offset + 16)
10918 || offset_12bit_unsigned_scaled_p (DImode,
10919 offset + 16)));
10920 if (aarch64_advsimd_full_struct_mode_p (mode)
10921 && known_eq (GET_MODE_SIZE (mode), 48))
10922 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10923 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10924 offset + 32)
10925 || offset_12bit_unsigned_scaled_p (TImode,
10926 offset + 32)));
10927
10928 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10929 instructions (only big endian will get here). */
10930 if (aarch64_advsimd_partial_struct_mode_p (mode)
10931 && known_eq (GET_MODE_SIZE (mode), 32))
10932 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10933 && aarch64_offset_7bit_signed_scaled_p (DImode,
10934 offset + 16));
10935 if (aarch64_advsimd_full_struct_mode_p (mode)
10936 && known_eq (GET_MODE_SIZE (mode), 64))
10937 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10938 && aarch64_offset_7bit_signed_scaled_p (TImode,
10939 offset + 32));
10940
10941 /* Make "m" use the LD1 offset range for SVE data modes, so
10942 that pre-RTL optimizers like ivopts will work to that
10943 instead of the wider LDR/STR range. */
10944 if (vec_flags == VEC_SVE_DATA)
10945 return (type == ADDR_QUERY_M
10946 ? offset_4bit_signed_scaled_p (mode, offset)
10947 : offset_9bit_signed_scaled_p (mode, offset));
10948
10949 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10950 {
10951 poly_int64 end_offset = (offset
10952 + GET_MODE_SIZE (mode)
10953 - BYTES_PER_SVE_VECTOR);
10954 return (type == ADDR_QUERY_M
10955 ? offset_4bit_signed_scaled_p (mode, offset)
10956 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10957 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10958 end_offset)));
10959 }
10960
10961 if (vec_flags == VEC_SVE_PRED)
10962 return offset_9bit_signed_scaled_p (mode, offset);
10963
10964 if (load_store_pair_p)
10965 return ((known_eq (GET_MODE_SIZE (mode), 4)
10966 || known_eq (GET_MODE_SIZE (mode), 8)
10967 || known_eq (GET_MODE_SIZE (mode), 16))
10968 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10969 else
10970 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10971 || offset_12bit_unsigned_scaled_p (mode, offset));
10972 }
10973
10974 if (allow_reg_index_p)
10975 {
10976 /* Look for base + (scaled/extended) index register. */
10977 if (aarch64_base_register_rtx_p (op0, strict_p)
10978 && aarch64_classify_index (info, op1, mode, strict_p))
10979 {
10980 info->base = op0;
10981 return true;
10982 }
10983 if (aarch64_base_register_rtx_p (op1, strict_p)
10984 && aarch64_classify_index (info, op0, mode, strict_p))
10985 {
10986 info->base = op1;
10987 return true;
10988 }
10989 }
10990
10991 return false;
10992
10993 case POST_INC:
10994 case POST_DEC:
10995 case PRE_INC:
10996 case PRE_DEC:
10997 info->type = ADDRESS_REG_WB;
10998 info->base = XEXP (x, 0);
10999 info->offset = NULL_RTX;
11000 return aarch64_base_register_rtx_p (info->base, strict_p);
11001
11002 case POST_MODIFY:
11003 case PRE_MODIFY:
11004 info->type = ADDRESS_REG_WB;
11005 info->base = XEXP (x, 0);
11006 if (GET_CODE (XEXP (x, 1)) == PLUS
11007 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
11008 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
11009 && aarch64_base_register_rtx_p (info->base, strict_p))
11010 {
11011 info->offset = XEXP (XEXP (x, 1), 1);
11012 info->const_offset = offset;
11013
11014 /* TImode, TFmode and TDmode values are allowed in both pairs of X
11015 registers and individual Q registers. The available
11016 address modes are:
11017 X,X: 7-bit signed scaled offset
11018 Q: 9-bit signed offset
11019 We conservatively require an offset representable in either mode.
11020 */
11021 if (mode == TImode || mode == TFmode || mode == TDmode)
11022 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
11023 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
11024
11025 if (load_store_pair_p)
11026 return ((known_eq (GET_MODE_SIZE (mode), 4)
11027 || known_eq (GET_MODE_SIZE (mode), 8)
11028 || known_eq (GET_MODE_SIZE (mode), 16))
11029 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
11030 else
11031 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
11032 }
11033 return false;
11034
11035 case CONST:
11036 case SYMBOL_REF:
11037 case LABEL_REF:
11038 /* load literal: pc-relative constant pool entry. Only supported
11039 for SI mode or larger. */
11040 info->type = ADDRESS_SYMBOLIC;
11041
11042 if (!load_store_pair_p
11043 && GET_MODE_SIZE (mode).is_constant (&const_size)
11044 && const_size >= 4)
11045 {
11046 poly_int64 offset;
11047 rtx sym = strip_offset_and_salt (x, &offset);
11048 return ((LABEL_REF_P (sym)
11049 || (SYMBOL_REF_P (sym)
11050 && CONSTANT_POOL_ADDRESS_P (sym)
11051 && aarch64_pcrelative_literal_loads)));
11052 }
11053 return false;
11054
11055 case LO_SUM:
11056 info->type = ADDRESS_LO_SUM;
11057 info->base = XEXP (x, 0);
11058 info->offset = XEXP (x, 1);
11059 if (allow_reg_index_p
11060 && aarch64_base_register_rtx_p (info->base, strict_p))
11061 {
11062 poly_int64 offset;
11063 HOST_WIDE_INT const_offset;
11064 rtx sym = strip_offset_and_salt (info->offset, &offset);
11065 if (SYMBOL_REF_P (sym)
11066 && offset.is_constant (&const_offset)
11067 && (aarch64_classify_symbol (sym, const_offset)
11068 == SYMBOL_SMALL_ABSOLUTE))
11069 {
11070 /* The symbol and offset must be aligned to the access size. */
11071 unsigned int align;
11072
11073 if (CONSTANT_POOL_ADDRESS_P (sym))
11074 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
11075 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
11076 {
11077 tree exp = SYMBOL_REF_DECL (sym);
11078 align = TYPE_ALIGN (TREE_TYPE (exp));
11079 align = aarch64_constant_alignment (exp, align);
11080 }
11081 else if (SYMBOL_REF_DECL (sym))
11082 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
11083 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
11084 && SYMBOL_REF_BLOCK (sym) != NULL)
11085 align = SYMBOL_REF_BLOCK (sym)->alignment;
11086 else
11087 align = BITS_PER_UNIT;
11088
11089 poly_int64 ref_size = GET_MODE_SIZE (mode);
11090 if (known_eq (ref_size, 0))
11091 ref_size = GET_MODE_SIZE (DImode);
11092
11093 return (multiple_p (const_offset, ref_size)
11094 && multiple_p (align / BITS_PER_UNIT, ref_size));
11095 }
11096 }
11097 return false;
11098
11099 default:
11100 return false;
11101 }
11102 }
11103
11104 /* Return true if the address X is valid for a PRFM instruction.
11105 STRICT_P is true if we should do strict checking with
11106 aarch64_classify_address. */
11107
11108 bool
11109 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
11110 {
11111 struct aarch64_address_info addr;
11112
11113 /* PRFM accepts the same addresses as DImode... */
11114 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
11115 if (!res)
11116 return false;
11117
11118 /* ... except writeback forms. */
11119 return addr.type != ADDRESS_REG_WB;
11120 }
11121
11122 bool
11123 aarch64_symbolic_address_p (rtx x)
11124 {
11125 poly_int64 offset;
11126 x = strip_offset_and_salt (x, &offset);
11127 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
11128 }
11129
11130 /* Classify the base of symbolic expression X. */
11131
11132 enum aarch64_symbol_type
11133 aarch64_classify_symbolic_expression (rtx x)
11134 {
11135 rtx offset;
11136
11137 split_const (x, &x, &offset);
11138 return aarch64_classify_symbol (x, INTVAL (offset));
11139 }
11140
11141
11142 /* Return TRUE if X is a legitimate address for accessing memory in
11143 mode MODE. */
11144 static bool
11145 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
11146 {
11147 struct aarch64_address_info addr;
11148
11149 return aarch64_classify_address (&addr, x, mode, strict_p);
11150 }
11151
11152 /* Return TRUE if X is a legitimate address of type TYPE for accessing
11153 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
11154 bool
11155 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
11156 aarch64_addr_query_type type)
11157 {
11158 struct aarch64_address_info addr;
11159
11160 return aarch64_classify_address (&addr, x, mode, strict_p, type);
11161 }
11162
11163 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
11164
11165 static bool
11166 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
11167 poly_int64 orig_offset,
11168 machine_mode mode)
11169 {
11170 HOST_WIDE_INT size;
11171 if (GET_MODE_SIZE (mode).is_constant (&size))
11172 {
11173 HOST_WIDE_INT const_offset, second_offset;
11174
11175 /* A general SVE offset is A * VQ + B. Remove the A component from
11176 coefficient 0 in order to get the constant B. */
11177 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
11178
11179 /* Split an out-of-range address displacement into a base and
11180 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
11181 range otherwise to increase opportunities for sharing the base
11182 address of different sizes. Unaligned accesses use the signed
11183 9-bit range; TImode/TFmode/TDmode use the intersection of the signed
11184 scaled 7-bit and signed 9-bit offset ranges. */
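/* For example (illustrative numbers): an aligned DImode access at offset
   0x10010 is split into an anchor offset of 0x10000 plus an in-range
   offset of 0x10, since 0x10010 & 0x3ffc == 0x10. */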
11185 if (mode == TImode || mode == TFmode || mode == TDmode)
11186 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
11187 else if ((const_offset & (size - 1)) != 0)
11188 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
11189 else
11190 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
11191
11192 if (second_offset == 0 || known_eq (orig_offset, second_offset))
11193 return false;
11194
11195 /* Split the offset into second_offset and the rest. */
11196 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11197 *offset2 = gen_int_mode (second_offset, Pmode);
11198 return true;
11199 }
11200 else
11201 {
11202 /* Get the mode we should use as the basis of the range. For structure
11203 modes this is the mode of one vector. */
11204 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11205 machine_mode step_mode
11206 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
11207
11208 /* Get the "mul vl" multiplier we'd like to use. */
11209 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
11210 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
11211 if (vec_flags & VEC_SVE_DATA)
11212 /* LDR supports a 9-bit range, but the move patterns for
11213 structure modes require all vectors to be in range of the
11214 same base. The simplest way of accommodating that while still
11215 promoting reuse of anchor points between different modes is
11216 to use an 8-bit range unconditionally. */
11217 vnum = ((vnum + 128) & 255) - 128;
11218 else
11219 /* Predicates are only handled singly, so we might as well use
11220 the full range. */
11221 vnum = ((vnum + 256) & 511) - 256;
11222 if (vnum == 0)
11223 return false;
11224
11225 /* Convert the "mul vl" multiplier into a byte offset. */
11226 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
11227 if (known_eq (second_offset, orig_offset))
11228 return false;
11229
11230 /* Split the offset into second_offset and the rest. */
11231 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11232 *offset2 = gen_int_mode (second_offset, Pmode);
11233 return true;
11234 }
11235 }
11236
11237 /* Return the binary representation of floating point constant VALUE in INTVAL.
11238 If the value cannot be converted, return false without setting INTVAL.
11239 The conversion is done in the given MODE. */
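/* For instance, the DFmode constant 1.0 comes back as the IEEE-754 bit
   pattern 0x3ff0000000000000 and the SFmode constant 1.0 as 0x3f800000,
   zero-extended into the HOST_WIDE_INT. */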
11240 bool
11241 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
11242 {
11243
11244 /* We make a general exception for 0. */
11245 if (aarch64_float_const_zero_rtx_p (value))
11246 {
11247 *intval = 0;
11248 return true;
11249 }
11250
11251 scalar_float_mode mode;
11252 if (!CONST_DOUBLE_P (value)
11253 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
11254 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
11255 /* Only support up to DF mode. */
11256 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
11257 return false;
11258
11259 unsigned HOST_WIDE_INT ival = 0;
11260
11261 long res[2];
11262 real_to_target (res,
11263 CONST_DOUBLE_REAL_VALUE (value),
11264 REAL_MODE_FORMAT (mode));
11265
11266 if (mode == DFmode || mode == DDmode)
11267 {
11268 int order = BYTES_BIG_ENDIAN ? 1 : 0;
11269 ival = zext_hwi (res[order], 32);
11270 ival |= (zext_hwi (res[1 - order], 32) << 32);
11271 }
11272 else
11273 ival = zext_hwi (res[0], 32);
11274
11275 *intval = ival;
11276 return true;
11277 }
11278
11279 /* Return TRUE if rtx X is an immediate constant that can be moved using a
11280 single MOV(+MOVK) followed by an FMOV. */
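/* For example, 1.0 in DFmode has the bit pattern 0x3ff0000000000000,
   which a single MOVZ (#0x3ff0, LSL #48) can materialize, so an FMOV from
   the integer register is cheaper than an ADRP+LDR literal load.
   (Illustrative; the actual instruction count comes from
   aarch64_internal_mov_immediate below.) */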
11281 bool
11282 aarch64_float_const_rtx_p (rtx x)
11283 {
11284 machine_mode mode = GET_MODE (x);
11285 if (mode == VOIDmode)
11286 return false;
11287
11288 /* Determine whether it's cheaper to write float constants as
11289 mov/movk pairs over ldr/adrp pairs. */
11290 unsigned HOST_WIDE_INT ival;
11291
11292 if (CONST_DOUBLE_P (x)
11293 && SCALAR_FLOAT_MODE_P (mode)
11294 && aarch64_reinterpret_float_as_int (x, &ival))
11295 {
11296 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
11297 int num_instr = aarch64_internal_mov_immediate
11298 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11299 return num_instr < 3;
11300 }
11301
11302 return false;
11303 }
11304
11305 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
11306 Floating Point). */
11307 bool
11308 aarch64_float_const_zero_rtx_p (rtx x)
11309 {
11310 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
11311 zr as our callers expect, so no need to check the actual
11312 value if X is of Decimal Floating Point type. */
11313 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
11314 return false;
11315
11316 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
11317 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
11318 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
11319 }
11320
11321 /* Return TRUE if rtx X is immediate constant that fits in a single
11322 MOVI immediate operation. */
11323 bool
11324 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
11325 {
11326 if (!TARGET_SIMD)
11327 return false;
11328
11329 machine_mode vmode;
11330 scalar_int_mode imode;
11331 unsigned HOST_WIDE_INT ival;
11332
11333 if (CONST_DOUBLE_P (x)
11334 && SCALAR_FLOAT_MODE_P (mode))
11335 {
11336 if (!aarch64_reinterpret_float_as_int (x, &ival))
11337 return false;
11338
11339 /* We make a general exception for 0. */
11340 if (aarch64_float_const_zero_rtx_p (x))
11341 return true;
11342
11343 imode = int_mode_for_mode (mode).require ();
11344 }
11345 else if (CONST_INT_P (x)
11346 && is_a <scalar_int_mode> (mode, &imode))
11347 ival = INTVAL (x);
11348 else
11349 return false;
11350
11351 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we
11352 use a 128-bit vector mode. */
11353 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11354
11355 vmode = aarch64_simd_container_mode (imode, width);
11356 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11357
11358 return aarch64_simd_valid_immediate (v_op, NULL);
11359 }
11360
11361
11362 /* Return the fixed registers used for condition codes. */
11363
11364 static bool
11365 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11366 {
11367 *p1 = CC_REGNUM;
11368 *p2 = INVALID_REGNUM;
11369 return true;
11370 }
11371
11372 /* This function is used by the call expanders of the machine description.
11373 RESULT is the register in which the result is returned. It's NULL for
11374 "call" and "sibcall".
11375 MEM is the location of the function call.
11376 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
11377 SIBCALL indicates whether this is a normal call or a sibling call;
11378 a different pattern is generated accordingly. */
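/* As a sketch of the RTL this function builds (see the code below for the
   exact details):

     (parallel [(call (mem ...) (const_int 0))
                (unspec:DI [(const_int <abi>)] UNSPEC_CALLEE_ABI)
                (clobber (reg LR_REGNUM))])

   with the CALL wrapped in a SET when there is a result, and the clobber
   of LR replaced by a return rtx for sibling calls. */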
11379
11380 void
11381 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
11382 {
11383 rtx call, callee, tmp;
11384 rtvec vec;
11385 machine_mode mode;
11386
11387 gcc_assert (MEM_P (mem));
11388 callee = XEXP (mem, 0);
11389 mode = GET_MODE (callee);
11390 gcc_assert (mode == Pmode);
11391
11392 /* Decide if we should generate indirect calls by loading the
11393 address of the callee into a register before performing
11394 the branch-and-link. */
11395 if (SYMBOL_REF_P (callee)
11396 ? (aarch64_is_long_call_p (callee)
11397 || aarch64_is_noplt_call_p (callee))
11398 : !REG_P (callee))
11399 XEXP (mem, 0) = force_reg (mode, callee);
11400
11401 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11402
11403 if (result != NULL_RTX)
11404 call = gen_rtx_SET (result, call);
11405
11406 if (sibcall)
11407 tmp = ret_rtx;
11408 else
11409 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11410
11411 gcc_assert (CONST_INT_P (callee_abi));
11412 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11413 UNSPEC_CALLEE_ABI);
11414
11415 vec = gen_rtvec (3, call, callee_abi, tmp);
11416 call = gen_rtx_PARALLEL (VOIDmode, vec);
11417
11418 aarch64_emit_call_insn (call);
11419 }
11420
11421 /* Emit call insn with PAT and do aarch64-specific handling. */
11422
11423 void
11424 aarch64_emit_call_insn (rtx pat)
11425 {
11426 rtx insn = emit_call_insn (pat);
11427
11428 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11429 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11430 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11431 }
11432
11433 machine_mode
11434 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11435 {
11436 machine_mode mode_x = GET_MODE (x);
11437 rtx_code code_x = GET_CODE (x);
11438
11439 /* All floating point compares return CCFP if it is an equality
11440 comparison, and CCFPE otherwise. */
11441 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11442 {
11443 switch (code)
11444 {
11445 case EQ:
11446 case NE:
11447 case UNORDERED:
11448 case ORDERED:
11449 case UNLT:
11450 case UNLE:
11451 case UNGT:
11452 case UNGE:
11453 case UNEQ:
11454 return CCFPmode;
11455
11456 case LT:
11457 case LE:
11458 case GT:
11459 case GE:
11460 case LTGT:
11461 return CCFPEmode;
11462
11463 default:
11464 gcc_unreachable ();
11465 }
11466 }
11467
11468 /* Equality comparisons of short modes against zero can be performed
11469 using the TST instruction with the appropriate bitmask. */
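/* For example, an equality test of a QImode register against zero can be
   implemented as "tst wN, #0xff" (illustrative). */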
11470 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11471 && (code == EQ || code == NE)
11472 && (mode_x == HImode || mode_x == QImode))
11473 return CC_Zmode;
11474
11475 /* Similarly, comparisons of zero_extends from shorter modes can
11476 be performed using an ANDS with an immediate mask. */
11477 if (y == const0_rtx && code_x == ZERO_EXTEND
11478 && (mode_x == SImode || mode_x == DImode)
11479 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11480 && (code == EQ || code == NE))
11481 return CC_Zmode;
11482
11483 /* Zero extracts support equality comparisons. */
11484 if ((mode_x == SImode || mode_x == DImode)
11485 && y == const0_rtx
11486 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11487 && CONST_INT_P (XEXP (x, 2)))
11488 && (code == EQ || code == NE))
11489 return CC_Zmode;
11490
11491 /* ANDS/BICS/TST support equality and all signed comparisons. */
11492 if ((mode_x == SImode || mode_x == DImode)
11493 && y == const0_rtx
11494 && (code_x == AND)
11495 && (code == EQ || code == NE || code == LT || code == GE
11496 || code == GT || code == LE))
11497 return CC_NZVmode;
11498
11499 /* ADDS/SUBS correctly set N and Z flags. */
11500 if ((mode_x == SImode || mode_x == DImode)
11501 && y == const0_rtx
11502 && (code == EQ || code == NE || code == LT || code == GE)
11503 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
11504 return CC_NZmode;
11505
11506 /* A compare with a shifted operand. Because of canonicalization,
11507 the comparison will have to be swapped when we emit the assembly
11508 code. */
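/* For instance, (compare (ashift x 2) y) is output as "cmp y, x, lsl #2",
   so the condition has to be tested in swapped form (illustrative). */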
11509 if ((mode_x == SImode || mode_x == DImode)
11510 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11511 && (code_x == ASHIFT || code_x == ASHIFTRT
11512 || code_x == LSHIFTRT
11513 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11514 return CC_SWPmode;
11515
11516 /* Similarly for a negated operand, but we can only do this for
11517 equalities. */
11518 if ((mode_x == SImode || mode_x == DImode)
11519 && (REG_P (y) || SUBREG_P (y))
11520 && (code == EQ || code == NE)
11521 && code_x == NEG)
11522 return CC_Zmode;
11523
11524 /* A test for unsigned overflow from an addition. */
11525 if ((mode_x == DImode || mode_x == TImode)
11526 && (code == LTU || code == GEU)
11527 && code_x == PLUS
11528 && rtx_equal_p (XEXP (x, 0), y))
11529 return CC_Cmode;
11530
11531 /* A test for unsigned overflow from an add with carry. */
11532 if ((mode_x == DImode || mode_x == TImode)
11533 && (code == LTU || code == GEU)
11534 && code_x == PLUS
11535 && CONST_SCALAR_INT_P (y)
11536 && (rtx_mode_t (y, mode_x)
11537 == (wi::shwi (1, mode_x)
11538 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11539 return CC_ADCmode;
11540
11541 /* A test for signed overflow. */
11542 if ((mode_x == DImode || mode_x == TImode)
11543 && code == NE
11544 && code_x == PLUS
11545 && GET_CODE (y) == SIGN_EXTEND)
11546 return CC_Vmode;
11547
11548 /* For everything else, return CCmode. */
11549 return CCmode;
11550 }
11551
11552 static int
11553 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11554
11555 int
11556 aarch64_get_condition_code (rtx x)
11557 {
11558 machine_mode mode = GET_MODE (XEXP (x, 0));
11559 enum rtx_code comp_code = GET_CODE (x);
11560
11561 if (GET_MODE_CLASS (mode) != MODE_CC)
11562 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11563 return aarch64_get_condition_code_1 (mode, comp_code);
11564 }
11565
11566 static int
11567 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11568 {
11569 switch (mode)
11570 {
11571 case E_CCFPmode:
11572 case E_CCFPEmode:
11573 switch (comp_code)
11574 {
11575 case GE: return AARCH64_GE;
11576 case GT: return AARCH64_GT;
11577 case LE: return AARCH64_LS;
11578 case LT: return AARCH64_MI;
11579 case NE: return AARCH64_NE;
11580 case EQ: return AARCH64_EQ;
11581 case ORDERED: return AARCH64_VC;
11582 case UNORDERED: return AARCH64_VS;
11583 case UNLT: return AARCH64_LT;
11584 case UNLE: return AARCH64_LE;
11585 case UNGT: return AARCH64_HI;
11586 case UNGE: return AARCH64_PL;
11587 default: return -1;
11588 }
11589 break;
11590
11591 case E_CCmode:
11592 switch (comp_code)
11593 {
11594 case NE: return AARCH64_NE;
11595 case EQ: return AARCH64_EQ;
11596 case GE: return AARCH64_GE;
11597 case GT: return AARCH64_GT;
11598 case LE: return AARCH64_LE;
11599 case LT: return AARCH64_LT;
11600 case GEU: return AARCH64_CS;
11601 case GTU: return AARCH64_HI;
11602 case LEU: return AARCH64_LS;
11603 case LTU: return AARCH64_CC;
11604 default: return -1;
11605 }
11606 break;
11607
11608 case E_CC_SWPmode:
11609 switch (comp_code)
11610 {
11611 case NE: return AARCH64_NE;
11612 case EQ: return AARCH64_EQ;
11613 case GE: return AARCH64_LE;
11614 case GT: return AARCH64_LT;
11615 case LE: return AARCH64_GE;
11616 case LT: return AARCH64_GT;
11617 case GEU: return AARCH64_LS;
11618 case GTU: return AARCH64_CC;
11619 case LEU: return AARCH64_CS;
11620 case LTU: return AARCH64_HI;
11621 default: return -1;
11622 }
11623 break;
11624
11625 case E_CC_NZCmode:
11626 switch (comp_code)
11627 {
11628 case NE: return AARCH64_NE; /* = any */
11629 case EQ: return AARCH64_EQ; /* = none */
11630 case GE: return AARCH64_PL; /* = nfrst */
11631 case LT: return AARCH64_MI; /* = first */
11632 case GEU: return AARCH64_CS; /* = nlast */
11633 case GTU: return AARCH64_HI; /* = pmore */
11634 case LEU: return AARCH64_LS; /* = plast */
11635 case LTU: return AARCH64_CC; /* = last */
11636 default: return -1;
11637 }
11638 break;
11639
11640 case E_CC_NZVmode:
11641 switch (comp_code)
11642 {
11643 case NE: return AARCH64_NE;
11644 case EQ: return AARCH64_EQ;
11645 case GE: return AARCH64_PL;
11646 case LT: return AARCH64_MI;
11647 case GT: return AARCH64_GT;
11648 case LE: return AARCH64_LE;
11649 default: return -1;
11650 }
11651 break;
11652
11653 case E_CC_NZmode:
11654 switch (comp_code)
11655 {
11656 case NE: return AARCH64_NE;
11657 case EQ: return AARCH64_EQ;
11658 case GE: return AARCH64_PL;
11659 case LT: return AARCH64_MI;
11660 default: return -1;
11661 }
11662 break;
11663
11664 case E_CC_Zmode:
11665 switch (comp_code)
11666 {
11667 case NE: return AARCH64_NE;
11668 case EQ: return AARCH64_EQ;
11669 default: return -1;
11670 }
11671 break;
11672
11673 case E_CC_Cmode:
11674 switch (comp_code)
11675 {
11676 case LTU: return AARCH64_CS;
11677 case GEU: return AARCH64_CC;
11678 default: return -1;
11679 }
11680 break;
11681
11682 case E_CC_ADCmode:
11683 switch (comp_code)
11684 {
11685 case GEU: return AARCH64_CS;
11686 case LTU: return AARCH64_CC;
11687 default: return -1;
11688 }
11689 break;
11690
11691 case E_CC_Vmode:
11692 switch (comp_code)
11693 {
11694 case NE: return AARCH64_VS;
11695 case EQ: return AARCH64_VC;
11696 default: return -1;
11697 }
11698 break;
11699
11700 default:
11701 return -1;
11702 }
11703
11704 return -1;
11705 }
11706
11707 bool
11708 aarch64_const_vec_all_same_in_range_p (rtx x,
11709 HOST_WIDE_INT minval,
11710 HOST_WIDE_INT maxval)
11711 {
11712 rtx elt;
11713 return (const_vec_duplicate_p (x, &elt)
11714 && CONST_INT_P (elt)
11715 && IN_RANGE (INTVAL (elt), minval, maxval));
11716 }
11717
11718 bool
11719 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11720 {
11721 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11722 }
11723
11724 /* Return true if VEC is a constant in which every element is in the range
11725 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11726
11727 static bool
11728 aarch64_const_vec_all_in_range_p (rtx vec,
11729 HOST_WIDE_INT minval,
11730 HOST_WIDE_INT maxval)
11731 {
11732 if (!CONST_VECTOR_P (vec)
11733 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11734 return false;
11735
11736 int nunits;
11737 if (!CONST_VECTOR_STEPPED_P (vec))
11738 nunits = const_vector_encoded_nelts (vec);
11739 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11740 return false;
11741
11742 for (int i = 0; i < nunits; i++)
11743 {
11744 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11745 if (!CONST_INT_P (vec_elem)
11746 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11747 return false;
11748 }
11749 return true;
11750 }
11751
11752 /* N Z C V. */
11753 #define AARCH64_CC_V 1
11754 #define AARCH64_CC_C (1 << 1)
11755 #define AARCH64_CC_Z (1 << 2)
11756 #define AARCH64_CC_N (1 << 3)
11757
11758 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11759 static const int aarch64_nzcv_codes[] =
11760 {
11761 0, /* EQ, Z == 1. */
11762 AARCH64_CC_Z, /* NE, Z == 0. */
11763 0, /* CS, C == 1. */
11764 AARCH64_CC_C, /* CC, C == 0. */
11765 0, /* MI, N == 1. */
11766 AARCH64_CC_N, /* PL, N == 0. */
11767 0, /* VS, V == 1. */
11768 AARCH64_CC_V, /* VC, V == 0. */
11769 0, /* HI, C == 1 && Z == 0. */
11770 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11771 AARCH64_CC_V, /* GE, N == V. */
11772 0, /* LT, N != V. */
11773 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11774 0, /* LE, !(Z == 0 && N == V). */
11775 0, /* AL, Any. */
11776 0 /* NV, Any. */
11777 };
11778
11779 /* Print floating-point vector immediate operand X to F, negating it
11780 first if NEGATE is true. Return true on success, false if it isn't
11781 a constant we can handle. */
11782
11783 static bool
11784 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11785 {
11786 rtx elt;
11787
11788 if (!const_vec_duplicate_p (x, &elt))
11789 return false;
11790
11791 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11792 if (negate)
11793 r = real_value_negate (&r);
11794
11795 /* Handle the SVE single-bit immediates specially, since they have a
11796 fixed form in the assembly syntax. */
11797 if (real_equal (&r, &dconst0))
11798 asm_fprintf (f, "0.0");
11799 else if (real_equal (&r, &dconst2))
11800 asm_fprintf (f, "2.0");
11801 else if (real_equal (&r, &dconst1))
11802 asm_fprintf (f, "1.0");
11803 else if (real_equal (&r, &dconsthalf))
11804 asm_fprintf (f, "0.5");
11805 else
11806 {
11807 const int buf_size = 20;
11808 char float_buf[buf_size] = {'\0'};
11809 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11810 1, GET_MODE (elt));
11811 asm_fprintf (f, "%s", float_buf);
11812 }
11813
11814 return true;
11815 }
11816
11817 /* Return the assembler element-size suffix letter for SIZE (in bits). */
11818 static char
11819 sizetochar (int size)
11820 {
11821 switch (size)
11822 {
11823 case 64: return 'd';
11824 case 32: return 's';
11825 case 16: return 'h';
11826 case 8 : return 'b';
11827 default: gcc_unreachable ();
11828 }
11829 }
11830
11831 /* Print operand X to file F in a target specific manner according to CODE.
11832 The acceptable formatting commands given by CODE are:
11833 'c': An integer or symbol address without a preceding #
11834 sign.
11835 'C': Take the duplicated element in a vector constant
11836 and print it in hex.
11837 'D': Take the duplicated element in a vector constant
11838 and print it as an unsigned integer, in decimal.
11839 'e': Print the sign/zero-extend size as a character 8->b,
11840 16->h, 32->w. Can also be used for masks:
11841 0xff->b, 0xffff->h, 0xffffffff->w.
11842 'I': If the operand is a duplicated vector constant,
11843 replace it with the duplicated scalar. If the
11844 operand is then a floating-point constant, replace
11845 it with the integer bit representation. Print the
11846 transformed constant as a signed decimal number.
11847 'p': Prints N such that 2^N == X (X must be a power of 2 and
11848 a const_int).
11849 'P': Print the number of non-zero bits in X (a const_int).
11850 'H': Print the higher numbered register of a pair (TImode)
11851 of regs.
11852 'm': Print a condition (eq, ne, etc).
11853 'M': Same as 'm', but invert condition.
11854 'N': Take the duplicated element in a vector constant
11855 and print the negative of it in decimal.
11856 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11857 'S/T/U/V': Print a FP/SIMD register name for a register list.
11858 The register printed is the FP/SIMD register name
11859 of X + 0/1/2/3 for S/T/U/V.
11860 'R': Print a scalar Integer/FP/SIMD register name + 1.
11861 'X': Print bottom 16 bits of integer constant in hex.
11862 'w/x': Print a general register name or the zero register
11863 (32-bit or 64-bit).
11864 '0': Print a normal operand; if it's a general register,
11865 then we assume DImode.
11866 'k': Print NZCV for conditional compare instructions.
11867 'A': Output address constant representing the first
11868 argument of X, specifying a relocation offset
11869 if appropriate.
11870 'L': Output constant address specified by X
11871 with a relocation offset if appropriate.
11872 'G': Prints address of X, specifying a PC relative
11873 relocation mode if appropriate.
11874 'y': Output address of LDP or STP - this is used for
11875 some LDP/STPs which don't use a PARALLEL in their
11876 pattern (so the mode needs to be adjusted).
11877 'z': Output address of a typical LDP or STP. */
11878
11879 static void
11880 aarch64_print_operand (FILE *f, rtx x, int code)
11881 {
11882 rtx elt;
11883 switch (code)
11884 {
11885 case 'c':
11886 if (CONST_INT_P (x))
11887 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11888 else
11889 {
11890 poly_int64 offset;
11891 rtx base = strip_offset_and_salt (x, &offset);
11892 if (SYMBOL_REF_P (base))
11893 output_addr_const (f, x);
11894 else
11895 output_operand_lossage ("unsupported operand for code '%c'", code);
11896 }
11897 break;
11898
11899 case 'e':
11900 {
11901 x = unwrap_const_vec_duplicate (x);
11902 if (!CONST_INT_P (x))
11903 {
11904 output_operand_lossage ("invalid operand for '%%%c'", code);
11905 return;
11906 }
11907
11908 HOST_WIDE_INT val = INTVAL (x);
11909 if ((val & ~7) == 8 || val == 0xff)
11910 fputc ('b', f);
11911 else if ((val & ~7) == 16 || val == 0xffff)
11912 fputc ('h', f);
11913 else if ((val & ~7) == 32 || val == 0xffffffff)
11914 fputc ('w', f);
11915 else
11916 {
11917 output_operand_lossage ("invalid operand for '%%%c'", code);
11918 return;
11919 }
11920 }
11921 break;
11922
11923 case 'p':
11924 {
11925 int n;
11926
11927 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
11928 {
11929 output_operand_lossage ("invalid operand for '%%%c'", code);
11930 return;
11931 }
11932
11933 asm_fprintf (f, "%d", n);
11934 }
11935 break;
11936
11937 case 'P':
11938 if (!CONST_INT_P (x))
11939 {
11940 output_operand_lossage ("invalid operand for '%%%c'", code);
11941 return;
11942 }
11943
11944 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
11945 break;
11946
11947 case 'H':
11948 if (x == const0_rtx)
11949 {
11950 asm_fprintf (f, "xzr");
11951 break;
11952 }
11953
11954 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
11955 {
11956 output_operand_lossage ("invalid operand for '%%%c'", code);
11957 return;
11958 }
11959
11960 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
11961 break;
11962
11963 case 'I':
11964 {
11965 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
11966 if (CONST_INT_P (x))
11967 asm_fprintf (f, "%wd", INTVAL (x));
11968 else
11969 {
11970 output_operand_lossage ("invalid operand for '%%%c'", code);
11971 return;
11972 }
11973 break;
11974 }
11975
11976 case 'M':
11977 case 'm':
11978 {
11979 int cond_code;
11980 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
11981 if (x == const_true_rtx)
11982 {
11983 if (code == 'M')
11984 fputs ("nv", f);
11985 return;
11986 }
11987
11988 if (!COMPARISON_P (x))
11989 {
11990 output_operand_lossage ("invalid operand for '%%%c'", code);
11991 return;
11992 }
11993
11994 cond_code = aarch64_get_condition_code (x);
11995 gcc_assert (cond_code >= 0);
11996 if (code == 'M')
11997 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
11998 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
11999 fputs (aarch64_sve_condition_codes[cond_code], f);
12000 else
12001 fputs (aarch64_condition_codes[cond_code], f);
12002 }
12003 break;
12004
12005 case 'N':
12006 if (!const_vec_duplicate_p (x, &elt))
12007 {
12008 output_operand_lossage ("invalid vector constant");
12009 return;
12010 }
12011
12012 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12013 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
12014 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12015 && aarch64_print_vector_float_operand (f, x, true))
12016 ;
12017 else
12018 {
12019 output_operand_lossage ("invalid vector constant");
12020 return;
12021 }
12022 break;
12023
12024 case 'b':
12025 case 'h':
12026 case 's':
12027 case 'd':
12028 case 'q':
12029 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12030 {
12031 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12032 return;
12033 }
12034 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
12035 break;
12036
12037 case 'S':
12038 case 'T':
12039 case 'U':
12040 case 'V':
12041 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12042 {
12043 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12044 return;
12045 }
12046 asm_fprintf (f, "%c%d",
12047 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
12048 REGNO (x) - V0_REGNUM + (code - 'S'));
12049 break;
12050
12051 case 'R':
12052 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
12053 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
12054 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
12055 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
12056 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
12057 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12058 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
12059 else
12060 output_operand_lossage ("incompatible register operand for '%%%c'",
12061 code);
12062 break;
12063
12064 case 'X':
12065 if (!CONST_INT_P (x))
12066 {
12067 output_operand_lossage ("invalid operand for '%%%c'", code);
12068 return;
12069 }
12070 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
12071 break;
12072
12073 case 'C':
12074 {
12075 /* Print a replicated constant in hex. */
12076 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12077 {
12078 output_operand_lossage ("invalid operand for '%%%c'", code);
12079 return;
12080 }
12081 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12082 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12083 }
12084 break;
12085
12086 case 'D':
12087 {
12088 /* Print a replicated constant in decimal, treating it as
12089 unsigned. */
12090 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12091 {
12092 output_operand_lossage ("invalid operand for '%%%c'", code);
12093 return;
12094 }
12095 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12096 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12097 }
12098 break;
12099
12100 case 'w':
12101 case 'x':
12102 if (x == const0_rtx
12103 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
12104 {
12105 asm_fprintf (f, "%czr", code);
12106 break;
12107 }
12108
12109 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12110 {
12111 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
12112 break;
12113 }
12114
12115 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12116 {
12117 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12118 break;
12119 }
12120
12121 /* Fall through */
12122
12123 case 0:
12124 if (x == NULL)
12125 {
12126 output_operand_lossage ("missing operand");
12127 return;
12128 }
12129
12130 switch (GET_CODE (x))
12131 {
12132 case REG:
12133 if (aarch64_sve_data_mode_p (GET_MODE (x)))
12134 {
12135 if (REG_NREGS (x) == 1)
12136 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12137 else
12138 {
12139 char suffix
12140 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12141 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12142 REGNO (x) - V0_REGNUM, suffix,
12143 END_REGNO (x) - V0_REGNUM - 1, suffix);
12144 }
12145 }
12146 else
12147 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12148 break;
12149
12150 case MEM:
12151 output_address (GET_MODE (x), XEXP (x, 0));
12152 break;
12153
12154 case LABEL_REF:
12155 case SYMBOL_REF:
12156 output_addr_const (asm_out_file, x);
12157 break;
12158
12159 case CONST_INT:
12160 asm_fprintf (f, "%wd", INTVAL (x));
12161 break;
12162
12163 case CONST:
12164 if (!VECTOR_MODE_P (GET_MODE (x)))
12165 {
12166 output_addr_const (asm_out_file, x);
12167 break;
12168 }
12169 /* fall through */
12170
12171 case CONST_VECTOR:
12172 if (!const_vec_duplicate_p (x, &elt))
12173 {
12174 output_operand_lossage ("invalid vector constant");
12175 return;
12176 }
12177
12178 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12179 asm_fprintf (f, "%wd", INTVAL (elt));
12180 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12181 && aarch64_print_vector_float_operand (f, x, false))
12182 ;
12183 else
12184 {
12185 output_operand_lossage ("invalid vector constant");
12186 return;
12187 }
12188 break;
12189
12190 case CONST_DOUBLE:
12191 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12192 be getting CONST_DOUBLEs holding integers. */
12193 gcc_assert (GET_MODE (x) != VOIDmode);
12194 if (aarch64_float_const_zero_rtx_p (x))
12195 {
12196 fputc ('0', f);
12197 break;
12198 }
12199 else if (aarch64_float_const_representable_p (x))
12200 {
12201 #define buf_size 20
12202 char float_buf[buf_size] = {'\0'};
12203 real_to_decimal_for_mode (float_buf,
12204 CONST_DOUBLE_REAL_VALUE (x),
12205 buf_size, buf_size,
12206 1, GET_MODE (x));
12207 asm_fprintf (asm_out_file, "%s", float_buf);
12208 break;
12209 #undef buf_size
12210 }
12211 output_operand_lossage ("invalid constant");
12212 return;
12213 default:
12214 output_operand_lossage ("invalid operand");
12215 return;
12216 }
12217 break;
12218
12219 case 'A':
12220 if (GET_CODE (x) == HIGH)
12221 x = XEXP (x, 0);
12222
12223 switch (aarch64_classify_symbolic_expression (x))
12224 {
12225 case SYMBOL_SMALL_GOT_4G:
12226 asm_fprintf (asm_out_file, ":got:");
12227 break;
12228
12229 case SYMBOL_SMALL_TLSGD:
12230 asm_fprintf (asm_out_file, ":tlsgd:");
12231 break;
12232
12233 case SYMBOL_SMALL_TLSDESC:
12234 asm_fprintf (asm_out_file, ":tlsdesc:");
12235 break;
12236
12237 case SYMBOL_SMALL_TLSIE:
12238 asm_fprintf (asm_out_file, ":gottprel:");
12239 break;
12240
12241 case SYMBOL_TLSLE24:
12242 asm_fprintf (asm_out_file, ":tprel:");
12243 break;
12244
12245 case SYMBOL_TINY_GOT:
12246 gcc_unreachable ();
12247 break;
12248
12249 default:
12250 break;
12251 }
12252 output_addr_const (asm_out_file, x);
12253 break;
12254
12255 case 'L':
12256 switch (aarch64_classify_symbolic_expression (x))
12257 {
12258 case SYMBOL_SMALL_GOT_4G:
12259 asm_fprintf (asm_out_file, ":got_lo12:");
12260 break;
12261
12262 case SYMBOL_SMALL_TLSGD:
12263 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12264 break;
12265
12266 case SYMBOL_SMALL_TLSDESC:
12267 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12268 break;
12269
12270 case SYMBOL_SMALL_TLSIE:
12271 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12272 break;
12273
12274 case SYMBOL_TLSLE12:
12275 asm_fprintf (asm_out_file, ":tprel_lo12:");
12276 break;
12277
12278 case SYMBOL_TLSLE24:
12279 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12280 break;
12281
12282 case SYMBOL_TINY_GOT:
12283 asm_fprintf (asm_out_file, ":got:");
12284 break;
12285
12286 case SYMBOL_TINY_TLSIE:
12287 asm_fprintf (asm_out_file, ":gottprel:");
12288 break;
12289
12290 default:
12291 break;
12292 }
12293 output_addr_const (asm_out_file, x);
12294 break;
12295
12296 case 'G':
12297 switch (aarch64_classify_symbolic_expression (x))
12298 {
12299 case SYMBOL_TLSLE24:
12300 asm_fprintf (asm_out_file, ":tprel_hi12:");
12301 break;
12302 default:
12303 break;
12304 }
12305 output_addr_const (asm_out_file, x);
12306 break;
12307
12308 case 'k':
12309 {
12310 HOST_WIDE_INT cond_code;
12311
12312 if (!CONST_INT_P (x))
12313 {
12314 output_operand_lossage ("invalid operand for '%%%c'", code);
12315 return;
12316 }
12317
12318 cond_code = INTVAL (x);
12319 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12320 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12321 }
12322 break;
12323
12324 case 'y':
12325 case 'z':
12326 {
12327 machine_mode mode = GET_MODE (x);
12328
12329 if (!MEM_P (x)
12330 || (code == 'y'
12331 && maybe_ne (GET_MODE_SIZE (mode), 8)
12332 && maybe_ne (GET_MODE_SIZE (mode), 16)))
12333 {
12334 output_operand_lossage ("invalid operand for '%%%c'", code);
12335 return;
12336 }
12337
12338 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12339 code == 'y'
12340 ? ADDR_QUERY_LDP_STP_N
12341 : ADDR_QUERY_LDP_STP))
12342 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12343 }
12344 break;
12345
12346 default:
12347 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12348 return;
12349 }
12350 }
12351
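/* A few illustrative outputs of the operand codes handled above, assuming
   the usual register naming (x0-x30, w0-w30, v0-v31):
     %w with (reg:SI x5)            -> "w5"
     %x with const0_rtx             -> "xzr"
     %e with (const_int 0xffff)     -> "h"
     %p with (const_int 64)         -> "6"
     %P with (const_int 0xf0)       -> "4"
     %X with (const_int 0x12345678) -> "0x5678"  */
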
12352 /* Print address 'x' of a memory access with mode 'mode'.
12353 'type' is the context required by aarch64_classify_address; it
12354 distinguishes normal memory accesses from LDP/STP address queries. */
12355 static bool
12356 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12357 aarch64_addr_query_type type)
12358 {
12359 struct aarch64_address_info addr;
12360 unsigned int size, vec_flags;
12361
12362 /* Check all addresses are Pmode - including ILP32. */
12363 if (GET_MODE (x) != Pmode
12364 && (!CONST_INT_P (x)
12365 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12366 {
12367 output_operand_lossage ("invalid address mode");
12368 return false;
12369 }
12370
12371 if (aarch64_classify_address (&addr, x, mode, true, type))
12372 switch (addr.type)
12373 {
12374 case ADDRESS_REG_IMM:
12375 if (known_eq (addr.const_offset, 0))
12376 {
12377 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12378 return true;
12379 }
12380
12381 vec_flags = aarch64_classify_vector_mode (mode);
12382 if (vec_flags & VEC_ANY_SVE)
12383 {
12384 HOST_WIDE_INT vnum
12385 = exact_div (addr.const_offset,
12386 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12387 asm_fprintf (f, "[%s, #%wd, mul vl]",
12388 reg_names[REGNO (addr.base)], vnum);
12389 return true;
12390 }
12391
12392 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12393 INTVAL (addr.offset));
12394 return true;
12395
12396 case ADDRESS_REG_REG:
12397 if (addr.shift == 0)
12398 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12399 reg_names [REGNO (addr.offset)]);
12400 else
12401 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12402 reg_names [REGNO (addr.offset)], addr.shift);
12403 return true;
12404
12405 case ADDRESS_REG_UXTW:
12406 if (addr.shift == 0)
12407 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12408 REGNO (addr.offset) - R0_REGNUM);
12409 else
12410 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12411 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12412 return true;
12413
12414 case ADDRESS_REG_SXTW:
12415 if (addr.shift == 0)
12416 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12417 REGNO (addr.offset) - R0_REGNUM);
12418 else
12419 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12420 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12421 return true;
12422
12423 case ADDRESS_REG_WB:
12424 /* Writeback is only supported for fixed-width modes. */
12425 size = GET_MODE_SIZE (mode).to_constant ();
12426 switch (GET_CODE (x))
12427 {
12428 case PRE_INC:
12429 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12430 return true;
12431 case POST_INC:
12432 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12433 return true;
12434 case PRE_DEC:
12435 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12436 return true;
12437 case POST_DEC:
12438 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12439 return true;
12440 case PRE_MODIFY:
12441 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12442 INTVAL (addr.offset));
12443 return true;
12444 case POST_MODIFY:
12445 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12446 INTVAL (addr.offset));
12447 return true;
12448 default:
12449 break;
12450 }
12451 break;
12452
12453 case ADDRESS_LO_SUM:
12454 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12455 output_addr_const (f, addr.offset);
12456 asm_fprintf (f, "]");
12457 return true;
12458
12459 case ADDRESS_SYMBOLIC:
12460 output_addr_const (f, x);
12461 return true;
12462 }
12463
12464 return false;
12465 }
12466
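/* Illustrative outputs of aarch64_print_address_internal, assuming the
   address has already been accepted by aarch64_classify_address:
     base x1, zero offset              -> "[x1]"
     base x1, immediate offset 16      -> "[x1, 16]"
     base x0, index x1, shift 3        -> "[x0, x1, lsl 3]"
     POST_INC of a DImode access on x2 -> "[x2], 8"
     LO_SUM of x0 and a symbol foo     -> "[x0, #:lo12:foo]"  */
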
12467 /* Print address 'x' of a memory access with mode 'mode'. */
12468 static void
12469 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12470 {
12471 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12472 output_addr_const (f, x);
12473 }
12474
12475 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12476
12477 static bool
12478 aarch64_output_addr_const_extra (FILE *file, rtx x)
12479 {
12480 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12481 {
12482 output_addr_const (file, XVECEXP (x, 0, 0));
12483 return true;
12484 }
12485 return false;
12486 }
12487
12488 bool
12489 aarch64_label_mentioned_p (rtx x)
12490 {
12491 const char *fmt;
12492 int i;
12493
12494 if (LABEL_REF_P (x))
12495 return true;
12496
12497 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12498 referencing instruction, but they are constant offsets, not
12499 symbols. */
12500 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12501 return false;
12502
12503 fmt = GET_RTX_FORMAT (GET_CODE (x));
12504 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12505 {
12506 if (fmt[i] == 'E')
12507 {
12508 int j;
12509
12510 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12511 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12512 return 1;
12513 }
12514 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12515 return 1;
12516 }
12517
12518 return 0;
12519 }
12520
12521 /* Implement REGNO_REG_CLASS. */
12522
12523 enum reg_class
12524 aarch64_regno_regclass (unsigned regno)
12525 {
12526 if (STUB_REGNUM_P (regno))
12527 return STUB_REGS;
12528
12529 if (GP_REGNUM_P (regno))
12530 return GENERAL_REGS;
12531
12532 if (regno == SP_REGNUM)
12533 return STACK_REG;
12534
12535 if (regno == FRAME_POINTER_REGNUM
12536 || regno == ARG_POINTER_REGNUM)
12537 return POINTER_REGS;
12538
12539 if (FP_REGNUM_P (regno))
12540 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12541 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12542
12543 if (PR_REGNUM_P (regno))
12544 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12545
12546 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12547 return FFR_REGS;
12548
12549 return NO_REGS;
12550 }
12551
12552 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12553 If OFFSET is out of range, return an offset of an anchor point
12554 that is in range. Return 0 otherwise. */
12555
12556 static HOST_WIDE_INT
12557 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12558 machine_mode mode)
12559 {
12560 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12561 if (size > 16)
12562 return (offset + 0x400) & ~0x7f0;
12563
12564 /* For offsets that aren't a multiple of the access size, the limit is
12565 -256...255. */
12566 if (offset & (size - 1))
12567 {
12568 /* BLKmode typically uses LDP of X-registers. */
12569 if (mode == BLKmode)
12570 return (offset + 512) & ~0x3ff;
12571 return (offset + 0x100) & ~0x1ff;
12572 }
12573
12574 /* Small negative offsets are supported. */
12575 if (IN_RANGE (offset, -256, 0))
12576 return 0;
12577
12578 if (mode == TImode || mode == TFmode || mode == TDmode)
12579 return (offset + 0x100) & ~0x1ff;
12580
12581 /* Otherwise use the unsigned 12-bit offset scaled by the access size. */
12582 return offset & (~0xfff * size);
12583 }
12584
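/* Worked example: for a non-BLKmode 4-byte access at the misaligned offset
   0x206, the offset is not a multiple of the access size, so the anchor is
   (0x206 + 0x100) & ~0x1ff == 0x200 and the residual offset 6 fits the
   signed 9-bit unscaled range [-256, 255].  For an aligned 4-byte access at
   offset 0x13004, the anchor is 0x13004 & ~0x3fff == 0x10000 and the
   residual 0x3004 fits the scaled 12-bit LDR/STR range.  */
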
12585 static rtx
12586 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
12587 {
12588 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12589 where mask is selected by alignment and size of the offset.
12590 We try to pick as large a range for the offset as possible to
12591 maximize the chance of a CSE. However, for aligned addresses
12592 we limit the range to 4k so that structures with different sized
12593 elements are likely to use the same base. We need to be careful
12594 not to split a CONST for some forms of address expression, otherwise
12595 it will generate sub-optimal code. */
12596
12597 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12598 {
12599 rtx base = XEXP (x, 0);
12600 rtx offset_rtx = XEXP (x, 1);
12601 HOST_WIDE_INT offset = INTVAL (offset_rtx);
12602
12603 if (GET_CODE (base) == PLUS)
12604 {
12605 rtx op0 = XEXP (base, 0);
12606 rtx op1 = XEXP (base, 1);
12607
12608 /* Force any scaling into a temp for CSE. */
12609 op0 = force_reg (Pmode, op0);
12610 op1 = force_reg (Pmode, op1);
12611
12612 /* Let the pointer register be in op0. */
12613 if (REG_POINTER (op1))
12614 std::swap (op0, op1);
12615
12616 /* If the pointer is virtual or frame related, then we know that
12617 virtual register instantiation or register elimination is going
12618 to apply a second constant. We want the two constants folded
12619 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12620 if (virt_or_elim_regno_p (REGNO (op0)))
12621 {
12622 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12623 NULL_RTX, true, OPTAB_DIRECT);
12624 return gen_rtx_PLUS (Pmode, base, op1);
12625 }
12626
12627 /* Otherwise, in order to encourage CSE (and thence loop strength
12628 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12629 base = expand_binop (Pmode, add_optab, op0, op1,
12630 NULL_RTX, true, OPTAB_DIRECT);
12631 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12632 }
12633
12634 HOST_WIDE_INT size;
12635 if (GET_MODE_SIZE (mode).is_constant (&size))
12636 {
12637 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12638 mode);
12639 if (base_offset != 0)
12640 {
12641 base = plus_constant (Pmode, base, base_offset);
12642 base = force_operand (base, NULL_RTX);
12643 return plus_constant (Pmode, base, offset - base_offset);
12644 }
12645 }
12646 }
12647
12648 return x;
12649 }
12650
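/* For illustration, with the SImode address (plus (reg R) (const_int
   0x13004)) the anchor computed above is 0x10000, so the address is
   rewritten as (plus (R + 0x10000) (const_int 0x3004)), letting the forced
   (R + 0x10000) base be CSEd across neighbouring accesses whose offsets
   share the same anchor.  */
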
12651 static reg_class_t
12652 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12653 reg_class_t rclass,
12654 machine_mode mode,
12655 secondary_reload_info *sri)
12656 {
12657 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12658 LDR and STR. See the comment at the head of aarch64-sve.md for
12659 more details about the big-endian handling. */
12660 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12661 if (reg_class_subset_p (rclass, FP_REGS)
12662 && !((REG_P (x) && HARD_REGISTER_P (x))
12663 || aarch64_simd_valid_immediate (x, NULL))
12664 && mode != VNx16QImode
12665 && (vec_flags & VEC_SVE_DATA)
12666 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12667 {
12668 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12669 return NO_REGS;
12670 }
12671
12672 /* If we have to disable direct literal pool loads and stores because the
12673 function is too big, then we need a scratch register. */
12674 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12675 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12676 || targetm.vector_mode_supported_p (GET_MODE (x)))
12677 && !aarch64_pcrelative_literal_loads)
12678 {
12679 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12680 return NO_REGS;
12681 }
12682
12683 /* Without the TARGET_SIMD instructions we cannot move a Q register
12684 to a Q register directly. We need a scratch. */
12685 if (REG_P (x)
12686 && (mode == TFmode
12687 || mode == TImode
12688 || mode == TDmode
12689 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
12690 && mode == GET_MODE (x)
12691 && !TARGET_SIMD
12692 && FP_REGNUM_P (REGNO (x))
12693 && reg_class_subset_p (rclass, FP_REGS))
12694 {
12695 sri->icode = code_for_aarch64_reload_mov (mode);
12696 return NO_REGS;
12697 }
12698
12699 /* A TFmode, TImode or TDmode memory access should be handled via FP_REGS
12700 because AArch64 has richer addressing modes for LDR/STR instructions
12701 than LDP/STP instructions. */
12702 if (TARGET_FLOAT && rclass == GENERAL_REGS
12703 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12704 return FP_REGS;
12705
12706 if (rclass == FP_REGS
12707 && (mode == TImode || mode == TFmode || mode == TDmode)
12708 && CONSTANT_P (x))
12709 return GENERAL_REGS;
12710
12711 return NO_REGS;
12712 }
12713
12714 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12715
12716 static bool
12717 aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
12718 reg_class_t class2)
12719 {
12720 if (!TARGET_SIMD
12721 && reg_classes_intersect_p (class1, FP_REGS)
12722 && reg_classes_intersect_p (class2, FP_REGS))
12723 {
12724 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12725 so we can't easily split a move involving tuples of 128-bit
12726 vectors. Force the copy through memory instead.
12727
12728 (Tuples of 64-bit vectors are fine.) */
12729 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12730 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12731 return true;
12732 }
12733 return false;
12734 }
12735
12736 static bool
12737 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12738 {
12739 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12740
12741 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12742 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12743 if (frame_pointer_needed)
12744 return to == HARD_FRAME_POINTER_REGNUM;
12745 return true;
12746 }
12747
12748 poly_int64
12749 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12750 {
12751 if (to == HARD_FRAME_POINTER_REGNUM)
12752 {
12753 if (from == ARG_POINTER_REGNUM)
12754 return cfun->machine->frame.hard_fp_offset;
12755
12756 if (from == FRAME_POINTER_REGNUM)
12757 return cfun->machine->frame.hard_fp_offset
12758 - cfun->machine->frame.locals_offset;
12759 }
12760
12761 if (to == STACK_POINTER_REGNUM)
12762 {
12763 if (from == FRAME_POINTER_REGNUM)
12764 return cfun->machine->frame.frame_size
12765 - cfun->machine->frame.locals_offset;
12766 }
12767
12768 return cfun->machine->frame.frame_size;
12769 }
12770
12771
12772 /* Get return address without mangling. */
12773
12774 rtx
12775 aarch64_return_addr_rtx (void)
12776 {
12777 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12778 /* Note: aarch64_return_address_signing_enabled only
12779 works after cfun->machine->frame.laid_out is set,
12780 so here we don't know if the return address will
12781 be signed or not. */
12782 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12783 emit_move_insn (lr, val);
12784 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12785 return lr;
12786 }
12787
12788
12789 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12790 previous frame. */
12791
12792 rtx
12793 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12794 {
12795 if (count != 0)
12796 return const0_rtx;
12797 return aarch64_return_addr_rtx ();
12798 }
12799
12800 static void
12801 aarch64_asm_trampoline_template (FILE *f)
12802 {
12803 /* Even if the current function doesn't have branch protection, some
12804 later function might, and since this template is only generated once
12805 we have to add a BTI just in case. */
12806 asm_fprintf (f, "\thint\t34 // bti c\n");
12807
12808 if (TARGET_ILP32)
12809 {
12810 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12811 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12812 }
12813 else
12814 {
12815 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12816 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12817 }
12818 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12819
12820 /* We always emit a speculation barrier.
12821 This is because the same trampoline template is used for every nested
12822 function. Since nested functions are not particularly common or
12823 performance-critical we don't worry too much about the extra instructions to copy
12824 around.
12825 This is not yet a problem, since we have not yet implemented function
12826 specific attributes to choose between hardening against straight line
12827 speculation or not, but such function specific attributes are likely to
12828 happen in the future. */
12829 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
12830
12831 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12832 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12833 }
12834
12835 static void
12836 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
12837 {
12838 rtx fnaddr, mem, a_tramp;
12839 const int tramp_code_sz = 24;
12840
12841 /* We don't need to copy the trailing D-words; we fill those in below. */
12842 /* We create our own memory address in Pmode so that `emit_block_move` can
12843 use parts of the backend which expect Pmode addresses. */
12844 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
12845 emit_block_move (gen_rtx_MEM (BLKmode, temp),
12846 assemble_trampoline_template (),
12847 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
12848 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
12849 fnaddr = XEXP (DECL_RTL (fndecl), 0);
12850 if (GET_MODE (fnaddr) != ptr_mode)
12851 fnaddr = convert_memory_address (ptr_mode, fnaddr);
12852 emit_move_insn (mem, fnaddr);
12853
12854 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
12855 emit_move_insn (mem, chain_value);
12856
12857 /* XXX We should really define a "clear_cache" pattern and use
12858 gen_clear_cache(). */
12859 a_tramp = XEXP (m_tramp, 0);
12860 maybe_emit_call_builtin___clear_cache (a_tramp,
12861 plus_constant (ptr_mode,
12862 a_tramp,
12863 TRAMPOLINE_SIZE));
12864 }
12865
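/* Illustrative layout of the initialized trampoline for LP64, assuming IP1
   is x17 and the static chain register is x18 (as the reg_names lookups
   above suggest):
     bytes  0-23   code: bti c; ldr x17, .+20; ldr x18, .+24;
                         br x17; dsb sy; isb
     bytes 24-31   address of the nested function (loaded into x17)
     bytes 32-39   static chain value (loaded into x18)  */
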
12866 static unsigned char
12867 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
12868 {
12869 /* ??? Logically we should only need to provide a value when
12870 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
12871 can hold MODE, but at the moment we need to handle all modes.
12872 Just ignore any runtime parts for registers that can't store them. */
12873 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
12874 unsigned int nregs, vec_flags;
12875 switch (regclass)
12876 {
12877 case STUB_REGS:
12878 case TAILCALL_ADDR_REGS:
12879 case POINTER_REGS:
12880 case GENERAL_REGS:
12881 case ALL_REGS:
12882 case POINTER_AND_FP_REGS:
12883 case FP_REGS:
12884 case FP_LO_REGS:
12885 case FP_LO8_REGS:
12886 vec_flags = aarch64_classify_vector_mode (mode);
12887 if ((vec_flags & VEC_SVE_DATA)
12888 && constant_multiple_p (GET_MODE_SIZE (mode),
12889 aarch64_vl_bytes (mode, vec_flags), &nregs))
12890 return nregs;
12891 return (vec_flags & VEC_ADVSIMD
12892 ? CEIL (lowest_size, UNITS_PER_VREG)
12893 : CEIL (lowest_size, UNITS_PER_WORD));
12894 case STACK_REG:
12895 case PR_REGS:
12896 case PR_LO_REGS:
12897 case PR_HI_REGS:
12898 case FFR_REGS:
12899 case PR_AND_FFR_REGS:
12900 return 1;
12901
12902 case NO_REGS:
12903 return 0;
12904
12905 default:
12906 break;
12907 }
12908 gcc_unreachable ();
12909 }
12910
12911 static reg_class_t
12912 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
12913 {
12914 if (regclass == POINTER_REGS)
12915 return GENERAL_REGS;
12916
12917 if (regclass == STACK_REG)
12918 {
12919 if (REG_P (x)
12920 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
12921 return regclass;
12922
12923 return NO_REGS;
12924 }
12925
12926 /* Register elimination can result in a request for
12927 SP+constant->FP_REGS. We cannot support such operations, which
12928 use SP as the source and an FP_REG as the destination, so reject
12929 them outright here. */
12930 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
12931 {
12932 rtx lhs = XEXP (x, 0);
12933
12934 /* Look through a possible SUBREG introduced by ILP32. */
12935 if (SUBREG_P (lhs))
12936 lhs = SUBREG_REG (lhs);
12937
12938 gcc_assert (REG_P (lhs));
12939 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
12940 POINTER_REGS));
12941 return NO_REGS;
12942 }
12943
12944 return regclass;
12945 }
12946
12947 void
12948 aarch64_asm_output_labelref (FILE* f, const char *name)
12949 {
12950 asm_fprintf (f, "%U%s", name);
12951 }
12952
12953 static void
12954 aarch64_elf_asm_constructor (rtx symbol, int priority)
12955 {
12956 if (priority == DEFAULT_INIT_PRIORITY)
12957 default_ctor_section_asm_out_constructor (symbol, priority);
12958 else
12959 {
12960 section *s;
12961 /* While priority is known to be in the range [0, 65535], and so 18
12962 bytes would be enough, the compiler might not know that. To avoid
12963 a -Wformat-truncation false positive, use a larger size. */
12964 char buf[23];
12965 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
12966 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12967 switch_to_section (s);
12968 assemble_align (POINTER_SIZE);
12969 assemble_aligned_integer (POINTER_BYTES, symbol);
12970 }
12971 }
12972
12973 static void
12974 aarch64_elf_asm_destructor (rtx symbol, int priority)
12975 {
12976 if (priority == DEFAULT_INIT_PRIORITY)
12977 default_dtor_section_asm_out_destructor (symbol, priority);
12978 else
12979 {
12980 section *s;
12981 /* While priority is known to be in the range [0, 65535], and so 18
12982 bytes would be enough, the compiler might not know that. To avoid
12983 a -Wformat-truncation false positive, use a larger size. */
12984 char buf[23];
12985 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
12986 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12987 switch_to_section (s);
12988 assemble_align (POINTER_SIZE);
12989 assemble_aligned_integer (POINTER_BYTES, symbol);
12990 }
12991 }
12992
12993 const char*
12994 aarch64_output_casesi (rtx *operands)
12995 {
12996 char buf[100];
12997 char label[100];
12998 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
12999 int index;
13000 static const char *const patterns[4][2] =
13001 {
13002 {
13003 "ldrb\t%w3, [%0,%w1,uxtw]",
13004 "add\t%3, %4, %w3, sxtb #2"
13005 },
13006 {
13007 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13008 "add\t%3, %4, %w3, sxth #2"
13009 },
13010 {
13011 "ldr\t%w3, [%0,%w1,uxtw #2]",
13012 "add\t%3, %4, %w3, sxtw #2"
13013 },
13014 /* We assume that DImode is only generated when not optimizing and
13015 that we don't really need 64-bit address offsets. That would
13016 imply an object file with 8GB of code in a single function! */
13017 {
13018 "ldr\t%w3, [%0,%w1,uxtw #2]",
13019 "add\t%3, %4, %w3, sxtw #2"
13020 }
13021 };
13022
13023 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
13024
13025 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
13026 index = exact_log2 (GET_MODE_SIZE (mode));
13027
13028 gcc_assert (index >= 0 && index <= 3);
13029
13030 /* Need to implement table size reduction, by changing the code below. */
13031 output_asm_insn (patterns[index][0], operands);
13032 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
13033 snprintf (buf, sizeof (buf),
13034 "adr\t%%4, %s", targetm.strip_name_encoding (label));
13035 output_asm_insn (buf, operands);
13036 output_asm_insn (patterns[index][1], operands);
13037 output_asm_insn ("br\t%3", operands);
13038 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13039 operands);
13040 assemble_label (asm_out_file, label);
13041 return "";
13042 }
13043
13044
13045 /* Return size in bits of an arithmetic operand which is shifted/scaled and
13046 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13047 operator. */
13048
13049 int
13050 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
13051 {
13052 if (shift >= 0 && shift <= 3)
13053 {
13054 int size;
13055 for (size = 8; size <= 32; size *= 2)
13056 {
13057 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
13058 if (mask == bits << shift)
13059 return size;
13060 }
13061 }
13062 return 0;
13063 }
13064
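/* Worked examples: aarch64_uxt_size (0, 0xffff) == 16, i.e. the mask
   corresponds to a UXTH with no shift, and aarch64_uxt_size (2, 0x3fc) == 8,
   since 0xff << 2 == 0x3fc, i.e. a UXTB combined with LSL #2 in an
   extended-register ADD/SUB.  A mask such as 0x1ff matches none of the
   widths for any shift, so the function returns 0.  */
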
13065 /* Constant pools are per function only when PC relative
13066 literal loads are true or we are in the large memory
13067 model. */
13068
13069 static inline bool
13070 aarch64_can_use_per_function_literal_pools_p (void)
13071 {
13072 return (aarch64_pcrelative_literal_loads
13073 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
13074 }
13075
13076 static bool
13077 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
13078 {
13079 /* We can't use blocks for constants when we're using a per-function
13080 constant pool. */
13081 return !aarch64_can_use_per_function_literal_pools_p ();
13082 }
13083
13084 /* Select appropriate section for constants depending
13085 on where we place literal pools. */
13086
13087 static section *
13088 aarch64_select_rtx_section (machine_mode mode,
13089 rtx x,
13090 unsigned HOST_WIDE_INT align)
13091 {
13092 if (aarch64_can_use_per_function_literal_pools_p ())
13093 return function_section (current_function_decl);
13094
13095 return default_elf_select_rtx_section (mode, x, align);
13096 }
13097
13098 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13099 void
13100 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
13101 HOST_WIDE_INT offset)
13102 {
13103 /* When using per-function literal pools, we must ensure that any code
13104 section is aligned to the minimal instruction length, lest we get
13105 errors from the assembler re "unaligned instructions". */
13106 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
13107 ASM_OUTPUT_ALIGN (f, 2);
13108 }
13109
13110 /* Costs. */
13111
13112 /* Helper function for rtx cost calculation. Strip a shift expression
13113 from X. Returns the inner operand if successful, or the original
13114 expression on failure. */
13115 static rtx
13116 aarch64_strip_shift (rtx x)
13117 {
13118 rtx op = x;
13119
13120 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13121 we can convert both to ROR during final output. */
13122 if ((GET_CODE (op) == ASHIFT
13123 || GET_CODE (op) == ASHIFTRT
13124 || GET_CODE (op) == LSHIFTRT
13125 || GET_CODE (op) == ROTATERT
13126 || GET_CODE (op) == ROTATE)
13127 && CONST_INT_P (XEXP (op, 1)))
13128 return XEXP (op, 0);
13129
13130 if (GET_CODE (op) == MULT
13131 && CONST_INT_P (XEXP (op, 1))
13132 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13133 return XEXP (op, 0);
13134
13135 return x;
13136 }
13137
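/* For example, aarch64_strip_shift reduces (ashift (reg X) (const_int 3))
   and (mult (reg X) (const_int 8)) to (reg X), since both forms can be
   folded into the shifted-register variant of the surrounding operation,
   whereas a shift by a register amount is returned unchanged.  */
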
13138 /* Helper function for rtx cost calculation. Strip an extend
13139 expression from X. Returns the inner operand if successful, or the
13140 original expression on failure. We deal with a number of possible
13141 canonicalization variations here. If STRIP_SHIFT is true, then
13142 we can strip off a shift also. */
13143 static rtx
13144 aarch64_strip_extend (rtx x, bool strip_shift)
13145 {
13146 scalar_int_mode mode;
13147 rtx op = x;
13148
13149 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13150 return op;
13151
13152 if (GET_CODE (op) == AND
13153 && GET_CODE (XEXP (op, 0)) == MULT
13154 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13155 && CONST_INT_P (XEXP (op, 1))
13156 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13157 INTVAL (XEXP (op, 1))) != 0)
13158 return XEXP (XEXP (op, 0), 0);
13159
13160 /* Now handle extended register, as this may also have an optional
13161 left shift by 1..4. */
13162 if (strip_shift
13163 && GET_CODE (op) == ASHIFT
13164 && CONST_INT_P (XEXP (op, 1))
13165 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13166 op = XEXP (op, 0);
13167
13168 if (GET_CODE (op) == ZERO_EXTEND
13169 || GET_CODE (op) == SIGN_EXTEND)
13170 op = XEXP (op, 0);
13171
13172 if (op != x)
13173 return op;
13174
13175 return x;
13176 }
13177
13178 /* Helper function for rtx cost calculation. Strip extension as well as any
13179 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13180 successful, or the original expression on failure. */
13181 static rtx
13182 aarch64_strip_extend_vec_half (rtx x)
13183 {
13184 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13185 {
13186 x = XEXP (x, 0);
13187 if (GET_CODE (x) == VEC_SELECT
13188 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13189 XEXP (x, 1)))
13190 x = XEXP (x, 0);
13191 }
13192 return x;
13193 }
13194
13195 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13196 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13197 operand if successful, or the original expression on failure. */
13198 static rtx
13199 aarch64_strip_duplicate_vec_elt (rtx x)
13200 {
13201 if (GET_CODE (x) == VEC_DUPLICATE
13202 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13203 {
13204 x = XEXP (x, 0);
13205 if (GET_CODE (x) == VEC_SELECT)
13206 x = XEXP (x, 0);
13207 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13208 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13209 x = XEXP (XEXP (x, 0), 0);
13210 }
13211 return x;
13212 }
13213
13214 /* Return true iff CODE is a shift supported in combination
13215 with arithmetic instructions. */
13216
13217 static bool
13218 aarch64_shift_p (enum rtx_code code)
13219 {
13220 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13221 }
13222
13223
13224 /* Return true iff X is a cheap shift without a sign extend. */
13225
13226 static bool
13227 aarch64_cheap_mult_shift_p (rtx x)
13228 {
13229 rtx op0, op1;
13230
13231 op0 = XEXP (x, 0);
13232 op1 = XEXP (x, 1);
13233
13234 if (!(aarch64_tune_params.extra_tuning_flags
13235 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13236 return false;
13237
13238 if (GET_CODE (op0) == SIGN_EXTEND)
13239 return false;
13240
13241 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13242 && UINTVAL (op1) <= 4)
13243 return true;
13244
13245 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13246 return false;
13247
13248 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13249
13250 if (l2 > 0 && l2 <= 4)
13251 return true;
13252
13253 return false;
13254 }
13255
13256 /* Helper function for rtx cost calculation. Calculate the cost of
13257 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13258 Return the calculated cost of the expression, recursing manually in to
13259 operands where needed. */
13260
13261 static int
13262 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13263 {
13264 rtx op0, op1;
13265 const struct cpu_cost_table *extra_cost
13266 = aarch64_tune_params.insn_extra_cost;
13267 int cost = 0;
13268 bool compound_p = (outer == PLUS || outer == MINUS);
13269 machine_mode mode = GET_MODE (x);
13270
13271 gcc_checking_assert (code == MULT);
13272
13273 op0 = XEXP (x, 0);
13274 op1 = XEXP (x, 1);
13275
13276 if (VECTOR_MODE_P (mode))
13277 {
13278 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13279 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
13280 {
13281 /* The select-operand-high-half versions of the instruction have the
13282 same cost as the three vector version - don't add the costs of the
13283 extension or selection into the costs of the multiply. */
13284 op0 = aarch64_strip_extend_vec_half (op0);
13285 op1 = aarch64_strip_extend_vec_half (op1);
13286 /* The by-element versions of the instruction have the same costs as
13287 the normal 3-vector version. We make an assumption that the input
13288 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13289 costing of a MUL by element pre RA is a bit optimistic. */
13290 op0 = aarch64_strip_duplicate_vec_elt (op0);
13291 op1 = aarch64_strip_duplicate_vec_elt (op1);
13292 }
13293 cost += rtx_cost (op0, mode, MULT, 0, speed);
13294 cost += rtx_cost (op1, mode, MULT, 1, speed);
13295 if (speed)
13296 {
13297 if (GET_CODE (x) == MULT)
13298 cost += extra_cost->vect.mult;
13299 /* This is to catch the SSRA costing currently flowing here. */
13300 else
13301 cost += extra_cost->vect.alu;
13302 }
13303 return cost;
13304 }
13305
13306 /* Integer multiply/fma. */
13307 if (GET_MODE_CLASS (mode) == MODE_INT)
13308 {
13309 /* The multiply will be canonicalized as a shift, cost it as such. */
13310 if (aarch64_shift_p (GET_CODE (x))
13311 || (CONST_INT_P (op1)
13312 && exact_log2 (INTVAL (op1)) > 0))
13313 {
13314 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13315 || GET_CODE (op0) == SIGN_EXTEND;
13316 if (speed)
13317 {
13318 if (compound_p)
13319 {
13320 /* If the shift is considered cheap,
13321 then don't add any cost. */
13322 if (aarch64_cheap_mult_shift_p (x))
13323 ;
13324 else if (REG_P (op1))
13325 /* ARITH + shift-by-register. */
13326 cost += extra_cost->alu.arith_shift_reg;
13327 else if (is_extend)
13328 /* ARITH + extended register. We don't have a cost field
13329 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13330 cost += extra_cost->alu.extend_arith;
13331 else
13332 /* ARITH + shift-by-immediate. */
13333 cost += extra_cost->alu.arith_shift;
13334 }
13335 else
13336 /* LSL (immediate). */
13337 cost += extra_cost->alu.shift;
13338
13339 }
13340 /* Strip extends as we will have costed them in the case above. */
13341 if (is_extend)
13342 op0 = aarch64_strip_extend (op0, true);
13343
13344 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13345
13346 return cost;
13347 }
13348
13349 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13350 compound and let the below cases handle it. After all, MNEG is a
13351 special-case alias of MSUB. */
13352 if (GET_CODE (op0) == NEG)
13353 {
13354 op0 = XEXP (op0, 0);
13355 compound_p = true;
13356 }
13357
13358 /* Integer multiplies or FMAs have zero/sign extending variants. */
13359 if ((GET_CODE (op0) == ZERO_EXTEND
13360 && GET_CODE (op1) == ZERO_EXTEND)
13361 || (GET_CODE (op0) == SIGN_EXTEND
13362 && GET_CODE (op1) == SIGN_EXTEND))
13363 {
13364 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13365 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13366
13367 if (speed)
13368 {
13369 if (compound_p)
13370 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13371 cost += extra_cost->mult[0].extend_add;
13372 else
13373 /* MUL/SMULL/UMULL. */
13374 cost += extra_cost->mult[0].extend;
13375 }
13376
13377 return cost;
13378 }
13379
13380 /* This is either an integer multiply or a MADD. In both cases
13381 we want to recurse and cost the operands. */
13382 cost += rtx_cost (op0, mode, MULT, 0, speed);
13383 cost += rtx_cost (op1, mode, MULT, 1, speed);
13384
13385 if (speed)
13386 {
13387 if (compound_p)
13388 /* MADD/MSUB. */
13389 cost += extra_cost->mult[mode == DImode].add;
13390 else
13391 /* MUL. */
13392 cost += extra_cost->mult[mode == DImode].simple;
13393 }
13394
13395 return cost;
13396 }
13397 else
13398 {
13399 if (speed)
13400 {
13401 /* Floating-point FMA/FMUL can also support negations of the
13402 operands, unless the rounding mode is upward or downward in
13403 which case FNMUL is different from FMUL with operand negation. */
13404 bool neg0 = GET_CODE (op0) == NEG;
13405 bool neg1 = GET_CODE (op1) == NEG;
13406 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13407 {
13408 if (neg0)
13409 op0 = XEXP (op0, 0);
13410 if (neg1)
13411 op1 = XEXP (op1, 0);
13412 }
13413
13414 if (compound_p)
13415 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13416 cost += extra_cost->fp[mode == DFmode].fma;
13417 else
13418 /* FMUL/FNMUL. */
13419 cost += extra_cost->fp[mode == DFmode].mult;
13420 }
13421
13422 cost += rtx_cost (op0, mode, MULT, 0, speed);
13423 cost += rtx_cost (op1, mode, MULT, 1, speed);
13424 return cost;
13425 }
13426 }
13427
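/* For illustration: a DImode (mult (reg) (const_int 8)) that is not part of
   a PLUS/MINUS takes the shift path above (exact_log2 (8) == 3), so when
   optimizing for speed it is costed as the register operand plus
   extra_cost->alu.shift, i.e. as an LSL rather than a MUL.  When the outer
   code is PLUS or MINUS, the same rtx is instead costed under the
   arith_shift (or cheap-shift) rules.  */
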
13428 static int
13429 aarch64_address_cost (rtx x,
13430 machine_mode mode,
13431 addr_space_t as ATTRIBUTE_UNUSED,
13432 bool speed)
13433 {
13434 enum rtx_code c = GET_CODE (x);
13435 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13436 struct aarch64_address_info info;
13437 int cost = 0;
13438 info.shift = 0;
13439
13440 if (!aarch64_classify_address (&info, x, mode, false))
13441 {
13442 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13443 {
13444 /* This is a CONST or SYMBOL ref which will be split
13445 in a different way depending on the code model in use.
13446 Cost it through the generic infrastructure. */
13447 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13448 /* Divide through by the cost of one instruction to
13449 bring it to the same units as the address costs. */
13450 cost_symbol_ref /= COSTS_N_INSNS (1);
13451 /* The cost is then the cost of preparing the address,
13452 followed by an immediate (possibly 0) offset. */
13453 return cost_symbol_ref + addr_cost->imm_offset;
13454 }
13455 else
13456 {
13457 /* This is most likely a jump table from a case
13458 statement. */
13459 return addr_cost->register_offset;
13460 }
13461 }
13462
13463 switch (info.type)
13464 {
13465 case ADDRESS_LO_SUM:
13466 case ADDRESS_SYMBOLIC:
13467 case ADDRESS_REG_IMM:
13468 cost += addr_cost->imm_offset;
13469 break;
13470
13471 case ADDRESS_REG_WB:
13472 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13473 cost += addr_cost->pre_modify;
13474 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13475 {
13476 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13477 if (nvectors == 3)
13478 cost += addr_cost->post_modify_ld3_st3;
13479 else if (nvectors == 4)
13480 cost += addr_cost->post_modify_ld4_st4;
13481 else
13482 cost += addr_cost->post_modify;
13483 }
13484 else
13485 gcc_unreachable ();
13486
13487 break;
13488
13489 case ADDRESS_REG_REG:
13490 cost += addr_cost->register_offset;
13491 break;
13492
13493 case ADDRESS_REG_SXTW:
13494 cost += addr_cost->register_sextend;
13495 break;
13496
13497 case ADDRESS_REG_UXTW:
13498 cost += addr_cost->register_zextend;
13499 break;
13500
13501 default:
13502 gcc_unreachable ();
13503 }
13504
13505
13506 if (info.shift > 0)
13507 {
13508 /* For the sake of calculating the cost of the shifted register
13509 component, we can treat same sized modes in the same way. */
13510 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13511 cost += addr_cost->addr_scale_costs.hi;
13512 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13513 cost += addr_cost->addr_scale_costs.si;
13514 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13515 cost += addr_cost->addr_scale_costs.di;
13516 else
13517 /* We can't tell, or this is a 128-bit vector. */
13518 cost += addr_cost->addr_scale_costs.ti;
13519 }
13520
13521 return cost;
13522 }
13523
13524 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13525 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13526 to be taken. */
13527
13528 int
13529 aarch64_branch_cost (bool speed_p, bool predictable_p)
13530 {
13531 /* When optimizing for speed, use the cost of unpredictable branches. */
13532 const struct cpu_branch_cost *branch_costs =
13533 aarch64_tune_params.branch_costs;
13534
13535 if (!speed_p || predictable_p)
13536 return branch_costs->predictable;
13537 else
13538 return branch_costs->unpredictable;
13539 }
13540
13541 /* Return true if X is a zero or sign extract
13542 usable in an ADD or SUB (extended register) instruction. */
13543 static bool
13544 aarch64_rtx_arith_op_extract_p (rtx x)
13545 {
13546 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13547 No shift. */
13548 if (GET_CODE (x) == SIGN_EXTEND
13549 || GET_CODE (x) == ZERO_EXTEND)
13550 return REG_P (XEXP (x, 0));
13551
13552 return false;
13553 }
13554
13555 static bool
13556 aarch64_frint_unspec_p (unsigned int u)
13557 {
13558 switch (u)
13559 {
13560 case UNSPEC_FRINTZ:
13561 case UNSPEC_FRINTP:
13562 case UNSPEC_FRINTM:
13563 case UNSPEC_FRINTA:
13564 case UNSPEC_FRINTN:
13565 case UNSPEC_FRINTX:
13566 case UNSPEC_FRINTI:
13567 return true;
13568
13569 default:
13570 return false;
13571 }
13572 }
13573
13574 /* Return true iff X is an rtx that will match an extr instruction
13575 i.e. as described in the *extr<mode>5_insn family of patterns.
13576 *RES_OP0 and *RES_OP1 will be set to the operands of the shifts involved
13577 on success and will be NULL_RTX otherwise. */
13578
13579 static bool
13580 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13581 {
13582 rtx op0, op1;
13583 scalar_int_mode mode;
13584 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13585 return false;
13586
13587 *res_op0 = NULL_RTX;
13588 *res_op1 = NULL_RTX;
13589
13590 if (GET_CODE (x) != IOR)
13591 return false;
13592
13593 op0 = XEXP (x, 0);
13594 op1 = XEXP (x, 1);
13595
13596 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13597 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13598 {
13599 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13600 if (GET_CODE (op1) == ASHIFT)
13601 std::swap (op0, op1);
13602
13603 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13604 return false;
13605
13606 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13607 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13608
13609 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13610 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13611 {
13612 *res_op0 = XEXP (op0, 0);
13613 *res_op1 = XEXP (op1, 0);
13614 return true;
13615 }
13616 }
13617
13618 return false;
13619 }
13620
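/* For example, the DImode rtx
     (ior (ashift (reg A) (const_int 16)) (lshiftrt (reg B) (const_int 48)))
   satisfies 16 + 48 == 64, so *RES_OP0 is set to A, *RES_OP1 to B and true
   is returned; such an rtx would typically be emitted as
   "extr xd, xa, xb, 48" by the DImode instance of the *extr<mode>5_insn
   family mentioned above.  */
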
13621 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13622 storing it in *COST. Result is true if the total cost of the operation
13623 has now been calculated. */
13624 static bool
13625 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13626 {
13627 rtx inner;
13628 rtx comparator;
13629 enum rtx_code cmpcode;
13630 const struct cpu_cost_table *extra_cost
13631 = aarch64_tune_params.insn_extra_cost;
13632
13633 if (COMPARISON_P (op0))
13634 {
13635 inner = XEXP (op0, 0);
13636 comparator = XEXP (op0, 1);
13637 cmpcode = GET_CODE (op0);
13638 }
13639 else
13640 {
13641 inner = op0;
13642 comparator = const0_rtx;
13643 cmpcode = NE;
13644 }
13645
13646 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13647 {
13648 /* Conditional branch. */
13649 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13650 return true;
13651 else
13652 {
13653 if (cmpcode == NE || cmpcode == EQ)
13654 {
13655 if (comparator == const0_rtx)
13656 {
13657 /* TBZ/TBNZ/CBZ/CBNZ. */
13658 if (GET_CODE (inner) == ZERO_EXTRACT)
13659 /* TBZ/TBNZ. */
13660 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13661 ZERO_EXTRACT, 0, speed);
13662 else
13663 /* CBZ/CBNZ. */
13664 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13665
13666 return true;
13667 }
13668 if (register_operand (inner, VOIDmode)
13669 && aarch64_imm24 (comparator, VOIDmode))
13670 {
13671 /* SUB and SUBS. */
13672 *cost += COSTS_N_INSNS (2);
13673 if (speed)
13674 *cost += extra_cost->alu.arith * 2;
13675 return true;
13676 }
13677 }
13678 else if (cmpcode == LT || cmpcode == GE)
13679 {
13680 /* TBZ/TBNZ. */
13681 if (comparator == const0_rtx)
13682 return true;
13683 }
13684 }
13685 }
13686 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13687 {
13688 /* CCMP. */
13689 if (GET_CODE (op1) == COMPARE)
13690 {
13691 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13692 if (XEXP (op1, 1) == const0_rtx)
13693 *cost += 1;
13694 if (speed)
13695 {
13696 machine_mode mode = GET_MODE (XEXP (op1, 0));
13697
13698 if (GET_MODE_CLASS (mode) == MODE_INT)
13699 *cost += extra_cost->alu.arith;
13700 else
13701 *cost += extra_cost->fp[mode == DFmode].compare;
13702 }
13703 return true;
13704 }
13705
13706 /* It's a conditional operation based on the status flags,
13707 so it must be some flavor of CSEL. */
13708
13709 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
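      /* For instance (illustrative only), an op1 of the form (neg (reg x)) in
	 (if_then_else (lt ...) (neg (reg x)) (reg y)) maps onto a single
	 CSNEG, so the NEG is stripped below rather than costed separately.  */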
13710 if (GET_CODE (op1) == NEG
13711 || GET_CODE (op1) == NOT
13712 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13713 op1 = XEXP (op1, 0);
13714 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13715 {
13716 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13717 op1 = XEXP (op1, 0);
13718 op2 = XEXP (op2, 0);
13719 }
13720 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13721 {
13722 inner = XEXP (op1, 0);
13723 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13724 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13725 op1 = XEXP (inner, 0);
13726 }
13727
13728 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13729 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13730 return true;
13731 }
13732
13733   /* We don't know what this is; cost all operands.  */
13734 return false;
13735 }
13736
13737 /* Check whether X is a bitfield operation of the form shift + extend that
13738 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13739 operand to which the bitfield operation is applied. Otherwise return
13740 NULL_RTX. */
13741
13742 static rtx
13743 aarch64_extend_bitfield_pattern_p (rtx x)
13744 {
13745 rtx_code outer_code = GET_CODE (x);
13746 machine_mode outer_mode = GET_MODE (x);
13747
13748 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13749 && outer_mode != SImode && outer_mode != DImode)
13750 return NULL_RTX;
13751
13752 rtx inner = XEXP (x, 0);
13753 rtx_code inner_code = GET_CODE (inner);
13754 machine_mode inner_mode = GET_MODE (inner);
13755 rtx op = NULL_RTX;
13756
13757 switch (inner_code)
13758 {
13759 case ASHIFT:
13760 if (CONST_INT_P (XEXP (inner, 1))
13761 && (inner_mode == QImode || inner_mode == HImode))
13762 op = XEXP (inner, 0);
13763 break;
13764 case LSHIFTRT:
13765 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
13766 && (inner_mode == QImode || inner_mode == HImode))
13767 op = XEXP (inner, 0);
13768 break;
13769 case ASHIFTRT:
13770 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
13771 && (inner_mode == QImode || inner_mode == HImode))
13772 op = XEXP (inner, 0);
13773 break;
13774 default:
13775 break;
13776 }
13777
13778 return op;
13779 }
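/* Illustrative examples of the shapes accepted above (register choices are
   arbitrary):
     (zero_extend:SI (lshiftrt:HI (reg:HI r) (const_int 3)))  -> UBFX
     (sign_extend:DI (ashiftrt:HI (reg:HI r) (const_int 3)))  -> SBFX
     (zero_extend:SI (ashift:QI (reg:QI r) (const_int 2)))    -> UBFIZ  */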
13780
13781 /* Return true if the mask and a shift amount from an RTX of the form
13782 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
13783 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
13784
13785 bool
13786 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
13787 rtx shft_amnt)
13788 {
13789 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
13790 && INTVAL (mask) > 0
13791 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
13792 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
13793 && (UINTVAL (mask)
13794 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
13795 }
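/* A worked example (for illustration only): in SImode, with
   MASK == 0x00ffff00 and SHFT_AMNT == 8, all of the checks above pass:
   (0x00ffff00 >> 8) + 1 == 0x10000 is a power of two and the low 8 bits of
   the mask are clear, so (x << 8) & 0x00ffff00 can be implemented as
   UBFIZ w0, w0, #8, #16.  */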
13796
13797 /* Return true if the masks and a shift amount from an RTX of the form
13798 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
13799    a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */
13800
13801 bool
13802 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
13803 unsigned HOST_WIDE_INT mask1,
13804 unsigned HOST_WIDE_INT shft_amnt,
13805 unsigned HOST_WIDE_INT mask2)
13806 {
13807 unsigned HOST_WIDE_INT t;
13808
13809   /* Verify that the two masks are exact complements of one another.  */
13810 if (mask1 != ~mask2)
13811 return false;
13812
13813 /* Verify that mask2 is not all zeros or ones. */
13814 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
13815 return false;
13816
13817 /* The shift amount should always be less than the mode size. */
13818 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
13819
13820 /* Verify that the mask being shifted is contiguous and would be in the
13821 least significant bits after shifting by shft_amnt. */
13822 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
13823 return (t == (t & -t));
13824 }
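/* A worked example (illustrative): in DImode, with SHFT_AMNT == 8,
   MASK2 == 0xff00 and MASK1 == ~0xff00, MASK2 + (1 << 8) == 0x10000 is a
   power of two, so (x & ~0xff00) | ((y << 8) & 0xff00) can be implemented
   as BFI x0, x1, #8, #8, inserting 8 bits of y at bit 8 of x.  */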
13825
13826 /* Calculate the cost of calculating X, storing it in *COST. Result
13827 is true if the total cost of the operation has now been calculated. */
13828 static bool
13829 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
13830 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
13831 {
13832 rtx op0, op1, op2;
13833 const struct cpu_cost_table *extra_cost
13834 = aarch64_tune_params.insn_extra_cost;
13835 rtx_code code = GET_CODE (x);
13836 scalar_int_mode int_mode;
13837
13838 /* By default, assume that everything has equivalent cost to the
13839 cheapest instruction. Any additional costs are applied as a delta
13840 above this default. */
13841 *cost = COSTS_N_INSNS (1);
13842
13843 switch (code)
13844 {
13845 case SET:
13846 /* The cost depends entirely on the operands to SET. */
13847 *cost = 0;
13848 op0 = SET_DEST (x);
13849 op1 = SET_SRC (x);
13850
13851 switch (GET_CODE (op0))
13852 {
13853 case MEM:
13854 if (speed)
13855 {
13856 rtx address = XEXP (op0, 0);
13857 if (VECTOR_MODE_P (mode))
13858 *cost += extra_cost->ldst.storev;
13859 else if (GET_MODE_CLASS (mode) == MODE_INT)
13860 *cost += extra_cost->ldst.store;
13861 else if (mode == SFmode || mode == SDmode)
13862 *cost += extra_cost->ldst.storef;
13863 else if (mode == DFmode || mode == DDmode)
13864 *cost += extra_cost->ldst.stored;
13865
13866 *cost +=
13867 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13868 0, speed));
13869 }
13870
13871 *cost += rtx_cost (op1, mode, SET, 1, speed);
13872 return true;
13873
13874 case SUBREG:
13875 if (! REG_P (SUBREG_REG (op0)))
13876 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
13877
13878 /* Fall through. */
13879 case REG:
13880 /* The cost is one per vector-register copied. */
13881 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
13882 {
13883 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
13884 *cost = COSTS_N_INSNS (nregs);
13885 }
13886 /* const0_rtx is in general free, but we will use an
13887 instruction to set a register to 0. */
13888 else if (REG_P (op1) || op1 == const0_rtx)
13889 {
13890 /* The cost is 1 per register copied. */
13891 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
13892 *cost = COSTS_N_INSNS (nregs);
13893 }
13894 else
13895 /* Cost is just the cost of the RHS of the set. */
13896 *cost += rtx_cost (op1, mode, SET, 1, speed);
13897 return true;
13898
13899 case ZERO_EXTRACT:
13900 case SIGN_EXTRACT:
13901 /* Bit-field insertion. Strip any redundant widening of
13902 the RHS to meet the width of the target. */
13903 if (SUBREG_P (op1))
13904 op1 = SUBREG_REG (op1);
13905 if ((GET_CODE (op1) == ZERO_EXTEND
13906 || GET_CODE (op1) == SIGN_EXTEND)
13907 && CONST_INT_P (XEXP (op0, 1))
13908 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
13909 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
13910 op1 = XEXP (op1, 0);
13911
13912 if (CONST_INT_P (op1))
13913 {
13914 /* MOV immediate is assumed to always be cheap. */
13915 *cost = COSTS_N_INSNS (1);
13916 }
13917 else
13918 {
13919 /* BFM. */
13920 if (speed)
13921 *cost += extra_cost->alu.bfi;
13922 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
13923 }
13924
13925 return true;
13926
13927 default:
13928 /* We can't make sense of this, assume default cost. */
13929 *cost = COSTS_N_INSNS (1);
13930 return false;
13931 }
13932 return false;
13933
13934 case CONST_INT:
13935       /* If an instruction can incorporate a constant directly, its
13936          pattern avoids calling rtx_cost () on that constant.  So if
13937          rtx_cost () is called on a constant, it is usually because
13938          the constant must be moved into a register by one or more
13939          instructions.
13940
13941          The exception is constant 0, which can be expressed as
13942          XZR/WZR and is therefore free.  Even then, a plain
13943          (set (reg) (const0_rtx)) still needs a move instruction,
13944          but we catch that case when we cost the SET, so we don't
13945          need to consider it here.  */
13946 if (x == const0_rtx)
13947 *cost = 0;
13948 else
13949 {
13950           /* To a first approximation, the cost of building any other
13951              constant is proportional to the number of instructions
13952              required to build it.  This is true whether we are
13953              compiling for SPEED or otherwise.  */
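          /* For example (purely illustrative), a 64-bit constant that needs a
             MOVZ plus three MOVKs to materialise would be costed as
             COSTS_N_INSNS (4) here; the exact count comes from
             aarch64_internal_mov_immediate.  */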
13954 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
13955 ? SImode : DImode;
13956 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
13957 (NULL_RTX, x, false, imode));
13958 }
13959 return true;
13960
13961 case CONST_DOUBLE:
13962
13963 /* First determine number of instructions to do the move
13964 as an integer constant. */
13965 if (!aarch64_float_const_representable_p (x)
13966 && !aarch64_can_const_movi_rtx_p (x, mode)
13967 && aarch64_float_const_rtx_p (x))
13968 {
13969 unsigned HOST_WIDE_INT ival;
13970 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
13971 gcc_assert (succeed);
13972
13973 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
13974 ? DImode : SImode;
13975 int ncost = aarch64_internal_mov_immediate
13976 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
13977 *cost += COSTS_N_INSNS (ncost);
13978 return true;
13979 }
13980
13981 if (speed)
13982 {
13983 /* mov[df,sf]_aarch64. */
13984 if (aarch64_float_const_representable_p (x))
13985 /* FMOV (scalar immediate). */
13986 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
13987 else if (!aarch64_float_const_zero_rtx_p (x))
13988 {
13989 /* This will be a load from memory. */
13990 if (mode == DFmode || mode == DDmode)
13991 *cost += extra_cost->ldst.loadd;
13992 else
13993 *cost += extra_cost->ldst.loadf;
13994 }
13995 else
13996 /* Otherwise this is +0.0. We get this using MOVI d0, #0
13997           or MOV v0.s[0], wzr, neither of which is modeled by the
13998           cost tables.  Just use the default cost.  */
13999 {
14000 }
14001 }
14002
14003 return true;
14004
14005 case MEM:
14006 if (speed)
14007 {
14008 /* For loads we want the base cost of a load, plus an
14009 approximation for the additional cost of the addressing
14010 mode. */
14011 rtx address = XEXP (x, 0);
14012 if (VECTOR_MODE_P (mode))
14013 *cost += extra_cost->ldst.loadv;
14014 else if (GET_MODE_CLASS (mode) == MODE_INT)
14015 *cost += extra_cost->ldst.load;
14016 else if (mode == SFmode || mode == SDmode)
14017 *cost += extra_cost->ldst.loadf;
14018 else if (mode == DFmode || mode == DDmode)
14019 *cost += extra_cost->ldst.loadd;
14020
14021 *cost +=
14022 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14023 0, speed));
14024 }
14025
14026 return true;
14027
14028 case NEG:
14029 op0 = XEXP (x, 0);
14030
14031 if (VECTOR_MODE_P (mode))
14032 {
14033 if (speed)
14034 {
14035 /* FNEG. */
14036 *cost += extra_cost->vect.alu;
14037 }
14038 return false;
14039 }
14040
14041 if (GET_MODE_CLASS (mode) == MODE_INT)
14042 {
14043 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14044 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14045 {
14046 /* CSETM. */
14047 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
14048 return true;
14049 }
14050
14051 /* Cost this as SUB wzr, X. */
14052 op0 = CONST0_RTX (mode);
14053 op1 = XEXP (x, 0);
14054 goto cost_minus;
14055 }
14056
14057 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14058 {
14059 /* Support (neg(fma...)) as a single instruction only if
14060 sign of zeros is unimportant. This matches the decision
14061 making in aarch64.md. */
14062 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
14063 {
14064 /* FNMADD. */
14065 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14066 return true;
14067 }
14068 if (GET_CODE (op0) == MULT)
14069 {
14070 /* FNMUL. */
14071 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14072 return true;
14073 }
14074 if (speed)
14075 /* FNEG. */
14076 *cost += extra_cost->fp[mode == DFmode].neg;
14077 return false;
14078 }
14079
14080 return false;
14081
14082 case CLRSB:
14083 case CLZ:
14084 if (speed)
14085 {
14086 if (VECTOR_MODE_P (mode))
14087 *cost += extra_cost->vect.alu;
14088 else
14089 *cost += extra_cost->alu.clz;
14090 }
14091
14092 return false;
14093
14094 case CTZ:
14095 *cost = COSTS_N_INSNS (2);
14096
14097 if (speed)
14098 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
14099 return false;
14100
14101 case COMPARE:
14102 op0 = XEXP (x, 0);
14103 op1 = XEXP (x, 1);
14104
14105 if (op1 == const0_rtx
14106 && GET_CODE (op0) == AND)
14107 {
14108 x = op0;
14109 mode = GET_MODE (op0);
14110 goto cost_logic;
14111 }
14112
14113 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14114 {
14115           /* TODO: A write to the CC flags possibly costs extra; this
14116              needs encoding in the cost tables.  */
14117
14118 mode = GET_MODE (op0);
14119 /* ANDS. */
14120 if (GET_CODE (op0) == AND)
14121 {
14122 x = op0;
14123 goto cost_logic;
14124 }
14125
14126 if (GET_CODE (op0) == PLUS)
14127 {
14128 /* ADDS (and CMN alias). */
14129 x = op0;
14130 goto cost_plus;
14131 }
14132
14133 if (GET_CODE (op0) == MINUS)
14134 {
14135 /* SUBS. */
14136 x = op0;
14137 goto cost_minus;
14138 }
14139
14140 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14141 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14142 && CONST_INT_P (XEXP (op0, 2)))
14143 {
14144 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14145 Handle it here directly rather than going to cost_logic
14146 since we know the immediate generated for the TST is valid
14147 so we can avoid creating an intermediate rtx for it only
14148 for costing purposes. */
14149 if (speed)
14150 *cost += extra_cost->alu.logical;
14151
14152 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14153 ZERO_EXTRACT, 0, speed);
14154 return true;
14155 }
14156
14157 if (GET_CODE (op1) == NEG)
14158 {
14159 /* CMN. */
14160 if (speed)
14161 *cost += extra_cost->alu.arith;
14162
14163 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14164 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14165 return true;
14166 }
14167
14168 /* CMP.
14169
14170 Compare can freely swap the order of operands, and
14171 canonicalization puts the more complex operation first.
14172 But the integer MINUS logic expects the shift/extend
14173 operation in op1. */
14174 if (! (REG_P (op0)
14175 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14176 {
14177 op0 = XEXP (x, 1);
14178 op1 = XEXP (x, 0);
14179 }
14180 goto cost_minus;
14181 }
14182
14183 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14184 {
14185 /* FCMP. */
14186 if (speed)
14187 *cost += extra_cost->fp[mode == DFmode].compare;
14188
14189 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14190 {
14191 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14192 /* FCMP supports constant 0.0 for no extra cost. */
14193 return true;
14194 }
14195 return false;
14196 }
14197
14198 if (VECTOR_MODE_P (mode))
14199 {
14200 /* Vector compare. */
14201 if (speed)
14202 *cost += extra_cost->vect.alu;
14203
14204 if (aarch64_float_const_zero_rtx_p (op1))
14205 {
14206 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14207 cost. */
14208 return true;
14209 }
14210 return false;
14211 }
14212 return false;
14213
14214 case MINUS:
14215 {
14216 op0 = XEXP (x, 0);
14217 op1 = XEXP (x, 1);
14218
14219 cost_minus:
14220 if (VECTOR_MODE_P (mode))
14221 {
14222 /* SUBL2 and SUBW2. */
14223 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14224 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14225 {
14226           /* The select-operand-high-half versions of the sub instruction
14227              have the same cost as the regular three-operand vector
14228              version, so don't add the cost of the select into the cost
14229              of the sub.  */
14230 op0 = aarch64_strip_extend_vec_half (op0);
14231 op1 = aarch64_strip_extend_vec_half (op1);
14232 }
14233 }
14234
14235 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
14236
14237 /* Detect valid immediates. */
14238 if ((GET_MODE_CLASS (mode) == MODE_INT
14239 || (GET_MODE_CLASS (mode) == MODE_CC
14240 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14241 && CONST_INT_P (op1)
14242 && aarch64_uimm12_shift (INTVAL (op1)))
14243 {
14244 if (speed)
14245 /* SUB(S) (immediate). */
14246 *cost += extra_cost->alu.arith;
14247 return true;
14248 }
14249
14250 /* Look for SUB (extended register). */
14251 if (is_a <scalar_int_mode> (mode)
14252 && aarch64_rtx_arith_op_extract_p (op1))
14253 {
14254 if (speed)
14255 *cost += extra_cost->alu.extend_arith;
14256
14257 op1 = aarch64_strip_extend (op1, true);
14258 *cost += rtx_cost (op1, VOIDmode,
14259 (enum rtx_code) GET_CODE (op1), 0, speed);
14260 return true;
14261 }
14262
14263 rtx new_op1 = aarch64_strip_extend (op1, false);
14264
14265 /* Cost this as an FMA-alike operation. */
14266 if ((GET_CODE (new_op1) == MULT
14267 || aarch64_shift_p (GET_CODE (new_op1)))
14268 && code != COMPARE)
14269 {
14270 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14271 (enum rtx_code) code,
14272 speed);
14273 return true;
14274 }
14275
14276 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14277
14278 if (speed)
14279 {
14280 if (VECTOR_MODE_P (mode))
14281 {
14282 /* Vector SUB. */
14283 *cost += extra_cost->vect.alu;
14284 }
14285 else if (GET_MODE_CLASS (mode) == MODE_INT)
14286 {
14287 /* SUB(S). */
14288 *cost += extra_cost->alu.arith;
14289 }
14290 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14291 {
14292 /* FSUB. */
14293 *cost += extra_cost->fp[mode == DFmode].addsub;
14294 }
14295 }
14296 return true;
14297 }
14298
14299 case PLUS:
14300 {
14301 rtx new_op0;
14302
14303 op0 = XEXP (x, 0);
14304 op1 = XEXP (x, 1);
14305
14306 cost_plus:
14307 if (VECTOR_MODE_P (mode))
14308 {
14309 /* ADDL2 and ADDW2. */
14310 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14311 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14312 {
14313           /* The select-operand-high-half versions of the add instruction
14314              have the same cost as the regular three-operand vector
14315              version, so don't add the cost of the select into the cost
14316              of the add.  */
14317 op0 = aarch64_strip_extend_vec_half (op0);
14318 op1 = aarch64_strip_extend_vec_half (op1);
14319 }
14320 }
14321
14322 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14323 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14324 {
14325 /* CSINC. */
14326 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14327 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14328 return true;
14329 }
14330
14331 if (GET_MODE_CLASS (mode) == MODE_INT
14332 && (aarch64_plus_immediate (op1, mode)
14333 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14334 {
14335 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14336
14337 if (speed)
14338 {
14339 /* ADD (immediate). */
14340 *cost += extra_cost->alu.arith;
14341
14342 /* Some tunings prefer to not use the VL-based scalar ops.
14343 Increase the cost of the poly immediate to prevent their
14344 formation. */
14345 if (GET_CODE (op1) == CONST_POLY_INT
14346 && (aarch64_tune_params.extra_tuning_flags
14347 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14348 *cost += COSTS_N_INSNS (1);
14349 }
14350 return true;
14351 }
14352
14353 if (aarch64_pluslong_immediate (op1, mode))
14354 {
14355 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14356 if ((INTVAL (op1) & 0xfff) != 0)
14357 *cost += COSTS_N_INSNS (1);
14358
14359 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14360 return true;
14361 }
14362
14363 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14364
14365 /* Look for ADD (extended register). */
14366 if (is_a <scalar_int_mode> (mode)
14367 && aarch64_rtx_arith_op_extract_p (op0))
14368 {
14369 if (speed)
14370 *cost += extra_cost->alu.extend_arith;
14371
14372 op0 = aarch64_strip_extend (op0, true);
14373 *cost += rtx_cost (op0, VOIDmode,
14374 (enum rtx_code) GET_CODE (op0), 0, speed);
14375 return true;
14376 }
14377
14378 /* Strip any extend, leave shifts behind as we will
14379 cost them through mult_cost. */
14380 new_op0 = aarch64_strip_extend (op0, false);
14381
14382 if (GET_CODE (new_op0) == MULT
14383 || aarch64_shift_p (GET_CODE (new_op0)))
14384 {
14385 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14386 speed);
14387 return true;
14388 }
14389
14390 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14391
14392 if (speed)
14393 {
14394 if (VECTOR_MODE_P (mode))
14395 {
14396 /* Vector ADD. */
14397 *cost += extra_cost->vect.alu;
14398 }
14399 else if (GET_MODE_CLASS (mode) == MODE_INT)
14400 {
14401 /* ADD. */
14402 *cost += extra_cost->alu.arith;
14403 }
14404 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14405 {
14406 /* FADD. */
14407 *cost += extra_cost->fp[mode == DFmode].addsub;
14408 }
14409 }
14410 return true;
14411 }
14412
14413 case BSWAP:
14414 *cost = COSTS_N_INSNS (1);
14415
14416 if (speed)
14417 {
14418 if (VECTOR_MODE_P (mode))
14419 *cost += extra_cost->vect.alu;
14420 else
14421 *cost += extra_cost->alu.rev;
14422 }
14423 return false;
14424
14425 case IOR:
14426 if (aarch_rev16_p (x))
14427 {
14428 *cost = COSTS_N_INSNS (1);
14429
14430 if (speed)
14431 {
14432 if (VECTOR_MODE_P (mode))
14433 *cost += extra_cost->vect.alu;
14434 else
14435 *cost += extra_cost->alu.rev;
14436 }
14437 return true;
14438 }
14439
14440 if (aarch64_extr_rtx_p (x, &op0, &op1))
14441 {
14442 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14443 *cost += rtx_cost (op1, mode, IOR, 1, speed);
14444 if (speed)
14445 *cost += extra_cost->alu.shift;
14446
14447 return true;
14448 }
14449 /* Fall through. */
14450 case XOR:
14451 case AND:
14452 cost_logic:
14453 op0 = XEXP (x, 0);
14454 op1 = XEXP (x, 1);
14455
14456 if (VECTOR_MODE_P (mode))
14457 {
14458 if (speed)
14459 *cost += extra_cost->vect.alu;
14460 return true;
14461 }
14462
14463 if (code == AND
14464 && GET_CODE (op0) == MULT
14465 && CONST_INT_P (XEXP (op0, 1))
14466 && CONST_INT_P (op1)
14467 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14468 INTVAL (op1)) != 0)
14469 {
14470 /* This is a UBFM/SBFM. */
14471 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14472 if (speed)
14473 *cost += extra_cost->alu.bfx;
14474 return true;
14475 }
14476
14477 if (is_int_mode (mode, &int_mode))
14478 {
14479 if (CONST_INT_P (op1))
14480 {
14481 /* We have a mask + shift version of a UBFIZ
14482 i.e. the *andim_ashift<mode>_bfiz pattern. */
14483 if (GET_CODE (op0) == ASHIFT
14484 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14485 XEXP (op0, 1)))
14486 {
14487 *cost += rtx_cost (XEXP (op0, 0), int_mode,
14488 (enum rtx_code) code, 0, speed);
14489 if (speed)
14490 *cost += extra_cost->alu.bfx;
14491
14492 return true;
14493 }
14494 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14495 {
14496           /* We possibly get the immediate for free; this is not
14497              modelled.  */
14498 *cost += rtx_cost (op0, int_mode,
14499 (enum rtx_code) code, 0, speed);
14500 if (speed)
14501 *cost += extra_cost->alu.logical;
14502
14503 return true;
14504 }
14505 }
14506 else
14507 {
14508 rtx new_op0 = op0;
14509
14510 /* Handle ORN, EON, or BIC. */
14511 if (GET_CODE (op0) == NOT)
14512 op0 = XEXP (op0, 0);
14513
14514 new_op0 = aarch64_strip_shift (op0);
14515
14516 /* If we had a shift on op0 then this is a logical-shift-
14517 by-register/immediate operation. Otherwise, this is just
14518 a logical operation. */
14519 if (speed)
14520 {
14521 if (new_op0 != op0)
14522 {
14523 /* Shift by immediate. */
14524 if (CONST_INT_P (XEXP (op0, 1)))
14525 *cost += extra_cost->alu.log_shift;
14526 else
14527 *cost += extra_cost->alu.log_shift_reg;
14528 }
14529 else
14530 *cost += extra_cost->alu.logical;
14531 }
14532
14533 /* In both cases we want to cost both operands. */
14534 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14535 0, speed);
14536 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14537 1, speed);
14538
14539 return true;
14540 }
14541 }
14542 return false;
14543
14544 case NOT:
14545 x = XEXP (x, 0);
14546 op0 = aarch64_strip_shift (x);
14547
14548 if (VECTOR_MODE_P (mode))
14549 {
14550 /* Vector NOT. */
14551 *cost += extra_cost->vect.alu;
14552 return false;
14553 }
14554
14555 /* MVN-shifted-reg. */
14556 if (op0 != x)
14557 {
14558 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14559
14560 if (speed)
14561 *cost += extra_cost->alu.log_shift;
14562
14563 return true;
14564 }
14565 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14566 Handle the second form here taking care that 'a' in the above can
14567 be a shift. */
14568 else if (GET_CODE (op0) == XOR)
14569 {
14570 rtx newop0 = XEXP (op0, 0);
14571 rtx newop1 = XEXP (op0, 1);
14572 rtx op0_stripped = aarch64_strip_shift (newop0);
14573
14574 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14575 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14576
14577 if (speed)
14578 {
14579 if (op0_stripped != newop0)
14580 *cost += extra_cost->alu.log_shift;
14581 else
14582 *cost += extra_cost->alu.logical;
14583 }
14584
14585 return true;
14586 }
14587 /* MVN. */
14588 if (speed)
14589 *cost += extra_cost->alu.logical;
14590
14591 return false;
14592
14593 case ZERO_EXTEND:
14594
14595 op0 = XEXP (x, 0);
14596       /* If a value is written in SI mode and then zero extended to DI
14597          mode, the operation will in general be free, since a write to
14598          a 'w' register implicitly zeroes the upper bits of the
14599          corresponding 'x' register.  However, if this is
14600
14601 (set (reg) (zero_extend (reg)))
14602
14603 we must cost the explicit register move. */
14604 if (mode == DImode
14605 && GET_MODE (op0) == SImode)
14606 {
14607 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14608
14609 /* If OP_COST is non-zero, then the cost of the zero extend
14610 is effectively the cost of the inner operation. Otherwise
14611 we have a MOV instruction and we take the cost from the MOV
14612 itself. This is true independently of whether we are
14613 optimizing for space or time. */
14614 if (op_cost)
14615 *cost = op_cost;
14616
14617 return true;
14618 }
14619 else if (MEM_P (op0))
14620 {
14621 /* All loads can zero extend to any size for free. */
14622 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14623 return true;
14624 }
14625
14626 op0 = aarch64_extend_bitfield_pattern_p (x);
14627 if (op0)
14628 {
14629 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14630 if (speed)
14631 *cost += extra_cost->alu.bfx;
14632 return true;
14633 }
14634
14635 if (speed)
14636 {
14637 if (VECTOR_MODE_P (mode))
14638 {
14639 /* UMOV. */
14640 *cost += extra_cost->vect.alu;
14641 }
14642 else
14643 {
14644 /* We generate an AND instead of UXTB/UXTH. */
14645 *cost += extra_cost->alu.logical;
14646 }
14647 }
14648 return false;
14649
14650 case SIGN_EXTEND:
14651 if (MEM_P (XEXP (x, 0)))
14652 {
14653 /* LDRSH. */
14654 if (speed)
14655 {
14656 rtx address = XEXP (XEXP (x, 0), 0);
14657 *cost += extra_cost->ldst.load_sign_extend;
14658
14659 *cost +=
14660 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14661 0, speed));
14662 }
14663 return true;
14664 }
14665
14666 op0 = aarch64_extend_bitfield_pattern_p (x);
14667 if (op0)
14668 {
14669 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14670 if (speed)
14671 *cost += extra_cost->alu.bfx;
14672 return true;
14673 }
14674
14675 if (speed)
14676 {
14677 if (VECTOR_MODE_P (mode))
14678 *cost += extra_cost->vect.alu;
14679 else
14680 *cost += extra_cost->alu.extend;
14681 }
14682 return false;
14683
14684 case ASHIFT:
14685 op0 = XEXP (x, 0);
14686 op1 = XEXP (x, 1);
14687
14688 if (CONST_INT_P (op1))
14689 {
14690 if (speed)
14691 {
14692 if (VECTOR_MODE_P (mode))
14693 {
14694 /* Vector shift (immediate). */
14695 *cost += extra_cost->vect.alu;
14696 }
14697 else
14698 {
14699               /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
14700                  aliases.  */
14701 *cost += extra_cost->alu.shift;
14702 }
14703 }
14704
14705 /* We can incorporate zero/sign extend for free. */
14706 if (GET_CODE (op0) == ZERO_EXTEND
14707 || GET_CODE (op0) == SIGN_EXTEND)
14708 op0 = XEXP (op0, 0);
14709
14710 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
14711 return true;
14712 }
14713 else
14714 {
14715 if (VECTOR_MODE_P (mode))
14716 {
14717 if (speed)
14718 /* Vector shift (register). */
14719 *cost += extra_cost->vect.alu;
14720 }
14721 else
14722 {
14723 if (speed)
14724 /* LSLV. */
14725 *cost += extra_cost->alu.shift_reg;
14726
14727 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14728 && CONST_INT_P (XEXP (op1, 1))
14729 && known_eq (INTVAL (XEXP (op1, 1)),
14730 GET_MODE_BITSIZE (mode) - 1))
14731 {
14732 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14733 /* We already demanded XEXP (op1, 0) to be REG_P, so
14734 don't recurse into it. */
14735 return true;
14736 }
14737 }
14738 return false; /* All arguments need to be in registers. */
14739 }
14740
14741 case ROTATE:
14742 case ROTATERT:
14743 case LSHIFTRT:
14744 case ASHIFTRT:
14745 op0 = XEXP (x, 0);
14746 op1 = XEXP (x, 1);
14747
14748 if (CONST_INT_P (op1))
14749 {
14750 /* ASR (immediate) and friends. */
14751 if (speed)
14752 {
14753 if (VECTOR_MODE_P (mode))
14754 *cost += extra_cost->vect.alu;
14755 else
14756 *cost += extra_cost->alu.shift;
14757 }
14758
14759 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14760 return true;
14761 }
14762 else
14763 {
14764 if (VECTOR_MODE_P (mode))
14765 {
14766 if (speed)
14767 /* Vector shift (register). */
14768 *cost += extra_cost->vect.alu;
14769 }
14770 else
14771 {
14772 if (speed)
14773 /* ASR (register) and friends. */
14774 *cost += extra_cost->alu.shift_reg;
14775
14776 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14777 && CONST_INT_P (XEXP (op1, 1))
14778 && known_eq (INTVAL (XEXP (op1, 1)),
14779 GET_MODE_BITSIZE (mode) - 1))
14780 {
14781 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14782 /* We already demanded XEXP (op1, 0) to be REG_P, so
14783 don't recurse into it. */
14784 return true;
14785 }
14786 }
14787 return false; /* All arguments need to be in registers. */
14788 }
14789
14790 case SYMBOL_REF:
14791
14792 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
14793 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
14794 {
14795 /* LDR. */
14796 if (speed)
14797 *cost += extra_cost->ldst.load;
14798 }
14799 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
14800 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
14801 {
14802 /* ADRP, followed by ADD. */
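        /* E.g. for the small code model a symbol address is typically
           materialised as (illustrative):
             adrp x0, sym
             add  x0, x0, :lo12:sym
           which is why a second instruction is added to the baseline here.  */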
14803 *cost += COSTS_N_INSNS (1);
14804 if (speed)
14805 *cost += 2 * extra_cost->alu.arith;
14806 }
14807 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
14808 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14809 {
14810 /* ADR. */
14811 if (speed)
14812 *cost += extra_cost->alu.arith;
14813 }
14814
14815 if (flag_pic)
14816 {
14817 /* One extra load instruction, after accessing the GOT. */
14818 *cost += COSTS_N_INSNS (1);
14819 if (speed)
14820 *cost += extra_cost->ldst.load;
14821 }
14822 return true;
14823
14824 case HIGH:
14825 case LO_SUM:
14826 /* ADRP/ADD (immediate). */
14827 if (speed)
14828 *cost += extra_cost->alu.arith;
14829 return true;
14830
14831 case ZERO_EXTRACT:
14832 case SIGN_EXTRACT:
14833 /* UBFX/SBFX. */
14834 if (speed)
14835 {
14836 if (VECTOR_MODE_P (mode))
14837 *cost += extra_cost->vect.alu;
14838 else
14839 *cost += extra_cost->alu.bfx;
14840 }
14841
14842 /* We can trust that the immediates used will be correct (there
14843 are no by-register forms), so we need only cost op0. */
14844 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
14845 return true;
14846
14847 case MULT:
14848 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
14849 /* aarch64_rtx_mult_cost always handles recursion to its
14850 operands. */
14851 return true;
14852
14853 case MOD:
14854     /* We can expand signed mod by a power of 2 using a NEGS, two parallel
14855        ANDs and a CSNEG.  Assume here that the cost of a CSNEG is the same
14856        as that of an unconditional negate.  This case should only ever be
14857        reached through the set_smod_pow2_cheap check in expmed.cc.  */
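    /* As a rough illustration (register allocation is of course up to the
       compiler), a signed x % 4 typically expands to something like:
         negs  w1, w0
         and   w0, w0, 3
         and   w1, w1, 3
         csneg w0, w0, w1, mi
       hence the four-instruction baseline below.  */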
14858 if (CONST_INT_P (XEXP (x, 1))
14859 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
14860 && (mode == SImode || mode == DImode))
14861 {
14862 /* We expand to 4 instructions. Reset the baseline. */
14863 *cost = COSTS_N_INSNS (4);
14864
14865 if (speed)
14866 *cost += 2 * extra_cost->alu.logical
14867 + 2 * extra_cost->alu.arith;
14868
14869 return true;
14870 }
14871
14872 /* Fall-through. */
14873 case UMOD:
14874 if (speed)
14875 {
14876       /* Slightly prefer UMOD over SMOD.  */
14877 if (VECTOR_MODE_P (mode))
14878 *cost += extra_cost->vect.alu;
14879 else if (GET_MODE_CLASS (mode) == MODE_INT)
14880 *cost += (extra_cost->mult[mode == DImode].add
14881 + extra_cost->mult[mode == DImode].idiv
14882 + (code == MOD ? 1 : 0));
14883 }
14884 return false; /* All arguments need to be in registers. */
14885
14886 case DIV:
14887 case UDIV:
14888 case SQRT:
14889 if (speed)
14890 {
14891 if (VECTOR_MODE_P (mode))
14892 *cost += extra_cost->vect.alu;
14893 else if (GET_MODE_CLASS (mode) == MODE_INT)
14894 /* There is no integer SQRT, so only DIV and UDIV can get
14895 here. */
14896 *cost += (extra_cost->mult[mode == DImode].idiv
14897                   /* Slightly prefer UDIV over SDIV.  */
14898 + (code == DIV ? 1 : 0));
14899 else
14900 *cost += extra_cost->fp[mode == DFmode].div;
14901 }
14902 return false; /* All arguments need to be in registers. */
14903
14904 case IF_THEN_ELSE:
14905 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
14906 XEXP (x, 2), cost, speed);
14907
14908 case EQ:
14909 case NE:
14910 case GT:
14911 case GTU:
14912 case LT:
14913 case LTU:
14914 case GE:
14915 case GEU:
14916 case LE:
14917 case LEU:
14918
14919 return false; /* All arguments must be in registers. */
14920
14921 case FMA:
14922 op0 = XEXP (x, 0);
14923 op1 = XEXP (x, 1);
14924 op2 = XEXP (x, 2);
14925
14926 if (speed)
14927 {
14928 if (VECTOR_MODE_P (mode))
14929 *cost += extra_cost->vect.alu;
14930 else
14931 *cost += extra_cost->fp[mode == DFmode].fma;
14932 }
14933
14934 /* FMSUB, FNMADD, and FNMSUB are free. */
14935 if (GET_CODE (op0) == NEG)
14936 op0 = XEXP (op0, 0);
14937
14938 if (GET_CODE (op2) == NEG)
14939 op2 = XEXP (op2, 0);
14940
14941 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
14942 and the by-element operand as operand 0. */
14943 if (GET_CODE (op1) == NEG)
14944 op1 = XEXP (op1, 0);
14945
14946 /* Catch vector-by-element operations. The by-element operand can
14947 either be (vec_duplicate (vec_select (x))) or just
14948 (vec_select (x)), depending on whether we are multiplying by
14949 a vector or a scalar.
14950
14951        Canonicalization is not very good in these cases: FMA4 will put the
14952        by-element operand as operand 0, while FNMA4 will have it as operand 1.  */
14953 if (GET_CODE (op0) == VEC_DUPLICATE)
14954 op0 = XEXP (op0, 0);
14955 else if (GET_CODE (op1) == VEC_DUPLICATE)
14956 op1 = XEXP (op1, 0);
14957
14958 if (GET_CODE (op0) == VEC_SELECT)
14959 op0 = XEXP (op0, 0);
14960 else if (GET_CODE (op1) == VEC_SELECT)
14961 op1 = XEXP (op1, 0);
14962
14963 /* If the remaining parameters are not registers,
14964 get the cost to put them into registers. */
14965 *cost += rtx_cost (op0, mode, FMA, 0, speed);
14966 *cost += rtx_cost (op1, mode, FMA, 1, speed);
14967 *cost += rtx_cost (op2, mode, FMA, 2, speed);
14968 return true;
14969
14970 case FLOAT:
14971 case UNSIGNED_FLOAT:
14972 if (speed)
14973 *cost += extra_cost->fp[mode == DFmode].fromint;
14974 return false;
14975
14976 case FLOAT_EXTEND:
14977 if (speed)
14978 {
14979 if (VECTOR_MODE_P (mode))
14980 {
14981             /* Vector FP widening conversion (e.g. FCVTL).  */
14982 *cost += extra_cost->vect.alu;
14983 }
14984 else
14985 *cost += extra_cost->fp[mode == DFmode].widen;
14986 }
14987 return false;
14988
14989 case FLOAT_TRUNCATE:
14990 if (speed)
14991 {
14992 if (VECTOR_MODE_P (mode))
14993 {
14994             /* Vector FP narrowing conversion (e.g. FCVTN).  */
14995 *cost += extra_cost->vect.alu;
14996 }
14997 else
14998 *cost += extra_cost->fp[mode == DFmode].narrow;
14999 }
15000 return false;
15001
15002 case FIX:
15003 case UNSIGNED_FIX:
15004 x = XEXP (x, 0);
15005 /* Strip the rounding part. They will all be implemented
15006 by the fcvt* family of instructions anyway. */
15007 if (GET_CODE (x) == UNSPEC)
15008 {
15009 unsigned int uns_code = XINT (x, 1);
15010
15011 if (uns_code == UNSPEC_FRINTA
15012 || uns_code == UNSPEC_FRINTM
15013 || uns_code == UNSPEC_FRINTN
15014 || uns_code == UNSPEC_FRINTP
15015 || uns_code == UNSPEC_FRINTZ)
15016 x = XVECEXP (x, 0, 0);
15017 }
15018
15019 if (speed)
15020 {
15021 if (VECTOR_MODE_P (mode))
15022 *cost += extra_cost->vect.alu;
15023 else
15024 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
15025 }
15026
15027 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15028 fixed-point fcvt. */
15029 if (GET_CODE (x) == MULT
15030 && ((VECTOR_MODE_P (mode)
15031 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
15032 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
15033 {
15034 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
15035 0, speed);
15036 return true;
15037 }
15038
15039 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
15040 return true;
15041
15042 case ABS:
15043 if (VECTOR_MODE_P (mode))
15044 {
15045 /* ABS (vector). */
15046 if (speed)
15047 *cost += extra_cost->vect.alu;
15048 }
15049 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
15050 {
15051 op0 = XEXP (x, 0);
15052
15053 /* FABD, which is analogous to FADD. */
15054 if (GET_CODE (op0) == MINUS)
15055 {
15056 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
15057 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
15058 if (speed)
15059 *cost += extra_cost->fp[mode == DFmode].addsub;
15060
15061 return true;
15062 }
15063 /* Simple FABS is analogous to FNEG. */
15064 if (speed)
15065 *cost += extra_cost->fp[mode == DFmode].neg;
15066 }
15067 else
15068 {
15069         /* Integer ABS will either be split into
15070            two arithmetic instructions, or will be an ABS
15071            (scalar), which we don't model.  */
15072 *cost = COSTS_N_INSNS (2);
15073 if (speed)
15074 *cost += 2 * extra_cost->alu.arith;
15075 }
15076 return false;
15077
15078 case SMAX:
15079 case SMIN:
15080 if (speed)
15081 {
15082 if (VECTOR_MODE_P (mode))
15083 *cost += extra_cost->vect.alu;
15084 else
15085 {
15086 /* FMAXNM/FMINNM/FMAX/FMIN.
15087 TODO: This may not be accurate for all implementations, but
15088 we do not model this in the cost tables. */
15089 *cost += extra_cost->fp[mode == DFmode].addsub;
15090 }
15091 }
15092 return false;
15093
15094 case UNSPEC:
15095 /* The floating point round to integer frint* instructions. */
15096 if (aarch64_frint_unspec_p (XINT (x, 1)))
15097 {
15098 if (speed)
15099 *cost += extra_cost->fp[mode == DFmode].roundint;
15100
15101 return false;
15102 }
15103
15104 if (XINT (x, 1) == UNSPEC_RBIT)
15105 {
15106 if (speed)
15107 *cost += extra_cost->alu.rev;
15108
15109 return false;
15110 }
15111 break;
15112
15113 case TRUNCATE:
15114
15115 /* Decompose <su>muldi3_highpart. */
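      /* The shape being matched is, schematically:
           (truncate:DI
             (lshiftrt:TI
               (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
               (const_int 64)))
         i.e. the high 64 bits of a widening multiply, which maps to a
         single UMULH/SMULH.  */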
15116 if (/* (truncate:DI */
15117 mode == DImode
15118 /* (lshiftrt:TI */
15119 && GET_MODE (XEXP (x, 0)) == TImode
15120 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
15121 /* (mult:TI */
15122 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
15123 /* (ANY_EXTEND:TI (reg:DI))
15124 (ANY_EXTEND:TI (reg:DI))) */
15125 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15126 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15127 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15128 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15129 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15130 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15131 /* (const_int 64) */
15132 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15133 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15134 {
15135 /* UMULH/SMULH. */
15136 if (speed)
15137 *cost += extra_cost->mult[mode == DImode].extend;
15138 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15139 mode, MULT, 0, speed);
15140 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15141 mode, MULT, 1, speed);
15142 return true;
15143 }
15144 break;
15145 case CONST_VECTOR:
15146 {
15147 /* Load using MOVI/MVNI. */
15148 if (aarch64_simd_valid_immediate (x, NULL))
15149 *cost = extra_cost->vect.movi;
15150 else /* Load using constant pool. */
15151 *cost = extra_cost->ldst.load;
15152 break;
15153 }
15154 case VEC_CONCAT:
15155       /* Depending on the operation, either DUP or INS.
15156          For now, keep default costing.  */
15157 break;
15158 case VEC_DUPLICATE:
15159 /* Load using a DUP. */
15160 *cost = extra_cost->vect.dup;
15161 return false;
15162 case VEC_SELECT:
15163 {
15164 rtx op0 = XEXP (x, 0);
15165 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15166
15167         /* Cost a lowpart select as free, a highpart as a DUP, else as an extract.  */
15168 rtx op1 = XEXP (x, 1);
15169 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15170 ;
15171 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15172 *cost = extra_cost->vect.dup;
15173 else
15174 *cost = extra_cost->vect.extract;
15175 return true;
15176 }
15177 default:
15178 break;
15179 }
15180
15181 if (dump_file
15182 && flag_aarch64_verbose_cost)
15183 fprintf (dump_file,
15184 "\nFailed to cost RTX. Assuming default cost.\n");
15185
15186 return true;
15187 }
15188
15189 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15190 calculated for X. This cost is stored in *COST. Returns true
15191 if the total cost of X was calculated. */
15192 static bool
15193 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15194 int param, int *cost, bool speed)
15195 {
15196 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15197
15198 if (dump_file
15199 && flag_aarch64_verbose_cost)
15200 {
15201 print_rtl_single (dump_file, x);
15202 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15203 speed ? "Hot" : "Cold",
15204 *cost, result ? "final" : "partial");
15205 }
15206
15207 return result;
15208 }
15209
15210 static int
15211 aarch64_register_move_cost (machine_mode mode,
15212 reg_class_t from_i, reg_class_t to_i)
15213 {
15214 enum reg_class from = (enum reg_class) from_i;
15215 enum reg_class to = (enum reg_class) to_i;
15216 const struct cpu_regmove_cost *regmove_cost
15217 = aarch64_tune_params.regmove_cost;
15218
15219 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
15220 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
15221 || to == STUB_REGS)
15222 to = GENERAL_REGS;
15223
15224 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
15225 || from == STUB_REGS)
15226 from = GENERAL_REGS;
15227
15228 /* Make RDFFR very expensive. In particular, if we know that the FFR
15229 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15230 as a way of obtaining a PTRUE. */
15231 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15232 && hard_reg_set_subset_p (reg_class_contents[from_i],
15233 reg_class_contents[FFR_REGS]))
15234 return 80;
15235
15236 /* Moving between GPR and stack cost is the same as GP2GP. */
15237 if ((from == GENERAL_REGS && to == STACK_REG)
15238 || (to == GENERAL_REGS && from == STACK_REG))
15239 return regmove_cost->GP2GP;
15240
15241 /* To/From the stack register, we move via the gprs. */
15242 if (to == STACK_REG || from == STACK_REG)
15243 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15244 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15245
15246 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15247 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15248 && known_eq (GET_MODE_SIZE (mode), 16))
15249 {
15250 /* 128-bit operations on general registers require 2 instructions. */
15251 if (from == GENERAL_REGS && to == GENERAL_REGS)
15252 return regmove_cost->GP2GP * 2;
15253 else if (from == GENERAL_REGS)
15254 return regmove_cost->GP2FP * 2;
15255 else if (to == GENERAL_REGS)
15256 return regmove_cost->FP2GP * 2;
15257
15258 /* When AdvSIMD instructions are disabled it is not possible to move
15259 a 128-bit value directly between Q registers. This is handled in
15260 secondary reload. A general register is used as a scratch to move
15261 the upper DI value and the lower DI value is moved directly,
15262 hence the cost is the sum of three moves. */
15263 if (! TARGET_SIMD)
15264 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15265
15266 return regmove_cost->FP2FP;
15267 }
15268
15269 if (from == GENERAL_REGS && to == GENERAL_REGS)
15270 return regmove_cost->GP2GP;
15271 else if (from == GENERAL_REGS)
15272 return regmove_cost->GP2FP;
15273 else if (to == GENERAL_REGS)
15274 return regmove_cost->FP2GP;
15275
15276 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15277 {
15278 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15279 The cost must be greater than 2 units to indicate that direct
15280 moves aren't possible. */
15281 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15282 + aarch64_tune_params.memmov_cost.store_fp);
15283 return MIN (CEIL (per_vector, 2), 4);
15284 }
15285
15286 return regmove_cost->FP2FP;
15287 }
15288
15289 /* Implements TARGET_MEMORY_MOVE_COST. */
15290 static int
15291 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15292 {
15293 enum reg_class rclass = (enum reg_class) rclass_i;
15294 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15295 ? reg_classes_intersect_p (rclass, PR_REGS)
15296 : reg_class_subset_p (rclass, PR_REGS))
15297 return (in
15298 ? aarch64_tune_params.memmov_cost.load_pred
15299 : aarch64_tune_params.memmov_cost.store_pred);
15300
15301 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15302 ? reg_classes_intersect_p (rclass, FP_REGS)
15303 : reg_class_subset_p (rclass, FP_REGS))
15304 return (in
15305 ? aarch64_tune_params.memmov_cost.load_fp
15306 : aarch64_tune_params.memmov_cost.store_fp);
15307
15308 return (in
15309 ? aarch64_tune_params.memmov_cost.load_int
15310 : aarch64_tune_params.memmov_cost.store_int);
15311 }
15312
15313 /* Implement TARGET_INIT_BUILTINS. */
15314 static void
15315 aarch64_init_builtins ()
15316 {
15317 aarch64_general_init_builtins ();
15318 aarch64_sve::init_builtins ();
15319 #ifdef SUBTARGET_INIT_BUILTINS
15320 SUBTARGET_INIT_BUILTINS;
15321 #endif
15322 }
15323
15324 /* Implement TARGET_FOLD_BUILTIN. */
15325 static tree
15326 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15327 {
15328 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15329 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15330 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15331 switch (code & AARCH64_BUILTIN_CLASS)
15332 {
15333 case AARCH64_BUILTIN_GENERAL:
15334 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15335
15336 case AARCH64_BUILTIN_SVE:
15337 return NULL_TREE;
15338 }
15339 gcc_unreachable ();
15340 }
15341
15342 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15343 static bool
15344 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15345 {
15346 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15347 tree fndecl = gimple_call_fndecl (stmt);
15348 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15349 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15350 gimple *new_stmt = NULL;
15351 switch (code & AARCH64_BUILTIN_CLASS)
15352 {
15353 case AARCH64_BUILTIN_GENERAL:
15354 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15355 break;
15356
15357 case AARCH64_BUILTIN_SVE:
15358 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15359 break;
15360 }
15361
15362 if (!new_stmt)
15363 return false;
15364
15365 gsi_replace (gsi, new_stmt, false);
15366 return true;
15367 }
15368
15369 /* Implement TARGET_EXPAND_BUILTIN. */
15370 static rtx
15371 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15372 {
15373 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15374 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15375 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15376 switch (code & AARCH64_BUILTIN_CLASS)
15377 {
15378 case AARCH64_BUILTIN_GENERAL:
15379 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15380
15381 case AARCH64_BUILTIN_SVE:
15382 return aarch64_sve::expand_builtin (subcode, exp, target);
15383 }
15384 gcc_unreachable ();
15385 }
15386
15387 /* Implement TARGET_BUILTIN_DECL. */
15388 static tree
15389 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15390 {
15391 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15392 switch (code & AARCH64_BUILTIN_CLASS)
15393 {
15394 case AARCH64_BUILTIN_GENERAL:
15395 return aarch64_general_builtin_decl (subcode, initialize_p);
15396
15397 case AARCH64_BUILTIN_SVE:
15398 return aarch64_sve::builtin_decl (subcode, initialize_p);
15399 }
15400 gcc_unreachable ();
15401 }
15402
15403 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15404 to optimize 1.0/sqrt. */
15405
15406 static bool
15407 use_rsqrt_p (machine_mode mode)
15408 {
15409 return (!flag_trapping_math
15410 && flag_unsafe_math_optimizations
15411 && ((aarch64_tune_params.approx_modes->recip_sqrt
15412 & AARCH64_APPROX_MODE (mode))
15413 || flag_mrecip_low_precision_sqrt));
15414 }
15415
15416 /* Function to decide when to use the approximate reciprocal square root
15417 builtin. */
15418
15419 static tree
15420 aarch64_builtin_reciprocal (tree fndecl)
15421 {
15422 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15423
15424 if (!use_rsqrt_p (mode))
15425 return NULL_TREE;
15426 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15427 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15428 switch (code & AARCH64_BUILTIN_CLASS)
15429 {
15430 case AARCH64_BUILTIN_GENERAL:
15431 return aarch64_general_builtin_rsqrt (subcode);
15432
15433 case AARCH64_BUILTIN_SVE:
15434 return NULL_TREE;
15435 }
15436 gcc_unreachable ();
15437 }
15438
15439 /* Emit code to perform the floating-point operation:
15440
15441 DST = SRC1 * SRC2
15442
15443 where all three operands are already known to be registers.
15444 If the operation is an SVE one, PTRUE is a suitable all-true
15445 predicate. */
15446
15447 static void
15448 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15449 {
15450 if (ptrue)
15451 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15452 dst, ptrue, src1, src2,
15453 gen_int_mode (SVE_RELAXED_GP, SImode)));
15454 else
15455 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15456 }
15457
15458 /* Emit instruction sequence to compute either the approximate square root
15459 or its approximate reciprocal, depending on the flag RECP, and return
15460 whether the sequence was emitted or not. */
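/* Sketch of the maths (informal): starting from the FRSQRTE estimate
   x0 ~= 1/sqrt(a), each FRSQRTS step refines it as
       x_{n+1} = x_n * (3 - a * x_n * x_n) / 2,
   which is the Newton-Raphson iteration for 1/sqrt(a); when !RECP the
   square root itself is then recovered as a * (1/sqrt(a)).  */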
15461
15462 bool
15463 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15464 {
15465 machine_mode mode = GET_MODE (dst);
15466
15467 if (GET_MODE_INNER (mode) == HFmode)
15468 {
15469 gcc_assert (!recp);
15470 return false;
15471 }
15472
15473 if (!recp)
15474 {
15475 if (!(flag_mlow_precision_sqrt
15476 || (aarch64_tune_params.approx_modes->sqrt
15477 & AARCH64_APPROX_MODE (mode))))
15478 return false;
15479
15480 if (!flag_finite_math_only
15481 || flag_trapping_math
15482 || !flag_unsafe_math_optimizations
15483 || optimize_function_for_size_p (cfun))
15484 return false;
15485 }
15486 else
15487 /* Caller assumes we cannot fail. */
15488 gcc_assert (use_rsqrt_p (mode));
15489
15490 rtx pg = NULL_RTX;
15491 if (aarch64_sve_mode_p (mode))
15492 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15493 machine_mode mmsk = (VECTOR_MODE_P (mode)
15494 ? related_int_vector_mode (mode).require ()
15495 : int_mode_for_mode (mode).require ());
15496 rtx xmsk = NULL_RTX;
15497 if (!recp)
15498 {
15499 /* When calculating the approximate square root, compare the
15500 argument with 0.0 and create a mask. */
15501 rtx zero = CONST0_RTX (mode);
15502 if (pg)
15503 {
15504 xmsk = gen_reg_rtx (GET_MODE (pg));
15505 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15506 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15507 xmsk, pg, hint, src, zero));
15508 }
15509 else
15510 {
15511 xmsk = gen_reg_rtx (mmsk);
15512 emit_insn (gen_rtx_SET (xmsk,
15513 gen_rtx_NEG (mmsk,
15514 gen_rtx_EQ (mmsk, src, zero))));
15515 }
15516 }
15517
15518 /* Estimate the approximate reciprocal square root. */
15519 rtx xdst = gen_reg_rtx (mode);
15520 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15521
15522 /* Iterate over the series twice for SF and thrice for DF. */
15523 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15524
15525   /* Optionally iterate over the series one time fewer for faster
15526      performance, at the cost of some accuracy.  */
15527 if ((recp && flag_mrecip_low_precision_sqrt)
15528 || (!recp && flag_mlow_precision_sqrt))
15529 iterations--;
15530
15531 /* Iterate over the series to calculate the approximate reciprocal square
15532 root. */
15533 rtx x1 = gen_reg_rtx (mode);
15534 while (iterations--)
15535 {
15536 rtx x2 = gen_reg_rtx (mode);
15537 aarch64_emit_mult (x2, pg, xdst, xdst);
15538
15539 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15540
15541 if (iterations > 0)
15542 aarch64_emit_mult (xdst, pg, xdst, x1);
15543 }
15544
15545 if (!recp)
15546 {
15547 if (pg)
15548 /* Multiply nonzero source values by the corresponding intermediate
15549 result elements, so that the final calculation is the approximate
15550 square root rather than its reciprocal. Select a zero result for
15551 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15552 otherwise. */
15553 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15554 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15555 else
15556 {
15557           /* Qualify the approximate reciprocal square root when the
15558              argument is 0.0 by squashing the intermediate result to 0.0.  */
15559 rtx xtmp = gen_reg_rtx (mmsk);
15560 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15561 gen_rtx_SUBREG (mmsk, xdst, 0)));
15562 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15563
15564 /* Calculate the approximate square root. */
15565 aarch64_emit_mult (xdst, pg, xdst, src);
15566 }
15567 }
15568
15569 /* Finalize the approximation. */
15570 aarch64_emit_mult (dst, pg, xdst, x1);
15571
15572 return true;
15573 }
15574
15575 /* Emit the instruction sequence to compute the approximation for the division
15576 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
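/* Sketch of the maths (informal): starting from the FRECPE estimate
   x0 ~= 1/den, each FRECPS step refines it as
       x_{n+1} = x_n * (2 - den * x_n),
   the Newton-Raphson iteration for the reciprocal; the quotient is then
   obtained as num * (1/den).  */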
15577
15578 bool
15579 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15580 {
15581 machine_mode mode = GET_MODE (quo);
15582
15583 if (GET_MODE_INNER (mode) == HFmode)
15584 return false;
15585
15586 bool use_approx_division_p = (flag_mlow_precision_div
15587 || (aarch64_tune_params.approx_modes->division
15588 & AARCH64_APPROX_MODE (mode)));
15589
15590 if (!flag_finite_math_only
15591 || flag_trapping_math
15592 || !flag_unsafe_math_optimizations
15593 || optimize_function_for_size_p (cfun)
15594 || !use_approx_division_p)
15595 return false;
15596
15597 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15598 return false;
15599
15600 rtx pg = NULL_RTX;
15601 if (aarch64_sve_mode_p (mode))
15602 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15603
15604 /* Estimate the approximate reciprocal. */
15605 rtx xrcp = gen_reg_rtx (mode);
15606 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15607
15608 /* Iterate over the series twice for SF and thrice for DF. */
15609 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15610
15611   /* Optionally iterate over the series fewer times for faster performance,
15612      at the cost of some accuracy.  The default is 2 for DF and 1 for SF.  */
15613 if (flag_mlow_precision_div)
15614 iterations = (GET_MODE_INNER (mode) == DFmode
15615 ? aarch64_double_recp_precision
15616 : aarch64_float_recp_precision);
15617
15618 /* Iterate over the series to calculate the approximate reciprocal. */
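/* Each pass applies one Newton-Raphson step for 1/den:

     x' = x * (2 - den * x)

   FRECPS computes the (2 - a * b) factor, so XTMP holds the step factor
   and the multiplication by XTMP refines the estimate. */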
15619 rtx xtmp = gen_reg_rtx (mode);
15620 while (iterations--)
15621 {
15622 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15623
15624 if (iterations > 0)
15625 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15626 }
15627
15628 if (num != CONST1_RTX (mode))
15629 {
15630 /* As the approximate reciprocal of DEN is already calculated, only
15631 calculate the approximate division when NUM is not 1.0. */
15632 rtx xnum = force_reg (mode, num);
15633 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15634 }
15635
15636 /* Finalize the approximation. */
15637 aarch64_emit_mult (quo, pg, xrcp, xtmp);
15638 return true;
15639 }
15640
15641 /* Return the number of instructions that can be issued per cycle. */
15642 static int
15643 aarch64_sched_issue_rate (void)
15644 {
15645 return aarch64_tune_params.issue_rate;
15646 }
15647
15648 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15649 static int
15650 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15651 {
15652 if (DEBUG_INSN_P (insn))
15653 return more;
15654
15655 rtx_code code = GET_CODE (PATTERN (insn));
15656 if (code == USE || code == CLOBBER)
15657 return more;
15658
15659 if (get_attr_type (insn) == TYPE_NO_INSN)
15660 return more;
15661
15662 return more - 1;
15663 }
15664
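/* Return the DFA lookahead depth to use during multipass scheduling:
   the issue rate when it is greater than one and we are not scheduling
   for fusion, otherwise zero (no lookahead). */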
15665 static int
15666 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15667 {
15668 int issue_rate = aarch64_sched_issue_rate ();
15669
15670 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15671 }
15672
15673
15674 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15675 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
15676 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15677
15678 static int
15679 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15680 int ready_index)
15681 {
15682 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15683 }
15684
15685
15686 /* Vectorizer cost model target hooks. */
15687
15688 /* Information about how the CPU would issue the scalar, Advanced SIMD
15689 or SVE version of a vector loop, using the scheme defined by the
15690 aarch64_base_vec_issue_info hierarchy of structures. */
15691 class aarch64_vec_op_count
15692 {
15693 public:
15694 aarch64_vec_op_count () = default;
15695 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
15696 unsigned int = 1);
15697
15698 unsigned int vec_flags () const { return m_vec_flags; }
15699 unsigned int vf_factor () const { return m_vf_factor; }
15700
15701 const aarch64_base_vec_issue_info *base_issue_info () const;
15702 const aarch64_simd_vec_issue_info *simd_issue_info () const;
15703 const aarch64_sve_vec_issue_info *sve_issue_info () const;
15704
15705 fractional_cost rename_cycles_per_iter () const;
15706 fractional_cost min_nonpred_cycles_per_iter () const;
15707 fractional_cost min_pred_cycles_per_iter () const;
15708 fractional_cost min_cycles_per_iter () const;
15709
15710 void dump () const;
15711
15712 /* The number of individual "general" operations. See the comments
15713 in aarch64_base_vec_issue_info for details. */
15714 unsigned int general_ops = 0;
15715
15716 /* The number of load and store operations, under the same scheme
15717 as above. */
15718 unsigned int loads = 0;
15719 unsigned int stores = 0;
15720
15721 /* The minimum number of cycles needed to execute all loop-carried
15722 operations, which in the vector code become associated with
15723 reductions. */
15724 unsigned int reduction_latency = 0;
15725
15726 /* The number of individual predicate operations. See the comments
15727 in aarch64_sve_vec_issue_info for details. */
15728 unsigned int pred_ops = 0;
15729
15730 private:
15731 /* The issue information for the core. */
15732 const aarch64_vec_issue_info *m_issue_info = nullptr;
15733
15734 /* - If M_VEC_FLAGS is zero then this structure describes scalar code.
15735 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
15736 Advanced SIMD code.
15737 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
15738 SVE code. */
15739 unsigned int m_vec_flags = 0;
15740
15741 /* Assume that, when the code is executing on the core described
15742 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
15743 times more data than the vectorizer anticipates.
15744
15745 This is only ever different from 1 for SVE. It allows us to consider
15746 what would happen on a 256-bit SVE target even when the -mtune
15747 parameters say that the “likely” SVE length is 128 bits. */
15748 unsigned int m_vf_factor = 1;
15749 };
15750
15751 aarch64_vec_op_count::
15752 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
15753 unsigned int vec_flags, unsigned int vf_factor)
15754 : m_issue_info (issue_info),
15755 m_vec_flags (vec_flags),
15756 m_vf_factor (vf_factor)
15757 {
15758 }
15759
15760 /* Return the base issue information (i.e. the parts that make sense
15761 for both scalar and vector code). Return null if we have no issue
15762 information. */
15763 const aarch64_base_vec_issue_info *
15764 aarch64_vec_op_count::base_issue_info () const
15765 {
15766 if (auto *ret = simd_issue_info ())
15767 return ret;
15768 return m_issue_info->scalar;
15769 }
15770
15771 /* If the structure describes vector code and we have associated issue
15772 information, return that issue information, otherwise return null. */
15773 const aarch64_simd_vec_issue_info *
15774 aarch64_vec_op_count::simd_issue_info () const
15775 {
15776 if (auto *ret = sve_issue_info ())
15777 return ret;
15778 if (m_vec_flags)
15779 return m_issue_info->advsimd;
15780 return nullptr;
15781 }
15782
15783 /* If the structure describes SVE code and we have associated issue
15784 information, return that issue information, otherwise return null. */
15785 const aarch64_sve_vec_issue_info *
15786 aarch64_vec_op_count::sve_issue_info () const
15787 {
15788 if (m_vec_flags & VEC_ANY_SVE)
15789 return m_issue_info->sve;
15790 return nullptr;
15791 }
15792
15793 /* Estimate the minimum number of cycles per iteration needed to rename
15794 the instructions.
15795
15796 ??? For now this is done inline rather than via cost tables, since it
15797 isn't clear how it should be parameterized for the general case. */
15798 fractional_cost
15799 aarch64_vec_op_count::rename_cycles_per_iter () const
15800 {
15801 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
15802 || sve_issue_info () == &neoversen2_sve_issue_info
15803 || sve_issue_info () == &neoversev2_sve_issue_info)
15804 /* + 1 for an addition. We've already counted a general op for each
15805 store, so we don't need to account for stores separately. The branch
15806 reads no registers and so does not need to be counted either.
15807
15808 ??? This value is very much on the pessimistic side, but seems to work
15809 pretty well in practice. */
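/* The fractional cost N/5 assumes that at most five of these register
   writes can be renamed per cycle. */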
15810 return { general_ops + loads + pred_ops + 1, 5 };
15811
15812 return 0;
15813 }
15814
15815 /* Like min_cycles_per_iter, but excluding predicate operations. */
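/* The result is the largest of the reduction latency, the rename estimate
   and the various ops/throughput ratios: for example, 6 general ops on a
   core that can issue 2 general ops per cycle impose a lower bound of
   3 cycles. */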
15816 fractional_cost
15817 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
15818 {
15819 auto *issue_info = base_issue_info ();
15820
15821 fractional_cost cycles = MAX (reduction_latency, 1);
15822 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
15823 cycles = std::max (cycles, { loads + stores,
15824 issue_info->loads_stores_per_cycle });
15825 cycles = std::max (cycles, { general_ops,
15826 issue_info->general_ops_per_cycle });
15827 cycles = std::max (cycles, rename_cycles_per_iter ());
15828 return cycles;
15829 }
15830
15831 /* Like min_cycles_per_iter, but including only the predicate operations. */
15832 fractional_cost
15833 aarch64_vec_op_count::min_pred_cycles_per_iter () const
15834 {
15835 if (auto *issue_info = sve_issue_info ())
15836 return { pred_ops, issue_info->pred_ops_per_cycle };
15837 return 0;
15838 }
15839
15840 /* Estimate the minimum number of cycles needed to issue the operations.
15841 This is a very simplistic model! */
15842 fractional_cost
15843 aarch64_vec_op_count::min_cycles_per_iter () const
15844 {
15845 return std::max (min_nonpred_cycles_per_iter (),
15846 min_pred_cycles_per_iter ());
15847 }
15848
15849 /* Dump information about the structure. */
15850 void
15851 aarch64_vec_op_count::dump () const
15852 {
15853 dump_printf_loc (MSG_NOTE, vect_location,
15854 " load operations = %d\n", loads);
15855 dump_printf_loc (MSG_NOTE, vect_location,
15856 " store operations = %d\n", stores);
15857 dump_printf_loc (MSG_NOTE, vect_location,
15858 " general operations = %d\n", general_ops);
15859 if (sve_issue_info ())
15860 dump_printf_loc (MSG_NOTE, vect_location,
15861 " predicate operations = %d\n", pred_ops);
15862 dump_printf_loc (MSG_NOTE, vect_location,
15863 " reduction latency = %d\n", reduction_latency);
15864 if (auto rcpi = rename_cycles_per_iter ())
15865 dump_printf_loc (MSG_NOTE, vect_location,
15866 " estimated cycles per iteration to rename = %f\n",
15867 rcpi.as_double ());
15868 if (auto pred_cpi = min_pred_cycles_per_iter ())
15869 {
15870 dump_printf_loc (MSG_NOTE, vect_location,
15871 " estimated min cycles per iteration"
15872 " without predication = %f\n",
15873 min_nonpred_cycles_per_iter ().as_double ());
15874 dump_printf_loc (MSG_NOTE, vect_location,
15875 " estimated min cycles per iteration"
15876 " for predication = %f\n", pred_cpi.as_double ());
15877 }
15878 if (auto cpi = min_cycles_per_iter ())
15879 dump_printf_loc (MSG_NOTE, vect_location,
15880 " estimated min cycles per iteration = %f\n",
15881 cpi.as_double ());
15882 }
15883
15884 /* Information about vector code that we're in the process of costing. */
15885 class aarch64_vector_costs : public vector_costs
15886 {
15887 public:
15888 aarch64_vector_costs (vec_info *, bool);
15889
15890 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
15891 stmt_vec_info stmt_info, slp_tree, tree vectype,
15892 int misalign,
15893 vect_cost_model_location where) override;
15894 void finish_cost (const vector_costs *) override;
15895 bool better_main_loop_than_p (const vector_costs *other) const override;
15896
15897 private:
15898 void record_potential_advsimd_unrolling (loop_vec_info);
15899 void analyze_loop_vinfo (loop_vec_info);
15900 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
15901 aarch64_vec_op_count *);
15902 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
15903 fractional_cost, unsigned int,
15904 unsigned int *, bool *);
15905 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
15906 unsigned int);
15907 bool prefer_unrolled_loop () const;
15908 unsigned int determine_suggested_unroll_factor ();
15909
15910 /* True if we have performed one-time initialization based on the
15911 vec_info. */
15912 bool m_analyzed_vinfo = false;
15913
15914 /* This loop uses an average operation that is not supported by SVE, but is
15915 supported by Advanced SIMD and SVE2. */
15916 bool m_has_avg = false;
15917
15918 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
15919 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
15920 SIMD code.
15921 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
15922 unsigned int m_vec_flags = 0;
15923
15924 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
15925 This means that code such as:
15926
15927 a[0] = x;
15928 a[1] = x;
15929
15930 will be costed as two scalar instructions and two vector instructions
15931 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
15932 wins if the costs are equal, because of the fact that the vector costs
15933 include constant initializations whereas the scalar costs don't.
15934 We would therefore tend to vectorize the code above, even though
15935 the scalar version can use a single STP.
15936
15937 We should eventually fix this and model LDP and STP in the main costs;
15938 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
15939 Until then, we look specifically for code that does nothing more than
15940 STP-like operations. We cost them on that basis in addition to the
15941 normal latency-based costs.
15942
15943 If the scalar or vector code could be a sequence of STPs +
15944 initialization, this variable counts the cost of the sequence,
15945 with 2 units per instruction. The variable is ~0U for other
15946 kinds of code. */
15947 unsigned int m_stp_sequence_cost = 0;
15948
15949 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
15950 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
15951 situations, we try to predict whether an Advanced SIMD implementation
15952 of the loop could be completely unrolled and become straight-line code.
15953 If so, it is generally better to use the Advanced SIMD version rather
15954 than length-agnostic SVE, since the SVE loop would execute an unknown
15955 number of times and so could not be completely unrolled in the same way.
15956
15957 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
15958 number of Advanced SIMD loop iterations that would be unrolled and
15959 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
15960 in the unrolled loop. Both values are zero if we're not applying
15961 the heuristic. */
15962 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
15963 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
15964
15965 /* If we're vectorizing a loop that executes a constant number of times,
15966 this variable gives the number of times that the vector loop would
15967 iterate, otherwise it is zero. */
15968 uint64_t m_num_vector_iterations = 0;
15969
15970 /* Used only when vectorizing loops. Estimates the number and kind of
15971 operations that would be needed by one iteration of the scalar
15972 or vector loop. There is one entry for each tuning option of
15973 interest. */
15974 auto_vec<aarch64_vec_op_count, 2> m_ops;
15975 };
15976
15977 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
15978 bool costing_for_scalar)
15979 : vector_costs (vinfo, costing_for_scalar),
15980 m_vec_flags (costing_for_scalar ? 0
15981 : aarch64_classify_vector_mode (vinfo->vector_mode))
15982 {
15983 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
15984 {
15985 m_ops.quick_push ({ issue_info, m_vec_flags });
15986 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
15987 {
15988 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
15989 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
15990 vf_factor });
15991 }
15992 }
15993 }
15994
15995 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
15996 vector_costs *
15997 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
15998 {
15999 return new aarch64_vector_costs (vinfo, costing_for_scalar);
16000 }
16001
16002 /* Return true if the current CPU should use the new costs defined
16003 in GCC 11. This should be removed for GCC 12 and above, with the
16004 costs applying to all CPUs instead. */
16005 static bool
16006 aarch64_use_new_vector_costs_p ()
16007 {
16008 return (aarch64_tune_params.extra_tuning_flags
16009 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
16010 }
16011
16012 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
16013 static const simd_vec_cost *
16014 aarch64_simd_vec_costs (tree vectype)
16015 {
16016 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16017 if (vectype != NULL
16018 && aarch64_sve_mode_p (TYPE_MODE (vectype))
16019 && costs->sve != NULL)
16020 return costs->sve;
16021 return costs->advsimd;
16022 }
16023
16024 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
16025 static const simd_vec_cost *
16026 aarch64_simd_vec_costs_for_flags (unsigned int flags)
16027 {
16028 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16029 if ((flags & VEC_ANY_SVE) && costs->sve)
16030 return costs->sve;
16031 return costs->advsimd;
16032 }
16033
16034 /* If STMT_INFO is a memory reference, return the scalar memory type,
16035 otherwise return null. */
16036 static tree
16037 aarch64_dr_type (stmt_vec_info stmt_info)
16038 {
16039 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
16040 return TREE_TYPE (DR_REF (dr));
16041 return NULL_TREE;
16042 }
16043
16044 /* Decide whether to use the unrolling heuristic described above
16045 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16046 describes the loop that we're vectorizing. */
16047 void
16048 aarch64_vector_costs::
16049 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
16050 {
16051 /* The heuristic only makes sense on targets that have the same
16052 vector throughput for SVE and Advanced SIMD. */
16053 if (!(aarch64_tune_params.extra_tuning_flags
16054 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
16055 return;
16056
16057 /* We only want to apply the heuristic if LOOP_VINFO is being
16058 vectorized for SVE. */
16059 if (!(m_vec_flags & VEC_ANY_SVE))
16060 return;
16061
16062 /* Check whether it is possible in principle to use Advanced SIMD
16063 instead. */
16064 if (aarch64_autovec_preference == 2)
16065 return;
16066
16067 /* We don't want to apply the heuristic to outer loops, since it's
16068 harder to track two levels of unrolling. */
16069 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
16070 return;
16071
16072 /* Only handle cases in which the number of Advanced SIMD iterations
16073 would be known at compile time but the number of SVE iterations
16074 would not. */
16075 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
16076 || aarch64_sve_vg.is_constant ())
16077 return;
16078
16079 /* Guess how many times the Advanced SIMD loop would iterate and make
16080 sure that it is within the complete unrolling limit. Even if the
16081 number of iterations is small enough, the number of statements might
16082 not be, which is why we need to estimate the number of statements too. */
16083 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
16084 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
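/* Since the estimated SVE VF scales with the estimated number of 128-bit
   quadwords per vector, this gives roughly the VF that an equivalent
   Advanced SIMD loop would use. */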
16085 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16086 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
16087 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
16088 return;
16089
16090 /* Record that we're applying the heuristic and should try to estimate
16091 the number of statements in the Advanced SIMD loop. */
16092 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
16093 }
16094
16095 /* Do one-time initialization of the aarch64_vector_costs given that we're
16096 costing the loop vectorization described by LOOP_VINFO. */
16097 void
16098 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
16099 {
16100 /* Record the number of times that the vector loop would execute,
16101 if known. */
16102 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16103 auto scalar_niters = max_stmt_executions_int (loop);
16104 if (scalar_niters >= 0)
16105 {
16106 unsigned int vf = vect_vf_for_cost (loop_vinfo);
16107 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16108 m_num_vector_iterations = scalar_niters / vf;
16109 else
16110 m_num_vector_iterations = CEIL (scalar_niters, vf);
16111 }
16112
16113 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16114 heuristic described above m_unrolled_advsimd_niters. */
16115 record_potential_advsimd_unrolling (loop_vinfo);
16116
16117 /* Record the issue information for any SVE WHILE instructions that the
16118 loop needs. */
16119 if (!m_ops.is_empty () && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16120 {
16121 unsigned int num_masks = 0;
16122 rgroup_controls *rgm;
16123 unsigned int num_vectors_m1;
16124 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
16125 if (rgm->type)
16126 num_masks += num_vectors_m1 + 1;
16127 for (auto &ops : m_ops)
16128 if (auto *issue = ops.sve_issue_info ())
16129 ops.pred_ops += num_masks * issue->while_pred_ops;
16130 }
16131 }
16132
16133 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16134 static int
16135 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16136 tree vectype,
16137 int misalign ATTRIBUTE_UNUSED)
16138 {
16139 unsigned elements;
16140 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16141 bool fp = false;
16142
16143 if (vectype != NULL)
16144 fp = FLOAT_TYPE_P (vectype);
16145
16146 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16147
16148 switch (type_of_cost)
16149 {
16150 case scalar_stmt:
16151 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16152
16153 case scalar_load:
16154 return costs->scalar_load_cost;
16155
16156 case scalar_store:
16157 return costs->scalar_store_cost;
16158
16159 case vector_stmt:
16160 return fp ? simd_costs->fp_stmt_cost
16161 : simd_costs->int_stmt_cost;
16162
16163 case vector_load:
16164 return simd_costs->align_load_cost;
16165
16166 case vector_store:
16167 return simd_costs->store_cost;
16168
16169 case vec_to_scalar:
16170 return simd_costs->vec_to_scalar_cost;
16171
16172 case scalar_to_vec:
16173 return simd_costs->scalar_to_vec_cost;
16174
16175 case unaligned_load:
16176 case vector_gather_load:
16177 return simd_costs->unalign_load_cost;
16178
16179 case unaligned_store:
16180 case vector_scatter_store:
16181 return simd_costs->unalign_store_cost;
16182
16183 case cond_branch_taken:
16184 return costs->cond_taken_branch_cost;
16185
16186 case cond_branch_not_taken:
16187 return costs->cond_not_taken_branch_cost;
16188
16189 case vec_perm:
16190 return simd_costs->permute_cost;
16191
16192 case vec_promote_demote:
16193 return fp ? simd_costs->fp_stmt_cost
16194 : simd_costs->int_stmt_cost;
16195
16196 case vec_construct:
16197 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16198 return elements / 2 + 1;
16199
16200 default:
16201 gcc_unreachable ();
16202 }
16203 }
16204
16205 /* If an access of kind KIND for STMT_INFO represents one vector of an
16206 LD[234] or ST[234] operation, return the total number of vectors
16207 (2, 3 or 4), otherwise return a value outside that range. */
16208 static int
16209 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
16210 {
16211 if ((kind == vector_load
16212 || kind == unaligned_load
16213 || kind == vector_store
16214 || kind == unaligned_store)
16215 && STMT_VINFO_DATA_REF (stmt_info))
16216 {
16217 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16218 if (stmt_info
16219 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
16220 return DR_GROUP_SIZE (stmt_info);
16221 }
16222 return 0;
16223 }
16224
16225 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16226 vectors would produce a series of LDP or STP operations. KIND is the
16227 kind of statement that STMT_INFO represents. */
16228 static bool
16229 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16230 stmt_vec_info stmt_info)
16231 {
16232 switch (kind)
16233 {
16234 case vector_load:
16235 case vector_store:
16236 case unaligned_load:
16237 case unaligned_store:
16238 break;
16239
16240 default:
16241 return false;
16242 }
16243
16244 if (aarch64_tune_params.extra_tuning_flags
16245 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16246 return false;
16247
16248 return is_gimple_assign (stmt_info->stmt);
16249 }
16250
16251 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16252 or multiply-subtract sequence that might be suitable for fusing into a
16253 single instruction. If VEC_FLAGS is zero, analyze the operation as
16254 a scalar one, otherwise analyze it as an operation on vectors with those
16255 VEC_* flags. */
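/* For example, for:

     _1 = _2 * _3;
     _4 = _1 + _5;

   the second statement is a candidate, since it could normally be
   implemented as a single FMLA/FMLS or MLA/MLS. */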
16256 static bool
16257 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16258 unsigned int vec_flags)
16259 {
16260 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16261 if (!assign)
16262 return false;
16263 tree_code code = gimple_assign_rhs_code (assign);
16264 if (code != PLUS_EXPR && code != MINUS_EXPR)
16265 return false;
16266
16267 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
16268 || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
16269 return false;
16270
16271 for (int i = 1; i < 3; ++i)
16272 {
16273 tree rhs = gimple_op (assign, i);
16274 /* ??? Should we try to check for a single use as well? */
16275 if (TREE_CODE (rhs) != SSA_NAME)
16276 continue;
16277
16278 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16279 if (!def_stmt_info
16280 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16281 continue;
16282 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16283 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16284 continue;
16285
16286 if (vec_flags & VEC_ADVSIMD)
16287 {
16288 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16289 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16290 only supports MLA forms, so will require a move if the result
16291 cannot be tied to the accumulator. The most important case in
16292 which this is true is when the accumulator input is invariant. */
16293 rhs = gimple_op (assign, 3 - i);
16294 if (TREE_CODE (rhs) != SSA_NAME)
16295 return false;
16296 def_stmt_info = vinfo->lookup_def (rhs);
16297 if (!def_stmt_info
16298 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
16299 return false;
16300 }
16301
16302 return true;
16303 }
16304 return false;
16305 }
16306
16307 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16308 in-loop reduction that SVE supports directly, return its latency in cycles,
16309 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16310 instructions. */
16311 static unsigned int
16312 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16313 stmt_vec_info stmt_info,
16314 const sve_vec_cost *sve_costs)
16315 {
16316 switch (vect_reduc_type (vinfo, stmt_info))
16317 {
16318 case EXTRACT_LAST_REDUCTION:
16319 return sve_costs->clast_cost;
16320
16321 case FOLD_LEFT_REDUCTION:
16322 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16323 {
16324 case E_HFmode:
16325 case E_BFmode:
16326 return sve_costs->fadda_f16_cost;
16327
16328 case E_SFmode:
16329 return sve_costs->fadda_f32_cost;
16330
16331 case E_DFmode:
16332 return sve_costs->fadda_f64_cost;
16333
16334 default:
16335 break;
16336 }
16337 break;
16338 }
16339
16340 return 0;
16341 }
16342
16343 /* STMT_INFO describes a loop-carried operation in the original scalar code
16344 that we are considering implementing as a reduction. Return one of the
16345 following values, depending on VEC_FLAGS:
16346
16347 - If VEC_FLAGS is zero, return the loop carry latency of the original
16348 scalar operation.
16349
16350 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16351 Advanced SIMD implementation.
16352
16353 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16354 SVE implementation. */
16355 static unsigned int
16356 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16357 unsigned int vec_flags)
16358 {
16359 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16360 const sve_vec_cost *sve_costs = nullptr;
16361 if (vec_flags & VEC_ANY_SVE)
16362 sve_costs = aarch64_tune_params.vec_costs->sve;
16363
16364 /* If the caller is asking for the SVE latency, check for forms of reduction
16365 that only SVE can handle directly. */
16366 if (sve_costs)
16367 {
16368 unsigned int latency
16369 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16370 if (latency)
16371 return latency;
16372 }
16373
16374 /* Handle scalar costs. */
16375 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16376 if (vec_flags == 0)
16377 {
16378 if (is_float)
16379 return vec_costs->scalar_fp_stmt_cost;
16380 return vec_costs->scalar_int_stmt_cost;
16381 }
16382
16383 /* Otherwise, the loop body just contains normal integer or FP operations,
16384 with a vector reduction outside the loop. */
16385 const simd_vec_cost *simd_costs
16386 = aarch64_simd_vec_costs_for_flags (vec_flags);
16387 if (is_float)
16388 return simd_costs->fp_stmt_cost;
16389 return simd_costs->int_stmt_cost;
16390 }
16391
16392 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16393 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16394 try to subdivide the target-independent categorization provided by KIND
16395 to get a more accurate cost. */
16396 static fractional_cost
16397 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16398 stmt_vec_info stmt_info,
16399 fractional_cost stmt_cost)
16400 {
16401 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16402 the extension with the load. */
16403 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16404 return 0;
16405
16406 return stmt_cost;
16407 }
16408
16409 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16410 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16411 when vectorized would operate on vector type VECTYPE. Try to subdivide
16412 the target-independent categorization provided by KIND to get a more
16413 accurate cost. WHERE specifies where the cost associated with KIND
16414 occurs. */
16415 static fractional_cost
16416 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16417 stmt_vec_info stmt_info, tree vectype,
16418 enum vect_cost_model_location where,
16419 fractional_cost stmt_cost)
16420 {
16421 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16422 const sve_vec_cost *sve_costs = nullptr;
16423 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16424 sve_costs = aarch64_tune_params.vec_costs->sve;
16425
16426 /* It's generally better to avoid costing inductions, since the induction
16427 will usually be hidden by other operations. This is particularly true
16428 for things like COND_REDUCTIONS. */
16429 if (is_a<gphi *> (stmt_info->stmt))
16430 return 0;
16431
16432 /* Detect cases in which vec_to_scalar is describing the extraction of a
16433 vector element in preparation for a scalar store. The store itself is
16434 costed separately. */
16435 if (vect_is_store_elt_extraction (kind, stmt_info))
16436 return simd_costs->store_elt_extra_cost;
16437
16438 /* Detect SVE gather loads, which are costed as a single scalar_load
16439 for each element. We therefore need to divide the full-instruction
16440 cost by the number of elements in the vector. */
16441 if (kind == scalar_load
16442 && sve_costs
16443 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16444 {
16445 unsigned int nunits = vect_nunits_for_cost (vectype);
16446 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16447 return { sve_costs->gather_load_x64_cost, nunits };
16448 return { sve_costs->gather_load_x32_cost, nunits };
16449 }
16450
16451 /* Detect cases in which a scalar_store is really storing one element
16452 in a scatter operation. */
16453 if (kind == scalar_store
16454 && sve_costs
16455 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16456 return sve_costs->scatter_store_elt_cost;
16457
16458 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16459 if (kind == vec_to_scalar
16460 && where == vect_body
16461 && sve_costs)
16462 {
16463 unsigned int latency
16464 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16465 if (latency)
16466 return latency;
16467 }
16468
16469 /* Detect cases in which vec_to_scalar represents a single reduction
16470 instruction like FADDP or MAXV. */
16471 if (kind == vec_to_scalar
16472 && where == vect_epilogue
16473 && vect_is_reduction (stmt_info))
16474 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16475 {
16476 case E_QImode:
16477 return simd_costs->reduc_i8_cost;
16478
16479 case E_HImode:
16480 return simd_costs->reduc_i16_cost;
16481
16482 case E_SImode:
16483 return simd_costs->reduc_i32_cost;
16484
16485 case E_DImode:
16486 return simd_costs->reduc_i64_cost;
16487
16488 case E_HFmode:
16489 case E_BFmode:
16490 return simd_costs->reduc_f16_cost;
16491
16492 case E_SFmode:
16493 return simd_costs->reduc_f32_cost;
16494
16495 case E_DFmode:
16496 return simd_costs->reduc_f64_cost;
16497
16498 default:
16499 break;
16500 }
16501
16502 /* Otherwise stick with the original categorization. */
16503 return stmt_cost;
16504 }
16505
16506 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16507 for STMT_INFO, which has cost kind KIND and which when vectorized would
16508 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16509 targets. */
16510 static fractional_cost
16511 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16512 stmt_vec_info stmt_info, tree vectype,
16513 fractional_cost stmt_cost)
16514 {
16515 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16516 vector register size or number of units. Integer promotions of this
16517 type therefore map to SXT[BHW] or UXT[BHW].
16518
16519 Most loads have extending forms that can do the sign or zero extension
16520 on the fly. Optimistically assume that a load followed by an extension
16521 will fold to this form during combine, and that the extension therefore
16522 comes for free. */
16523 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16524 stmt_cost = 0;
16525
16526 /* For similar reasons, vector_stmt integer truncations are a no-op,
16527 because we can just ignore the unused upper bits of the source. */
16528 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16529 stmt_cost = 0;
16530
16531 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16532 but there are no equivalent instructions for SVE. This means that
16533 (all other things being equal) 128-bit SVE needs twice as many load
16534 and store instructions as Advanced SIMD in order to process vector pairs.
16535
16536 Also, scalar code can often use LDP and STP to access pairs of values,
16537 so it is too simplistic to say that one SVE load or store replaces
16538 VF scalar loads and stores.
16539
16540 Ideally we would account for this in the scalar and Advanced SIMD
16541 costs by making suitable load/store pairs as cheap as a single
16542 load/store. However, that would be a very invasive change and in
16543 practice it tends to stress other parts of the cost model too much.
16544 E.g. stores of scalar constants currently count just a store,
16545 whereas stores of vector constants count a store and a vec_init.
16546 This is an artificial distinction for AArch64, where stores of
16547 nonzero scalar constants need the same kind of register invariant
16548 as vector stores.
16549
16550 An alternative would be to double the cost of any SVE loads and stores
16551 that could be paired in Advanced SIMD (and possibly also paired in
16552 scalar code). But this tends to stress other parts of the cost model
16553 in the same way. It also means that we can fall back to Advanced SIMD
16554 even if full-loop predication would have been useful.
16555
16556 Here we go for a more conservative version: double the costs of SVE
16557 loads and stores if one iteration of the scalar loop processes enough
16558 elements for it to use a whole number of Advanced SIMD LDP or STP
16559 instructions. This makes it very likely that the VF would be 1 for
16560 Advanced SIMD, and so no epilogue should be needed. */
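/* For example, a group of four 64-bit elements spans 256 bits per scalar
   iteration, which is exactly one Advanced SIMD LDP/STP of Q registers,
   so the SVE load/store cost is doubled below. */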
16561 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16562 {
16563 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16564 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16565 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16566 if (multiple_p (count * elt_bits, 256)
16567 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16568 stmt_cost *= 2;
16569 }
16570
16571 return stmt_cost;
16572 }
16573
16574 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16575 and which when vectorized would operate on vector type VECTYPE. Add the
16576 cost of any embedded operations. */
16577 static fractional_cost
16578 aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
16579 tree vectype, fractional_cost stmt_cost)
16580 {
16581 if (vectype)
16582 {
16583 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16584
16585 /* Detect cases in which a vector load or store represents an
16586 LD[234] or ST[234] instruction. */
16587 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16588 {
16589 case 2:
16590 stmt_cost += simd_costs->ld2_st2_permute_cost;
16591 break;
16592
16593 case 3:
16594 stmt_cost += simd_costs->ld3_st3_permute_cost;
16595 break;
16596
16597 case 4:
16598 stmt_cost += simd_costs->ld4_st4_permute_cost;
16599 break;
16600 }
16601
16602 if (kind == vector_stmt || kind == vec_to_scalar)
16603 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16604 {
16605 if (FLOAT_TYPE_P (cmp_type))
16606 stmt_cost += simd_costs->fp_stmt_cost;
16607 else
16608 stmt_cost += simd_costs->int_stmt_cost;
16609 }
16610 }
16611
16612 if (kind == scalar_stmt)
16613 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16614 {
16615 if (FLOAT_TYPE_P (cmp_type))
16616 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16617 else
16618 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16619 }
16620
16621 return stmt_cost;
16622 }
16623
16624 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16625 and they describe an operation in the body of a vector loop. Record issue
16626 information relating to the vector operation in OPS. */
16627 void
16628 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
16629 stmt_vec_info stmt_info,
16630 aarch64_vec_op_count *ops)
16631 {
16632 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
16633 if (!base_issue)
16634 return;
16635 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
16636 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
16637
16638 /* Calculate the minimum cycles per iteration imposed by a reduction
16639 operation. */
16640 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16641 && vect_is_reduction (stmt_info))
16642 {
16643 unsigned int base
16644 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
16645
16646 /* ??? Ideally we'd do COUNT reductions in parallel, but unfortunately
16647 that's not yet the case. */
16648 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
16649 }
16650
16651 /* Assume that multiply-adds will become a single operation. */
16652 if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
16653 return;
16654
16655 /* Count the basic operation cost associated with KIND. */
16656 switch (kind)
16657 {
16658 case cond_branch_taken:
16659 case cond_branch_not_taken:
16660 case vector_gather_load:
16661 case vector_scatter_store:
16662 /* We currently don't expect these to be used in a loop body. */
16663 break;
16664
16665 case vec_perm:
16666 case vec_promote_demote:
16667 case vec_construct:
16668 case vec_to_scalar:
16669 case scalar_to_vec:
16670 case vector_stmt:
16671 case scalar_stmt:
16672 ops->general_ops += count;
16673 break;
16674
16675 case scalar_load:
16676 case vector_load:
16677 case unaligned_load:
16678 ops->loads += count;
16679 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16680 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
16681 break;
16682
16683 case vector_store:
16684 case unaligned_store:
16685 case scalar_store:
16686 ops->stores += count;
16687 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16688 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
16689 break;
16690 }
16691
16692 /* Add any embedded comparison operations. */
16693 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16694 && vect_embedded_comparison_type (stmt_info))
16695 ops->general_ops += count;
16696
16697 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
16698 have only accounted for one. */
16699 if ((kind == vector_stmt || kind == vec_to_scalar)
16700 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
16701 ops->general_ops += count;
16702
16703 /* Count the predicate operations needed by an SVE comparison. */
16704 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
16705 if (tree type = vect_comparison_type (stmt_info))
16706 {
16707 unsigned int base = (FLOAT_TYPE_P (type)
16708 ? sve_issue->fp_cmp_pred_ops
16709 : sve_issue->int_cmp_pred_ops);
16710 ops->pred_ops += base * count;
16711 }
16712
16713 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
16714 if (simd_issue)
16715 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16716 {
16717 case 2:
16718 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
16719 break;
16720
16721 case 3:
16722 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
16723 break;
16724
16725 case 4:
16726 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
16727 break;
16728 }
16729
16730 /* Add any overhead associated with gather loads and scatter stores. */
16731 if (sve_issue
16732 && (kind == scalar_load || kind == scalar_store)
16733 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16734 {
16735 unsigned int pairs = CEIL (count, 2);
16736 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
16737 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
16738 }
16739 }
16740
16741 /* Return true if STMT_INFO contains a memory access and if the constant
16742 component of the memory address is aligned to SIZE bytes. */
16743 static bool
16744 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
16745 poly_uint64 size)
16746 {
16747 if (!STMT_VINFO_DATA_REF (stmt_info))
16748 return false;
16749
16750 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
16751 stmt_info = first_stmt;
16752 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
16753 /* Needed for gathers & scatters, for example. */
16754 if (!constant_offset)
16755 return false;
16756
16757 return multiple_p (wi::to_poly_offset (constant_offset), size);
16758 }
16759
16760 /* Check if a scalar or vector stmt could be part of a region of code
16761 that does nothing more than store values to memory, in the scalar
16762 case using STP. Return the cost of the stmt if so, counting 2 for
16763 one instruction. Return ~0U otherwise.
16764
16765 The arguments are a subset of those passed to add_stmt_cost. */
16766 unsigned int
16767 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
16768 stmt_vec_info stmt_info, tree vectype)
16769 {
16770 /* Code that stores vector constants uses a vector_load to create
16771 the constant. We don't apply the heuristic to that case for two
16772 main reasons:
16773
16774 - At the moment, STPs are only formed via peephole2, and the
16775 constant scalar moves would often come between STRs and so
16776 prevent STP formation.
16777
16778 - The scalar code also has to load the constant somehow, and that
16779 isn't costed. */
16780 switch (kind)
16781 {
16782 case scalar_to_vec:
16783 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
16784 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
16785
16786 case vec_construct:
16787 if (FLOAT_TYPE_P (vectype))
16788 /* Count 1 insn for the maximum number of FP->SIMD INS
16789 instructions. */
16790 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
16791
16792 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
16793 maximum number of GPR->SIMD INS instructions. */
16794 return vect_nunits_for_cost (vectype) * 4 * count;
16795
16796 case vector_store:
16797 case unaligned_store:
16798 /* Count 1 insn per vector if we can't form STP Q pairs. */
16799 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16800 return count * 2;
16801 if (aarch64_tune_params.extra_tuning_flags
16802 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16803 return count * 2;
16804
16805 if (stmt_info)
16806 {
16807 /* Assume we won't be able to use STP if the constant offset
16808 component of the address is misaligned. ??? This could be
16809 removed if we formed STP pairs earlier, rather than relying
16810 on peephole2. */
16811 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
16812 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16813 return count * 2;
16814 }
16815 return CEIL (count, 2) * 2;
16816
16817 case scalar_store:
16818 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
16819 {
16820 /* Check for a mode in which STP pairs can be formed. */
16821 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
16822 if (maybe_ne (size, 4) && maybe_ne (size, 8))
16823 return ~0U;
16824
16825 /* Assume we won't be able to use STP if the constant offset
16826 component of the address is misaligned. ??? This could be
16827 removed if we formed STP pairs earlier, rather than relying
16828 on peephole2. */
16829 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16830 return ~0U;
16831 }
16832 return count;
16833
16834 default:
16835 return ~0U;
16836 }
16837 }
16838
16839 unsigned
16840 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
16841 stmt_vec_info stmt_info, slp_tree,
16842 tree vectype, int misalign,
16843 vect_cost_model_location where)
16844 {
16845 fractional_cost stmt_cost
16846 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
16847
16848 bool in_inner_loop_p = (where == vect_body
16849 && stmt_info
16850 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
16851
16852 /* Do one-time initialization based on the vinfo. */
16853 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
16854 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
16855 {
16856 if (loop_vinfo)
16857 analyze_loop_vinfo (loop_vinfo);
16858
16859 m_analyzed_vinfo = true;
16860 }
16861
16862 /* Apply the heuristic described above m_stp_sequence_cost. */
16863 if (m_stp_sequence_cost != ~0U)
16864 {
16865 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
16866 stmt_info, vectype);
16867 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
16868 }
16869
16870 /* Try to get a more accurate cost by looking at STMT_INFO instead
16871 of just looking at KIND. */
16872 if (stmt_info && aarch64_use_new_vector_costs_p ())
16873 {
16874 /* If we scalarize a strided store, the vectorizer costs one
16875 vec_to_scalar for each element. However, we can store the first
16876 element using an FP store without a separate extract step. */
16877 if (vect_is_store_elt_extraction (kind, stmt_info))
16878 count -= 1;
16879
16880 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
16881 stmt_info, stmt_cost);
16882
16883 if (vectype && m_vec_flags)
16884 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
16885 stmt_info, vectype,
16886 where, stmt_cost);
16887 }
16888
16889 /* Do any SVE-specific adjustments to the cost. */
16890 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
16891 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
16892 vectype, stmt_cost);
16893
16894 if (stmt_info && aarch64_use_new_vector_costs_p ())
16895 {
16896 /* Account for any extra "embedded" costs that apply additively
16897 to the base cost calculated above. */
16898 stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
16899 stmt_cost);
16900
16901 /* If we're recording a nonzero vector loop body cost for the
16902 innermost loop, also estimate the operations that would need
16903 to be issued by all relevant implementations of the loop. */
16904 if (loop_vinfo
16905 && (m_costing_for_scalar || where == vect_body)
16906 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
16907 && stmt_cost != 0)
16908 for (auto &ops : m_ops)
16909 count_ops (count, kind, stmt_info, &ops);
16910
16911 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
16912 estimate the number of statements in the unrolled Advanced SIMD
16913 loop. For simplicity, we assume that one iteration of the
16914 Advanced SIMD loop would need the same number of statements
16915 as one iteration of the SVE loop. */
16916 if (where == vect_body && m_unrolled_advsimd_niters)
16917 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
16918
16919 /* Detect the use of an averaging operation. */
16920 gimple *stmt = stmt_info->stmt;
16921 if (is_gimple_call (stmt)
16922 && gimple_call_internal_p (stmt))
16923 {
16924 switch (gimple_call_internal_fn (stmt))
16925 {
16926 case IFN_AVG_FLOOR:
16927 case IFN_AVG_CEIL:
16928 m_has_avg = true;
16929 default:
16930 break;
16931 }
16932 }
16933 }
16934 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
16935 }
16936
16937 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
16938 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
16939 says that we should prefer the Advanced SIMD loop. */
16940 bool
16941 aarch64_vector_costs::prefer_unrolled_loop () const
16942 {
16943 if (!m_unrolled_advsimd_stmts)
16944 return false;
16945
16946 if (dump_enabled_p ())
16947 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
16948 " unrolled Advanced SIMD loop = "
16949 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
16950 m_unrolled_advsimd_stmts);
16951
16952 /* The balance here is tricky. On the one hand, we can't be sure whether
16953 the code is vectorizable with Advanced SIMD or not. However, even if
16954 it isn't vectorizable with Advanced SIMD, there's a possibility that
16955 the scalar code could also be unrolled. Some of the code might then
16956 benefit from SLP, or from using LDP and STP. We therefore apply
16957 the heuristic regardless of can_use_advsimd_p. */
16958 return (m_unrolled_advsimd_stmts
16959 && (m_unrolled_advsimd_stmts
16960 <= (unsigned int) param_max_completely_peeled_insns));
16961 }
16962
16963 /* Subroutine of adjust_body_cost for handling SVE. Use OPS to work out
16964 how fast the SVE code can be issued and compare it to the equivalent
16965 value for scalar code (SCALAR_CYCLES_PER_ITER).
16968
16969 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
16970 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
16971 is true if we think the loop body is too expensive. */
16972
16973 fractional_cost
16974 aarch64_vector_costs::
16975 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
16976 fractional_cost scalar_cycles_per_iter,
16977 unsigned int orig_body_cost, unsigned int *body_cost,
16978 bool *should_disparage)
16979 {
16980 if (dump_enabled_p ())
16981 ops->dump ();
16982
16983 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
16984 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
16985
16986 /* If the scalar version of the loop could issue at least as
16987 quickly as the predicate parts of the SVE loop, make the SVE loop
16988 prohibitively expensive. In this case vectorization is adding an
16989 overhead that the original scalar code didn't have.
16990
16991 This is mostly intended to detect cases in which WHILELOs dominate
16992 for very tight loops, which is something that normal latency-based
16993 costs would not model. Adding this kind of cliffedge would be
16994 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
16995 code in the caller handles that case in a more conservative way. */
16996 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
16997 if (scalar_cycles_per_iter < sve_estimate)
16998 {
16999 unsigned int min_cost
17000 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
17001 if (*body_cost < min_cost)
17002 {
17003 if (dump_enabled_p ())
17004 dump_printf_loc (MSG_NOTE, vect_location,
17005 "Increasing body cost to %d because the"
17006 " scalar code could issue within the limit"
17007 " imposed by predicate operations\n",
17008 min_cost);
17009 *body_cost = min_cost;
17010 *should_disparage = true;
17011 }
17012 }
17013
17014 return sve_cycles_per_iter;
17015 }
17016
17017 unsigned int
17018 aarch64_vector_costs::determine_suggested_unroll_factor ()
17019 {
17020 bool sve = m_vec_flags & VEC_ANY_SVE;
17021 /* If we are trying to unroll an Advanced SIMD main loop that contains
17022 an averaging operation that we do not support with SVE and we might use a
17023 predicated epilogue, we need to be conservative and block unrolling as
17024 this might lead to a less optimal loop for the first and only epilogue
17025 using the original loop's vectorization factor.
17026 TODO: Remove this constraint when we add support for multiple epilogue
17027 vectorization. */
17028 if (!sve && !TARGET_SVE2 && m_has_avg)
17029 return 1;
17030
17031 unsigned int max_unroll_factor = 1;
17032 for (auto vec_ops : m_ops)
17033 {
17034 aarch64_simd_vec_issue_info const *vec_issue
17035 = vec_ops.simd_issue_info ();
17036 if (!vec_issue)
17037 return 1;
17038 /* Limit the unroll factor to a value adjustable by the user; the default
17039 value is 4. */
17040 unsigned int unroll_factor = aarch64_vect_unroll_limit;
17041 unsigned int factor
17042 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
17043 unsigned int temp;
17044
17045 /* Sanity check, this should never happen. */
17046 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
17047 return 1;
17048
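/* For each class of operation, limit the unroll factor to roughly the
   number of iterations whose operations the core could issue during
   FACTOR cycles (the latency of the longest reduction chain, or 1 if
   there is none). For example, 1 store per iteration at 2 stores per
   cycle with a reduction latency of 4 allows up to 8 iterations. */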
17049 /* Check stores. */
17050 if (vec_ops.stores > 0)
17051 {
17052 temp = CEIL (factor * vec_issue->stores_per_cycle,
17053 vec_ops.stores);
17054 unroll_factor = MIN (unroll_factor, temp);
17055 }
17056
17057 /* Check loads + stores. */
17058 if (vec_ops.loads > 0)
17059 {
17060 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
17061 vec_ops.loads + vec_ops.stores);
17062 unroll_factor = MIN (unroll_factor, temp);
17063 }
17064
17065 /* Check general ops. */
17066 if (vec_ops.general_ops > 0)
17067 {
17068 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
17069 vec_ops.general_ops);
17070 unroll_factor = MIN (unroll_factor, temp);
17071 }
17072 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
17073 }
17074
17075 /* Make sure unroll factor is power of 2. */
17076 return 1 << ceil_log2 (max_unroll_factor);
17077 }
17078
17079 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17080 and return the new cost. */
17081 unsigned int
17082 aarch64_vector_costs::
17083 adjust_body_cost (loop_vec_info loop_vinfo,
17084 const aarch64_vector_costs *scalar_costs,
17085 unsigned int body_cost)
17086 {
17087 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
17088 return body_cost;
17089
17090 const auto &scalar_ops = scalar_costs->m_ops[0];
17091 const auto &vector_ops = m_ops[0];
17092 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
17093 unsigned int orig_body_cost = body_cost;
17094 bool should_disparage = false;
17095
17096 if (dump_enabled_p ())
17097 dump_printf_loc (MSG_NOTE, vect_location,
17098 "Original vector body cost = %d\n", body_cost);
17099
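/* Convert the scalar estimate to cycles per vector iteration's worth of
   work, so that it can be compared directly with the vector estimates
   below. */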
17100 fractional_cost scalar_cycles_per_iter
17101 = scalar_ops.min_cycles_per_iter () * estimated_vf;
17102
17103 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
17104
17105 if (dump_enabled_p ())
17106 {
17107 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
17108 dump_printf_loc (MSG_NOTE, vect_location,
17109 "Vector loop iterates at most %wd times\n",
17110 m_num_vector_iterations);
17111 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
17112 scalar_ops.dump ();
17113 dump_printf_loc (MSG_NOTE, vect_location,
17114 " estimated cycles per vector iteration"
17115 " (for VF %d) = %f\n",
17116 estimated_vf, scalar_cycles_per_iter.as_double ());
17117 }
17118
17119 if (vector_ops.sve_issue_info ())
17120 {
17121 if (dump_enabled_p ())
17122 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
17123 vector_cycles_per_iter
17124 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17125 orig_body_cost, &body_cost, &should_disparage);
17126
17127 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17128 {
17129 /* Also take Neoverse V1 tuning into account, doubling the
17130 scalar and Advanced SIMD estimates to account for the
17131 doubling in SVE vector length. */
17132 if (dump_enabled_p ())
17133 dump_printf_loc (MSG_NOTE, vect_location,
17134 "Neoverse V1 estimate:\n");
17135 auto vf_factor = m_ops[1].vf_factor ();
17136 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17137 orig_body_cost, &body_cost, &should_disparage);
17138 }
17139 }
17140 else
17141 {
17142 if (dump_enabled_p ())
17143 {
17144 dump_printf_loc (MSG_NOTE, vect_location,
17145 "Vector issue estimate:\n");
17146 vector_ops.dump ();
17147 }
17148 }
17149
17150 /* Decide whether to stick to latency-based costs or whether to try to
17151 take issue rates into account. */
17152 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
17153 if (m_vec_flags & VEC_ANY_SVE)
17154 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
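/* For example, a threshold of 6 iterations becomes CEIL (6, 2) = 3 when
   the estimated SVE vector length is 256 bits (VQ 2).  */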
17155
17156 if (m_num_vector_iterations >= 1
17157 && m_num_vector_iterations < threshold)
17158 {
17159 if (dump_enabled_p ())
17160 dump_printf_loc (MSG_NOTE, vect_location,
17161 "Low iteration count, so using pure latency"
17162 " costs\n");
17163 }
17164 /* Increase the cost of the vector code if it looks like the scalar code
17165 could issue more quickly. These values are only rough estimates,
17166 so minor differences should only result in minor changes. */
17167 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
17168 {
17169 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17170 scalar_cycles_per_iter);
17171 if (dump_enabled_p ())
17172 dump_printf_loc (MSG_NOTE, vect_location,
17173 "Increasing body cost to %d because scalar code"
17174 " would issue more quickly\n", body_cost);
17175 }
17176 /* In general, it's expected that the proposed vector code would be able
17177 to issue more quickly than the original scalar code. This should
17178 already be reflected to some extent in the latency-based costs.
17179
17180 However, the latency-based costs effectively assume that the scalar
17181 code and the vector code execute serially, which tends to underplay
17182 one important case: if the real (non-serialized) execution time of
17183 a scalar iteration is dominated by loop-carried dependencies,
17184 and if the vector code is able to reduce both the length of
17185 the loop-carried dependencies *and* the number of cycles needed
17186 to issue the code in general, we can be more confident that the
17187 vector code is an improvement, even if adding the other (non-loop-carried)
17188 latencies tends to hide this saving. We therefore reduce the cost of the
17189 vector loop body in proportion to the saving. */
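/* For example (illustrative numbers): if the scalar code is estimated at
   8 cycles per vector iteration, all of which come from a loop-carried
   reduction, while the vector code needs 5 cycles per iteration and has
   a shorter reduction chain, the vector body cost is scaled by 5/8
   below.  */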
17190 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
17191 && scalar_ops.reduction_latency == scalar_cycles_per_iter
17192 && scalar_cycles_per_iter > vector_cycles_per_iter
17193 && !should_disparage)
17194 {
17195 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17196 scalar_cycles_per_iter);
17197 if (dump_enabled_p ())
17198 dump_printf_loc (MSG_NOTE, vect_location,
17199 "Decreasing body cost to %d to account for smaller"
17200 " reduction latency\n", body_cost);
17201 }
17202
17203 return body_cost;
17204 }
17205
17206 void
17207 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
17208 {
17209 auto *scalar_costs
17210 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
17211 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17212 if (loop_vinfo
17213 && m_vec_flags
17214 && aarch64_use_new_vector_costs_p ())
17215 {
17216 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
17217 m_costs[vect_body]);
17218 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
17219 }
17220
17221 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17222 the scalar code in the event of a tie, since there is more chance
17223 of scalar code being optimized with surrounding operations. */
17224 if (!loop_vinfo
17225 && scalar_costs
17226 && m_stp_sequence_cost != ~0U
17227 && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
17228 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
17229
17230 vector_costs::finish_cost (scalar_costs);
17231 }
17232
17233 bool
17234 aarch64_vector_costs::
17235 better_main_loop_than_p (const vector_costs *uncast_other) const
17236 {
17237 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
17238
17239 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
17240 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
17241
17242 if (dump_enabled_p ())
17243 dump_printf_loc (MSG_NOTE, vect_location,
17244 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17245 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17246 vect_vf_for_cost (this_loop_vinfo),
17247 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17248 vect_vf_for_cost (other_loop_vinfo));
17249
17250 /* Apply the unrolling heuristic described above
17251 m_unrolled_advsimd_niters. */
17252 if (bool (m_unrolled_advsimd_stmts)
17253 != bool (other->m_unrolled_advsimd_stmts))
17254 {
17255 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
17256 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
17257 if (this_prefer_unrolled != other_prefer_unrolled)
17258 {
17259 if (dump_enabled_p ())
17260 dump_printf_loc (MSG_NOTE, vect_location,
17261 "Preferring Advanced SIMD loop because"
17262 " it can be unrolled\n");
17263 return other_prefer_unrolled;
17264 }
17265 }
17266
17267 for (unsigned int i = 0; i < m_ops.length (); ++i)
17268 {
17269 if (dump_enabled_p ())
17270 {
17271 if (i)
17272 dump_printf_loc (MSG_NOTE, vect_location,
17273 "Reconsidering with subtuning %d\n", i);
17274 dump_printf_loc (MSG_NOTE, vect_location,
17275 "Issue info for %s loop:\n",
17276 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17277 this->m_ops[i].dump ();
17278 dump_printf_loc (MSG_NOTE, vect_location,
17279 "Issue info for %s loop:\n",
17280 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17281 other->m_ops[i].dump ();
17282 }
17283
17284 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17285 * this->m_ops[i].vf_factor ());
17286 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17287 * other->m_ops[i].vf_factor ());
17288
17289 /* If it appears that one loop could process the same amount of data
17290 in fewer cycles, prefer that loop over the other one. */
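/* Cross-multiplying by the other loop's VF compares cycles per element
   without fractional division; e.g. 4 cycles per iteration at VF 8 beats
   3 cycles per iteration at VF 4, since 4 * 4 = 16 < 3 * 8 = 24.  */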
17291 fractional_cost this_cost
17292 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17293 fractional_cost other_cost
17294 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17295 if (dump_enabled_p ())
17296 {
17297 dump_printf_loc (MSG_NOTE, vect_location,
17298 "Weighted cycles per iteration of %s loop ~= %f\n",
17299 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17300 this_cost.as_double ());
17301 dump_printf_loc (MSG_NOTE, vect_location,
17302 "Weighted cycles per iteration of %s loop ~= %f\n",
17303 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17304 other_cost.as_double ());
17305 }
17306 if (this_cost != other_cost)
17307 {
17308 if (dump_enabled_p ())
17309 dump_printf_loc (MSG_NOTE, vect_location,
17310 "Preferring loop with lower cycles"
17311 " per iteration\n");
17312 return this_cost < other_cost;
17313 }
17314
17315 /* If the issue rate of SVE code is limited by predicate operations
17316 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17317 and if Advanced SIMD code could issue within the limit imposed
17318 by the predicate operations, the predicate operations are adding an
17319 overhead that the original code didn't have and so we should prefer
17320 the Advanced SIMD version. */
17321 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17322 const aarch64_vec_op_count &b) -> bool
17323 {
17324 if (a.pred_ops == 0
17325 && (b.min_pred_cycles_per_iter ()
17326 > b.min_nonpred_cycles_per_iter ()))
17327 {
17328 if (dump_enabled_p ())
17329 dump_printf_loc (MSG_NOTE, vect_location,
17330 "Preferring Advanced SIMD loop since"
17331 " SVE loop is predicate-limited\n");
17332 return true;
17333 }
17334 return false;
17335 };
17336 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17337 return true;
17338 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17339 return false;
17340 }
17341
17342 return vector_costs::better_main_loop_than_p (other);
17343 }
17344
17345 static void initialize_aarch64_code_model (struct gcc_options *);
17346
17347 /* Parse the TO_PARSE string and put the architecture struct that it
17348 selects into RES and the architectural features into ISA_FLAGS.
17349 Return an aarch64_parse_opt_result describing the parse result.
17350 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17351 When the TO_PARSE string contains an invalid extension,
17352 a copy of the string is created and stored to INVALID_EXTENSION. */
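/* For example, for "armv8.2-a+crc" everything before the first '+' is
   matched against the architecture names below and the remaining "+crc"
   is handed to aarch64_parse_extension.  */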
17353
17354 static enum aarch64_parse_opt_result
17355 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17356 aarch64_feature_flags *isa_flags,
17357 std::string *invalid_extension)
17358 {
17359 const char *ext;
17360 const struct processor *arch;
17361 size_t len;
17362
17363 ext = strchr (to_parse, '+');
17364
17365 if (ext != NULL)
17366 len = ext - to_parse;
17367 else
17368 len = strlen (to_parse);
17369
17370 if (len == 0)
17371 return AARCH64_PARSE_MISSING_ARG;
17372
17373
17374 /* Loop through the list of supported ARCHes to find a match. */
17375 for (arch = all_architectures; arch->name != NULL; arch++)
17376 {
17377 if (strlen (arch->name) == len
17378 && strncmp (arch->name, to_parse, len) == 0)
17379 {
17380 auto isa_temp = arch->flags;
17381
17382 if (ext != NULL)
17383 {
17384 /* TO_PARSE string contains at least one extension. */
17385 enum aarch64_parse_opt_result ext_res
17386 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17387
17388 if (ext_res != AARCH64_PARSE_OK)
17389 return ext_res;
17390 }
17391 /* Extension parsing was successful.  Record the resulting
17392 arch and ISA flags. */
17393 *res = arch;
17394 *isa_flags = isa_temp;
17395 return AARCH64_PARSE_OK;
17396 }
17397 }
17398
17399 /* ARCH name not found in list. */
17400 return AARCH64_PARSE_INVALID_ARG;
17401 }
17402
17403 /* Parse the TO_PARSE string and put the cpu struct that it selects into
17404 RES and the architectural features into ISA_FLAGS.  Return an aarch64_parse_opt_result
17405 describing the parse result. If there is an error parsing, RES and
17406 ISA_FLAGS are left unchanged.
17407 When the TO_PARSE string contains an invalid extension,
17408 a copy of the string is created and stored to INVALID_EXTENSION. */
17409
17410 static enum aarch64_parse_opt_result
17411 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17412 aarch64_feature_flags *isa_flags,
17413 std::string *invalid_extension)
17414 {
17415 const char *ext;
17416 const struct processor *cpu;
17417 size_t len;
17418
17419 ext = strchr (to_parse, '+');
17420
17421 if (ext != NULL)
17422 len = ext - to_parse;
17423 else
17424 len = strlen (to_parse);
17425
17426 if (len == 0)
17427 return AARCH64_PARSE_MISSING_ARG;
17428
17429
17430 /* Loop through the list of supported CPUs to find a match. */
17431 for (cpu = all_cores; cpu->name != NULL; cpu++)
17432 {
17433 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17434 {
17435 auto isa_temp = cpu->flags;
17436
17437 if (ext != NULL)
17438 {
17439 /* TO_PARSE string contains at least one extension. */
17440 enum aarch64_parse_opt_result ext_res
17441 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17442
17443 if (ext_res != AARCH64_PARSE_OK)
17444 return ext_res;
17445 }
17446 /* Extension parsing was successful.  Record the resulting
17447 cpu and ISA flags. */
17448 *res = cpu;
17449 *isa_flags = isa_temp;
17450 return AARCH64_PARSE_OK;
17451 }
17452 }
17453
17454 /* CPU name not found in list. */
17455 return AARCH64_PARSE_INVALID_ARG;
17456 }
17457
17458 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17459 Return an aarch64_parse_opt_result describing the parse result.
17460 If the parsing fails, RES is not changed. */
17461
17462 static enum aarch64_parse_opt_result
17463 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17464 {
17465 const struct processor *cpu;
17466
17467 /* Loop through the list of supported CPUs to find a match. */
17468 for (cpu = all_cores; cpu->name != NULL; cpu++)
17469 {
17470 if (strcmp (cpu->name, to_parse) == 0)
17471 {
17472 *res = cpu;
17473 return AARCH64_PARSE_OK;
17474 }
17475 }
17476
17477 /* CPU name not found in list. */
17478 return AARCH64_PARSE_INVALID_ARG;
17479 }
17480
17481 /* Parse TOKEN, which has length LENGTH, to see if it is an option
17482 described in FLAG.  If it is, return the corresponding flag bits.
17483 If not, report an error mentioning OPTION_NAME and return zero. */
17484
17485 static unsigned int
17486 aarch64_parse_one_option_token (const char *token,
17487 size_t length,
17488 const struct aarch64_flag_desc *flag,
17489 const char *option_name)
17490 {
17491 for (; flag->name != NULL; flag++)
17492 {
17493 if (length == strlen (flag->name)
17494 && !strncmp (flag->name, token, length))
17495 return flag->flag;
17496 }
17497
17498 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17499 return 0;
17500 }
17501
17502 /* Parse OPTION, which is a '.'-separated list of flags to enable.
17503 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17504 default state we inherit from the CPU tuning structures.  OPTION_NAME
17505 gives the top-level option we are parsing in the -moverride string,
17506 for use in error messages. */
17507
17508 static unsigned int
17509 aarch64_parse_boolean_options (const char *option,
17510 const struct aarch64_flag_desc *flags,
17511 unsigned int initial_state,
17512 const char *option_name)
17513 {
17514 const char separator = '.';
17515 const char* specs = option;
17516 const char* ntoken = option;
17517 unsigned int found_flags = initial_state;
17518
17519 while ((ntoken = strchr (specs, separator)))
17520 {
17521 size_t token_length = ntoken - specs;
17522 unsigned token_ops = aarch64_parse_one_option_token (specs,
17523 token_length,
17524 flags,
17525 option_name);
17526 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17527 in the token stream, reset the supported operations. So:
17528
17529 adrp+add.cmp+branch.none.adrp+add
17530
17531 would have the result of turning on only adrp+add fusion. */
17532 if (!token_ops)
17533 found_flags = 0;
17534
17535 found_flags |= token_ops;
17536 specs = ++ntoken;
17537 }
17538
17539 /* The string ended with the separator, so the final token is empty; diagnose the ill-formed option string. */
17540 if (!(*specs))
17541 {
17542 error ("%qs string ill-formed", option_name);
17543 return 0;
17544 }
17545
17546 /* We still have one more token to parse. */
17547 size_t token_length = strlen (specs);
17548 unsigned token_ops = aarch64_parse_one_option_token (specs,
17549 token_length,
17550 flags,
17551 option_name);
17552 if (!token_ops)
17553 found_flags = 0;
17554
17555 found_flags |= token_ops;
17556 return found_flags;
17557 }
17558
17559 /* Support for overriding instruction fusion. */
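/* For example, -moverride=fuse=cmp+branch adds that fusion pair to the
   core's default fusible_ops, while -moverride=fuse=none.cmp+branch
   replaces the defaults with just that pair (see the worked example in
   aarch64_parse_boolean_options).  */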
17560
17561 static void
17562 aarch64_parse_fuse_string (const char *fuse_string,
17563 struct tune_params *tune)
17564 {
17565 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
17566 aarch64_fusible_pairs,
17567 tune->fusible_ops,
17568 "fuse=");
17569 }
17570
17571 /* Support for overriding other tuning flags. */
17572
17573 static void
17574 aarch64_parse_tune_string (const char *tune_string,
17575 struct tune_params *tune)
17576 {
17577 tune->extra_tuning_flags
17578 = aarch64_parse_boolean_options (tune_string,
17579 aarch64_tuning_flags,
17580 tune->extra_tuning_flags,
17581 "tune=");
17582 }
17583
17584 /* Parse the sve_width tuning moverride string in TUNE_STRING.
17585 Accept the valid SVE vector widths allowed by
17586 aarch64_sve_vector_bits_enum and use it to override sve_width
17587 in TUNE. */
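/* For example, -moverride=sve_width=256 overrides sve_width in the tuning
   structure with SVE_256; widths other than 128, 256, 512, 1024 or 2048
   are rejected with an error.  */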
17588
17589 static void
17590 aarch64_parse_sve_width_string (const char *tune_string,
17591 struct tune_params *tune)
17592 {
17593 int width = -1;
17594
17595 int n = sscanf (tune_string, "%d", &width);
17596 if (n == EOF)
17597 {
17598 error ("invalid format for %<sve_width%>");
17599 return;
17600 }
17601 switch (width)
17602 {
17603 case SVE_128:
17604 case SVE_256:
17605 case SVE_512:
17606 case SVE_1024:
17607 case SVE_2048:
17608 break;
17609 default:
17610 error ("invalid %<sve_width%> value: %d", width);
17611 }
17612 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
17613 }
17614
17615 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
17616 we understand.  If it is, extract the option string and hand it off to
17617 the appropriate function. */
17618
17619 void
17620 aarch64_parse_one_override_token (const char* token,
17621 size_t length,
17622 struct tune_params *tune)
17623 {
17624 const struct aarch64_tuning_override_function *fn
17625 = aarch64_tuning_override_functions;
17626
17627 const char *option_part = strchr (token, '=');
17628 if (!option_part)
17629 {
17630 error ("tuning string missing in option (%s)", token);
17631 return;
17632 }
17633
17634 /* Get the length of the option name. */
17635 length = option_part - token;
17636 /* Skip the '=' to get to the option string. */
17637 option_part++;
17638
17639 for (; fn->name != NULL; fn++)
17640 {
17641 if (!strncmp (fn->name, token, length))
17642 {
17643 fn->parse_override (option_part, tune);
17644 return;
17645 }
17646 }
17647
17648 error ("unknown tuning option (%s)", token);
17649 return;
17650 }
17651
17652 /* Set the default TLS size and clamp it to the maximum supported by the selected code model. */
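/* For example, under the small code model an explicit -mtls-size=48 is
   clamped to 32, since the small model only supports 4G of TLS.  */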
17653
17654 static void
17655 initialize_aarch64_tls_size (struct gcc_options *opts)
17656 {
17657 if (aarch64_tls_size == 0)
17658 aarch64_tls_size = 24;
17659
17660 switch (opts->x_aarch64_cmodel_var)
17661 {
17662 case AARCH64_CMODEL_TINY:
17663 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
17664 needs two instructions to address, so we clamp the size to 24. */
17665 if (aarch64_tls_size > 24)
17666 aarch64_tls_size = 24;
17667 break;
17668 case AARCH64_CMODEL_SMALL:
17669 /* The maximum TLS size allowed under small is 4G. */
17670 if (aarch64_tls_size > 32)
17671 aarch64_tls_size = 32;
17672 break;
17673 case AARCH64_CMODEL_LARGE:
17674 /* The maximum TLS size allowed under large is 16E.
17675 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset for now. */
17676 if (aarch64_tls_size > 48)
17677 aarch64_tls_size = 48;
17678 break;
17679 default:
17680 gcc_unreachable ();
17681 }
17682
17683 return;
17684 }
17685
17686 /* Return the CPU corresponding to the enum CPU. */
17687
17688 static const struct processor *
17689 aarch64_get_tune_cpu (enum aarch64_processor cpu)
17690 {
17691 gcc_assert (cpu != aarch64_none);
17692
17693 return &all_cores[cpu];
17694 }
17695
17696 /* Return the architecture corresponding to the enum ARCH. */
17697
17698 static const struct processor *
17699 aarch64_get_arch (enum aarch64_arch arch)
17700 {
17701 gcc_assert (arch != aarch64_no_arch);
17702
17703 return &all_architectures[arch];
17704 }
17705
17706 /* Parse STRING looking for options in the format:
17707 string :: option:string
17708 option :: name=substring
17709 name :: {a-z}
17710 substring :: defined by option. */
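/* For example, the -moverride string "fuse=none.cmp+branch:sve_width=512"
   contains two options: the first adjusts the fusible operations and the
   second overrides the assumed SVE vector width.  */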
17711
17712 static void
17713 aarch64_parse_override_string (const char* input_string,
17714 struct tune_params* tune)
17715 {
17716 const char separator = ':';
17717 size_t string_length = strlen (input_string) + 1;
17718 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
17719 char *string = string_root;
17720 strncpy (string, input_string, string_length);
17721 string[string_length - 1] = '\0';
17722
17723 char* ntoken = string;
17724
17725 while ((ntoken = strchr (string, separator)))
17726 {
17727 size_t token_length = ntoken - string;
17728 /* Make this substring look like a string. */
17729 *ntoken = '\0';
17730 aarch64_parse_one_override_token (string, token_length, tune);
17731 string = ++ntoken;
17732 }
17733
17734 /* One last option to parse. */
17735 aarch64_parse_one_override_token (string, strlen (string), tune);
17736 free (string_root);
17737 }
17738
17739 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
17740 are best for a generic target with the currently-enabled architecture
17741 extensions. */
17742 static void
17743 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
17744 {
17745 /* Neoverse V1 is the only core that is known to benefit from
17746 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
17747 point enabling it for SVE2 and above. */
17748 if (TARGET_SVE2)
17749 current_tune.extra_tuning_flags
17750 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
17751 }
17752
17753 static void
17754 aarch64_override_options_after_change_1 (struct gcc_options *opts)
17755 {
17756 if (accepted_branch_protection_string)
17757 {
17758 opts->x_aarch64_branch_protection_string
17759 = xstrdup (accepted_branch_protection_string);
17760 }
17761
17762 /* PR 70044: We have to be careful about being called multiple times for the
17763 same function. This means all changes should be repeatable. */
17764
17765 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
17766 Disable the frame pointer flag so the mid-end will not use a frame
17767 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
17768 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
17769 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
17770 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
17771 if (opts->x_flag_omit_frame_pointer == 0)
17772 opts->x_flag_omit_frame_pointer = 2;
17773
17774 /* If not optimizing for size, set the default
17775 alignment to what the target wants. */
17776 if (!opts->x_optimize_size)
17777 {
17778 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
17779 opts->x_str_align_loops = aarch64_tune_params.loop_align;
17780 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
17781 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
17782 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
17783 opts->x_str_align_functions = aarch64_tune_params.function_align;
17784 }
17785
17786 /* We default to no pc-relative literal loads. */
17787
17788 aarch64_pcrelative_literal_loads = false;
17789
17790 /* If -mpc-relative-literal-loads is set on the command line, this
17791 implies that the user asked for PC relative literal loads. */
17792 if (opts->x_pcrelative_literal_loads == 1)
17793 aarch64_pcrelative_literal_loads = true;
17794
17795 /* In the tiny memory model it makes no sense to disallow PC relative
17796 literal pool loads. */
17797 if (aarch64_cmodel == AARCH64_CMODEL_TINY
17798 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
17799 aarch64_pcrelative_literal_loads = true;
17800
17801 /* When enabling the lower precision Newton series for the square root, also
17802 enable it for the reciprocal square root, since the latter is an
17803 intermediary step for the former. */
17804 if (flag_mlow_precision_sqrt)
17805 flag_mrecip_low_precision_sqrt = true;
17806 }
17807
17808 /* 'Unpack' the internal tuning structs and update the options
17809 in OPTS. The caller must have set up selected_tune and selected_arch
17810 as all the other target-specific codegen decisions are
17811 derived from them. */
17812
17813 void
17814 aarch64_override_options_internal (struct gcc_options *opts)
17815 {
17816 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
17817 aarch64_tune_flags = tune->flags;
17818 aarch64_tune = tune->sched_core;
17819 /* Make a copy of the tuning parameters attached to the core, which
17820 we may later overwrite. */
17821 aarch64_tune_params = *(tune->tune);
17822 if (tune->tune == &generic_tunings)
17823 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
17824
17825 if (opts->x_aarch64_override_tune_string)
17826 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
17827 &aarch64_tune_params);
17828
17829 /* This target defaults to strict volatile bitfields. */
17830 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
17831 opts->x_flag_strict_volatile_bitfields = 1;
17832
17833 if (aarch64_stack_protector_guard == SSP_GLOBAL
17834 && opts->x_aarch64_stack_protector_guard_offset_str)
17835 {
17836 error ("incompatible options %<-mstack-protector-guard=global%> and "
17837 "%<-mstack-protector-guard-offset=%s%>",
17838 aarch64_stack_protector_guard_offset_str);
17839 }
17840
17841 if (aarch64_stack_protector_guard == SSP_SYSREG
17842 && !(opts->x_aarch64_stack_protector_guard_offset_str
17843 && opts->x_aarch64_stack_protector_guard_reg_str))
17844 {
17845 error ("both %<-mstack-protector-guard-offset%> and "
17846 "%<-mstack-protector-guard-reg%> must be used "
17847 "with %<-mstack-protector-guard=sysreg%>");
17848 }
17849
17850 if (opts->x_aarch64_stack_protector_guard_reg_str)
17851 {
17852 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
17853 error ("specify a system register with a small string length");
17854 }
17855
17856 if (opts->x_aarch64_stack_protector_guard_offset_str)
17857 {
17858 char *end;
17859 const char *str = aarch64_stack_protector_guard_offset_str;
17860 errno = 0;
17861 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
17862 if (!*str || *end || errno)
17863 error ("%qs is not a valid offset in %qs", str,
17864 "-mstack-protector-guard-offset=");
17865 aarch64_stack_protector_guard_offset = offs;
17866 }
17867
17868 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
17869 && !fixed_regs[R18_REGNUM])
17870 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
17871
17872 initialize_aarch64_code_model (opts);
17873 initialize_aarch64_tls_size (opts);
17874
17875 int queue_depth = 0;
17876 switch (aarch64_tune_params.autoprefetcher_model)
17877 {
17878 case tune_params::AUTOPREFETCHER_OFF:
17879 queue_depth = -1;
17880 break;
17881 case tune_params::AUTOPREFETCHER_WEAK:
17882 queue_depth = 0;
17883 break;
17884 case tune_params::AUTOPREFETCHER_STRONG:
17885 queue_depth = max_insn_queue_index + 1;
17886 break;
17887 default:
17888 gcc_unreachable ();
17889 }
17890
17891 /* We don't mind passing in global_options_set here as we don't use
17892 the *options_set structs anyway. */
17893 SET_OPTION_IF_UNSET (opts, &global_options_set,
17894 param_sched_autopref_queue_depth, queue_depth);
17895
17896 /* If only Advanced SIMD is used for autovectorization, disable the SVE
17897 vector cost comparison. */
17898 if (aarch64_autovec_preference == 1)
17899 SET_OPTION_IF_UNSET (opts, &global_options_set,
17900 aarch64_sve_compare_costs, 0);
17901
17902 /* Set up the parameters to be used in the prefetching algorithm.  Do not
17903 override the defaults unless we are tuning for a core we have
17904 researched values for. */
17905 if (aarch64_tune_params.prefetch->num_slots > 0)
17906 SET_OPTION_IF_UNSET (opts, &global_options_set,
17907 param_simultaneous_prefetches,
17908 aarch64_tune_params.prefetch->num_slots);
17909 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
17910 SET_OPTION_IF_UNSET (opts, &global_options_set,
17911 param_l1_cache_size,
17912 aarch64_tune_params.prefetch->l1_cache_size);
17913 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17914 SET_OPTION_IF_UNSET (opts, &global_options_set,
17915 param_l1_cache_line_size,
17916 aarch64_tune_params.prefetch->l1_cache_line_size);
17917
17918 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17919 {
17920 SET_OPTION_IF_UNSET (opts, &global_options_set,
17921 param_destruct_interfere_size,
17922 aarch64_tune_params.prefetch->l1_cache_line_size);
17923 SET_OPTION_IF_UNSET (opts, &global_options_set,
17924 param_construct_interfere_size,
17925 aarch64_tune_params.prefetch->l1_cache_line_size);
17926 }
17927 else
17928 {
17929 /* For a generic AArch64 target, cover the current range of cache line
17930 sizes. */
17931 SET_OPTION_IF_UNSET (opts, &global_options_set,
17932 param_destruct_interfere_size,
17933 256);
17934 SET_OPTION_IF_UNSET (opts, &global_options_set,
17935 param_construct_interfere_size,
17936 64);
17937 }
17938
17939 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
17940 SET_OPTION_IF_UNSET (opts, &global_options_set,
17941 param_l2_cache_size,
17942 aarch64_tune_params.prefetch->l2_cache_size);
17943 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
17944 SET_OPTION_IF_UNSET (opts, &global_options_set,
17945 param_prefetch_dynamic_strides, 0);
17946 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
17947 SET_OPTION_IF_UNSET (opts, &global_options_set,
17948 param_prefetch_minimum_stride,
17949 aarch64_tune_params.prefetch->minimum_stride);
17950
17951 /* Use the alternative scheduling-pressure algorithm by default. */
17952 SET_OPTION_IF_UNSET (opts, &global_options_set,
17953 param_sched_pressure_algorithm,
17954 SCHED_PRESSURE_MODEL);
17955
17956 /* Validate the guard size. */
17957 int guard_size = param_stack_clash_protection_guard_size;
17958
17959 if (guard_size != 12 && guard_size != 16)
17960 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
17961 "size. Given value %d (%llu KB) is out of range",
17962 guard_size, (1ULL << guard_size) / 1024ULL);
17963
17964 /* Enforce that the probing interval is the same as the guard size so the
17965 mid-end does the right thing. */
17966 SET_OPTION_IF_UNSET (opts, &global_options_set,
17967 param_stack_clash_protection_probe_interval,
17968 guard_size);
17969
17970 /* The maybe_set calls won't update the value if the user has explicitly set
17971 one, which means we need to validate that the probing interval and the
17972 guard size are equal. */
17973 int probe_interval
17974 = param_stack_clash_protection_probe_interval;
17975 if (guard_size != probe_interval)
17976 error ("stack clash guard size %<%d%> must be equal to probing interval "
17977 "%<%d%>", guard_size, probe_interval);
17978
17979 /* Enable software prefetching at the specified optimization level for
17980 CPUs that have prefetch.  Lower the optimization level threshold by 1
17981 when profiling is enabled. */
17982 if (opts->x_flag_prefetch_loop_arrays < 0
17983 && !opts->x_optimize_size
17984 && aarch64_tune_params.prefetch->default_opt_level >= 0
17985 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
17986 opts->x_flag_prefetch_loop_arrays = 1;
17987
17988 aarch64_override_options_after_change_1 (opts);
17989 }
17990
17991 /* Print a hint with a suggestion for a core or architecture name that
17992 most closely resembles what the user passed in STR. ARCH is true if
17993 the user is asking for an architecture name. ARCH is false if the user
17994 is asking for a core name. */
17995
17996 static void
17997 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
17998 {
17999 auto_vec<const char *> candidates;
18000 const struct processor *entry = arch ? all_architectures : all_cores;
18001 for (; entry->name != NULL; entry++)
18002 candidates.safe_push (entry->name);
18003
18004 #ifdef HAVE_LOCAL_CPU_DETECT
18005 /* Also add "native" as a possible value. */
18006 if (arch)
18007 candidates.safe_push ("native");
18008 #endif
18009
18010 char *s;
18011 const char *hint = candidates_list_and_hint (str, s, candidates);
18012 if (hint)
18013 inform (input_location, "valid arguments are: %s;"
18014 " did you mean %qs?", s, hint);
18015 else
18016 inform (input_location, "valid arguments are: %s", s);
18017
18018 XDELETEVEC (s);
18019 }
18020
18021 /* Print a hint with a suggestion for a core name that most closely resembles
18022 what the user passed in STR. */
18023
18024 inline static void
18025 aarch64_print_hint_for_core (const char *str)
18026 {
18027 aarch64_print_hint_for_core_or_arch (str, false);
18028 }
18029
18030 /* Print a hint with a suggestion for an architecture name that most closely
18031 resembles what the user passed in STR. */
18032
18033 inline static void
18034 aarch64_print_hint_for_arch (const char *str)
18035 {
18036 aarch64_print_hint_for_core_or_arch (str, true);
18037 }
18038
18039
18040 /* Print a hint with a suggestion for an extension name
18041 that most closely resembles what the user passed in STR. */
18042
18043 void
18044 aarch64_print_hint_for_extensions (const std::string &str)
18045 {
18046 auto_vec<const char *> candidates;
18047 aarch64_get_all_extension_candidates (&candidates);
18048 char *s;
18049 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
18050 if (hint)
18051 inform (input_location, "valid arguments are: %s;"
18052 " did you mean %qs?", s, hint);
18053 else
18054 inform (input_location, "valid arguments are: %s", s);
18055
18056 XDELETEVEC (s);
18057 }
18058
18059 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
18060 specified in STR and throw errors if appropriate.  Put the results,
18061 if they are valid, in RES and ISA_FLAGS.  Return whether the option is
18062 valid. */
18063
18064 static bool
18065 aarch64_validate_mcpu (const char *str, const struct processor **res,
18066 aarch64_feature_flags *isa_flags)
18067 {
18068 std::string invalid_extension;
18069 enum aarch64_parse_opt_result parse_res
18070 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18071
18072 if (parse_res == AARCH64_PARSE_OK)
18073 return true;
18074
18075 switch (parse_res)
18076 {
18077 case AARCH64_PARSE_MISSING_ARG:
18078 error ("missing cpu name in %<-mcpu=%s%>", str);
18079 break;
18080 case AARCH64_PARSE_INVALID_ARG:
18081 error ("unknown value %qs for %<-mcpu%>", str);
18082 aarch64_print_hint_for_core (str);
18083 break;
18084 case AARCH64_PARSE_INVALID_FEATURE:
18085 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
18086 invalid_extension.c_str (), str);
18087 aarch64_print_hint_for_extensions (invalid_extension);
18088 break;
18089 default:
18090 gcc_unreachable ();
18091 }
18092
18093 return false;
18094 }
18095
18096 /* Straight line speculation indicators. */
18097 enum aarch64_sls_hardening_type
18098 {
18099 SLS_NONE = 0,
18100 SLS_RETBR = 1,
18101 SLS_BLR = 2,
18102 SLS_ALL = 3,
18103 };
18104 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
18105
18106 /* Return whether we should mitigate Straight Line Speculation for the RET
18107 and BR instructions. */
18108 bool
18109 aarch64_harden_sls_retbr_p (void)
18110 {
18111 return aarch64_sls_hardening & SLS_RETBR;
18112 }
18113
18114 /* Return whether we should mitigate Straight Line Speculation for the BLR
18115 instruction. */
18116 bool
18117 aarch64_harden_sls_blr_p (void)
18118 {
18119 return aarch64_sls_hardening & SLS_BLR;
18120 }
18121
18122 /* For now we only allow setting these options globally; in the future we
18123 may allow setting them per function. */
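/* For example, -mharden-sls=retbr,blr requests both mitigations and is
   equivalent to -mharden-sls=all; "none" and "all" must appear on their
   own rather than in a comma-separated list.  */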
18124 static void
18125 aarch64_validate_sls_mitigation (const char *const_str)
18126 {
18127 char *token_save = NULL;
18128 char *str = NULL;
18129
18130 if (strcmp (const_str, "none") == 0)
18131 {
18132 aarch64_sls_hardening = SLS_NONE;
18133 return;
18134 }
18135 if (strcmp (const_str, "all") == 0)
18136 {
18137 aarch64_sls_hardening = SLS_ALL;
18138 return;
18139 }
18140
18141 char *str_root = xstrdup (const_str);
18142 str = strtok_r (str_root, ",", &token_save);
18143 if (!str)
18144 error ("invalid argument given to %<-mharden-sls=%>");
18145
18146 int temp = SLS_NONE;
18147 while (str)
18148 {
18149 if (strcmp (str, "blr") == 0)
18150 temp |= SLS_BLR;
18151 else if (strcmp (str, "retbr") == 0)
18152 temp |= SLS_RETBR;
18153 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18154 {
18155 error ("%qs must be by itself for %<-mharden-sls=%>", str);
18156 break;
18157 }
18158 else
18159 {
18160 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
18161 break;
18162 }
18163 str = strtok_r (NULL, ",", &token_save);
18164 }
18165 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18166 free (str_root);
18167 }
18168
18169 /* Parse CONST_STR for the branch protection features specified in
18170 aarch64_branch_protect_types and set any global variables required.
18171 Return the parsing result and assign LAST_STR to the last processed
18172 token from CONST_STR so that it can be used for error reporting. */
18173
18174 static enum aarch64_parse_opt_result
18175 aarch64_parse_branch_protection (const char *const_str,
18176 char **last_str)
18177 {
18178 char *str_root = xstrdup (const_str);
18179 char* token_save = NULL;
18180 char *str = strtok_r (str_root, "+", &token_save);
18181 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
18182 if (!str)
18183 res = AARCH64_PARSE_MISSING_ARG;
18184 else
18185 {
18186 char *next_str = strtok_r (NULL, "+", &token_save);
18187 /* Reset the branch protection features to their defaults. */
18188 aarch64_handle_no_branch_protection (NULL, NULL);
18189
18190 while (str && res == AARCH64_PARSE_OK)
18191 {
18192 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
18193 bool found = false;
18194 /* Search for this type. */
18195 while (type && type->name && !found && res == AARCH64_PARSE_OK)
18196 {
18197 if (strcmp (str, type->name) == 0)
18198 {
18199 found = true;
18200 res = type->handler (str, next_str);
18201 str = next_str;
18202 next_str = strtok_r (NULL, "+", &token_save);
18203 }
18204 else
18205 type++;
18206 }
18207 if (found && res == AARCH64_PARSE_OK)
18208 {
18209 bool found_subtype = true;
18210 /* Loop through each token until we find one that isn't a
18211 subtype. */
18212 while (found_subtype)
18213 {
18214 found_subtype = false;
18215 const aarch64_branch_protect_type *subtype = type->subtypes;
18216 /* Search for the subtype. */
18217 while (str && subtype && subtype->name && !found_subtype
18218 && res == AARCH64_PARSE_OK)
18219 {
18220 if (strcmp (str, subtype->name) == 0)
18221 {
18222 found_subtype = true;
18223 res = subtype->handler (str, next_str);
18224 str = next_str;
18225 next_str = strtok_r (NULL, "+", &token_save);
18226 }
18227 else
18228 subtype++;
18229 }
18230 }
18231 }
18232 else if (!found)
18233 res = AARCH64_PARSE_INVALID_ARG;
18234 }
18235 }
18236 /* Copy the last processed token into the argument to pass it back.
18237 Used by option and attribute validation to print the offending token. */
18238 if (last_str)
18239 {
18240 if (str) strcpy (*last_str, str);
18241 else *last_str = NULL;
18242 }
18243 if (res == AARCH64_PARSE_OK)
18244 {
18245 /* If needed, alloc the accepted string then copy in const_str.
18246 Used by override_option_after_change_1. */
18247 if (!accepted_branch_protection_string)
18248 accepted_branch_protection_string = (char *) xmalloc (
18249 BRANCH_PROTECT_STR_MAX
18250 + 1);
18251 strncpy (accepted_branch_protection_string, const_str,
18252 BRANCH_PROTECT_STR_MAX + 1);
18253 /* Forcibly null-terminate. */
18254 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
18255 }
18256 return res;
18257 }
18258
18259 static bool
18260 aarch64_validate_mbranch_protection (const char *const_str)
18261 {
18262 char *str = (char *) xmalloc (strlen (const_str) + 1);
18263 enum aarch64_parse_opt_result res =
18264 aarch64_parse_branch_protection (const_str, &str);
18265 if (res == AARCH64_PARSE_INVALID_ARG)
18266 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
18267 else if (res == AARCH64_PARSE_MISSING_ARG)
18268 error ("missing argument for %<-mbranch-protection=%>");
18269 free (str);
18270 return res == AARCH64_PARSE_OK;
18271 }
18272
18273 /* Validate a command-line -march option. Parse the arch and extensions
18274 (if any) specified in STR and throw errors if appropriate. Put the
18275 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18276 option is valid. */
18277
18278 static bool
18279 aarch64_validate_march (const char *str, const struct processor **res,
18280 aarch64_feature_flags *isa_flags)
18281 {
18282 std::string invalid_extension;
18283 enum aarch64_parse_opt_result parse_res
18284 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18285
18286 if (parse_res == AARCH64_PARSE_OK)
18287 return true;
18288
18289 switch (parse_res)
18290 {
18291 case AARCH64_PARSE_MISSING_ARG:
18292 error ("missing arch name in %<-march=%s%>", str);
18293 break;
18294 case AARCH64_PARSE_INVALID_ARG:
18295 error ("unknown value %qs for %<-march%>", str);
18296 aarch64_print_hint_for_arch (str);
18297 /* A common user error is confusing -march and -mcpu.
18298 If the -march string matches a known CPU suggest -mcpu. */
18299 parse_res = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18300 if (parse_res == AARCH64_PARSE_OK)
18301 inform (input_location, "did you mean %<-mcpu=%s%>?", str);
18302 break;
18303 case AARCH64_PARSE_INVALID_FEATURE:
18304 error ("invalid feature modifier %qs in %<-march=%s%>",
18305 invalid_extension.c_str (), str);
18306 aarch64_print_hint_for_extensions (invalid_extension);
18307 break;
18308 default:
18309 gcc_unreachable ();
18310 }
18311
18312 return false;
18313 }
18314
18315 /* Validate a command-line -mtune option. Parse the cpu
18316 specified in STR and throw errors if appropriate. Put the
18317 result, if it is valid, in RES. Return whether the option is
18318 valid. */
18319
18320 static bool
18321 aarch64_validate_mtune (const char *str, const struct processor **res)
18322 {
18323 enum aarch64_parse_opt_result parse_res
18324 = aarch64_parse_tune (str, res);
18325
18326 if (parse_res == AARCH64_PARSE_OK)
18327 return true;
18328
18329 switch (parse_res)
18330 {
18331 case AARCH64_PARSE_MISSING_ARG:
18332 error ("missing cpu name in %<-mtune=%s%>", str);
18333 break;
18334 case AARCH64_PARSE_INVALID_ARG:
18335 error ("unknown value %qs for %<-mtune%>", str);
18336 aarch64_print_hint_for_core (str);
18337 break;
18338 default:
18339 gcc_unreachable ();
18340 }
18341 return false;
18342 }
18343
18344 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
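/* For example, -msve-vector-bits=256 gives a VG of 256 / 64 = 4, whereas
   -msve-vector-bits=scalable (or 128 on big-endian targets) gives the
   runtime-variable poly_uint16 (2, 2) computed below.  */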
18345
18346 static poly_uint16
18347 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18348 {
18349 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18350 on big-endian targets, so we would need to forbid subregs that convert
18351 from one to the other. By default a reinterpret sequence would then
18352 involve a store to memory in one mode and a load back in the other.
18353 Even if we optimize that sequence using reverse instructions,
18354 it would still be a significant potential overhead.
18355
18356 For now, it seems better to generate length-agnostic code for that
18357 case instead. */
18358 if (value == SVE_SCALABLE
18359 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18360 return poly_uint16 (2, 2);
18361 else
18362 return (int) value / 64;
18363 }
18364
18365 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18366 aarch64_isa_flags accordingly. */
18367
18368 void
18369 aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18370 {
18371 aarch64_set_asm_isa_flags (&global_options, flags);
18372 }
18373
18374 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18375 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18376 tuning structs. In particular it must set selected_tune and
18377 aarch64_asm_isa_flags that define the available ISA features and tuning
18378 decisions. It must also set selected_arch as this will be used to
18379 output the .arch asm tags for each function. */
18380
18381 static void
18382 aarch64_override_options (void)
18383 {
18384 aarch64_feature_flags cpu_isa = 0;
18385 aarch64_feature_flags arch_isa = 0;
18386 aarch64_set_asm_isa_flags (0);
18387
18388 const struct processor *cpu = NULL;
18389 const struct processor *arch = NULL;
18390 const struct processor *tune = NULL;
18391
18392 if (aarch64_harden_sls_string)
18393 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18394
18395 if (aarch64_branch_protection_string)
18396 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
18397
18398 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18399 If either of -march or -mtune is given, they override their
18400 respective component of -mcpu. */
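/* For instance, "-mcpu=cortex-a53 -march=armv8.2-a" warns about the
   mismatch, compiles for the armv8.2-a ISA and keeps cortex-a53
   tuning.  */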
18401 if (aarch64_cpu_string)
18402 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18403
18404 if (aarch64_arch_string)
18405 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18406
18407 if (aarch64_tune_string)
18408 aarch64_validate_mtune (aarch64_tune_string, &tune);
18409
18410 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18411 SUBTARGET_OVERRIDE_OPTIONS;
18412 #endif
18413
18414 if (cpu && arch)
18415 {
18416 /* If both -mcpu and -march are specified, warn if they are not
18417 architecturally compatible and prefer the -march ISA flags. */
18418 if (arch->arch != cpu->arch)
18419 {
18420 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
18421 aarch64_cpu_string,
18422 aarch64_arch_string);
18423 }
18424
18425 selected_arch = arch->arch;
18426 aarch64_set_asm_isa_flags (arch_isa);
18427 }
18428 else if (cpu)
18429 {
18430 selected_arch = cpu->arch;
18431 aarch64_set_asm_isa_flags (cpu_isa);
18432 }
18433 else if (arch)
18434 {
18435 cpu = &all_cores[arch->ident];
18436 selected_arch = arch->arch;
18437 aarch64_set_asm_isa_flags (arch_isa);
18438 }
18439 else
18440 {
18441 /* No -mcpu or -march specified, so use the default CPU. */
18442 cpu = &all_cores[TARGET_CPU_DEFAULT];
18443 selected_arch = cpu->arch;
18444 aarch64_set_asm_isa_flags (cpu->flags);
18445 }
18446
18447 selected_tune = tune ? tune->ident : cpu->ident;
18448
18449 if (aarch64_enable_bti == 2)
18450 {
18451 #ifdef TARGET_ENABLE_BTI
18452 aarch64_enable_bti = 1;
18453 #else
18454 aarch64_enable_bti = 0;
18455 #endif
18456 }
18457
18458 /* Return address signing is currently not supported for ILP32 targets. For
18459 LP64 targets use the configured option in the absence of a command-line
18460 option for -mbranch-protection. */
18461 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
18462 {
18463 #ifdef TARGET_ENABLE_PAC_RET
18464 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
18465 #else
18466 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
18467 #endif
18468 }
18469
18470 #ifndef HAVE_AS_MABI_OPTION
18471 /* The compiler may have been configured with 2.23.* binutils, which does
18472 not have support for ILP32. */
18473 if (TARGET_ILP32)
18474 error ("assembler does not support %<-mabi=ilp32%>");
18475 #endif
18476
18477 /* Convert -msve-vector-bits to a VG count. */
18478 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18479
18480 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
18481 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18482
18483 /* The pass to insert speculation tracking runs before
18484 shrink-wrapping and the latter does not know how to update the
18485 tracking status. So disable it in this case. */
18486 if (aarch64_track_speculation)
18487 flag_shrink_wrap = 0;
18488
18489 aarch64_override_options_internal (&global_options);
18490
18491 /* Save these options as the default ones in case we push and pop them later
18492 while processing functions with potential target attributes. */
18493 target_option_default_node = target_option_current_node
18494 = build_target_option_node (&global_options, &global_options_set);
18495 }
18496
18497 /* Implement targetm.override_options_after_change. */
18498
18499 static void
18500 aarch64_override_options_after_change (void)
18501 {
18502 aarch64_override_options_after_change_1 (&global_options);
18503 }
18504
18505 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18506 static char *
18507 aarch64_offload_options (void)
18508 {
18509 if (TARGET_ILP32)
18510 return xstrdup ("-foffload-abi=ilp32");
18511 else
18512 return xstrdup ("-foffload-abi=lp64");
18513 }
18514
18515 static struct machine_function *
18516 aarch64_init_machine_status (void)
18517 {
18518 struct machine_function *machine;
18519 machine = ggc_cleared_alloc<machine_function> ();
18520 return machine;
18521 }
18522
18523 void
18524 aarch64_init_expanders (void)
18525 {
18526 init_machine_status = aarch64_init_machine_status;
18527 }
18528
18529 /* Set aarch64_cmodel from the code model selected in OPTS, adjusting it for PIC and diagnosing unsupported combinations. */
18530 static void
18531 initialize_aarch64_code_model (struct gcc_options *opts)
18532 {
18533 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18534 switch (opts->x_aarch64_cmodel_var)
18535 {
18536 case AARCH64_CMODEL_TINY:
18537 if (opts->x_flag_pic)
18538 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18539 break;
18540 case AARCH64_CMODEL_SMALL:
18541 if (opts->x_flag_pic)
18542 {
18543 #ifdef HAVE_AS_SMALL_PIC_RELOCS
18544 aarch64_cmodel = (flag_pic == 2
18545 ? AARCH64_CMODEL_SMALL_PIC
18546 : AARCH64_CMODEL_SMALL_SPIC);
18547 #else
18548 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
18549 #endif
18550 }
18551 break;
18552 case AARCH64_CMODEL_LARGE:
18553 if (opts->x_flag_pic)
18554 sorry ("code model %qs with %<-f%s%>", "large",
18555 opts->x_flag_pic > 1 ? "PIC" : "pic");
18556 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
18557 sorry ("code model %qs not supported in ilp32 mode", "large");
18558 break;
18559 case AARCH64_CMODEL_TINY_PIC:
18560 case AARCH64_CMODEL_SMALL_PIC:
18561 case AARCH64_CMODEL_SMALL_SPIC:
18562 gcc_unreachable ();
18563 }
18564 }
18565
18566 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
18567 using the information saved in PTR. */
18568
18569 static void
18570 aarch64_option_restore (struct gcc_options *opts,
18571 struct gcc_options * /* opts_set */,
18572 struct cl_target_option * /* ptr */)
18573 {
18574 aarch64_override_options_internal (opts);
18575 }
18576
18577 /* Implement TARGET_OPTION_PRINT. */
18578
18579 static void
18580 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
18581 {
18582 const struct processor *cpu
18583 = aarch64_get_tune_cpu (ptr->x_selected_tune);
18584 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
18585 std::string extension
18586 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_asm_isa_flags,
18587 arch->flags);
18588
18589 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
18590 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
18591 arch->name, extension.c_str ());
18592 }
18593
18594 static GTY(()) tree aarch64_previous_fndecl;
18595
18596 void
18597 aarch64_reset_previous_fndecl (void)
18598 {
18599 aarch64_previous_fndecl = NULL;
18600 }
18601
18602 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
18603 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
18604 make sure optab availability predicates are recomputed when necessary. */
18605
18606 void
18607 aarch64_save_restore_target_globals (tree new_tree)
18608 {
18609 if (TREE_TARGET_GLOBALS (new_tree))
18610 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
18611 else if (new_tree == target_option_default_node)
18612 restore_target_globals (&default_target_globals);
18613 else
18614 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
18615 }
18616
18617 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
18618 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
18619 of the function, if such exists. This function may be called multiple
18620 times on a single function so use aarch64_previous_fndecl to avoid
18621 setting up identical state. */
18622
18623 static void
18624 aarch64_set_current_function (tree fndecl)
18625 {
18626 if (!fndecl || fndecl == aarch64_previous_fndecl)
18627 return;
18628
18629 tree old_tree = (aarch64_previous_fndecl
18630 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
18631 : NULL_TREE);
18632
18633 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18634
18635 /* If current function has no attributes but the previous one did,
18636 use the default node. */
18637 if (!new_tree && old_tree)
18638 new_tree = target_option_default_node;
18639
18640 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
18641 the default have been handled by aarch64_save_restore_target_globals from
18642 aarch64_pragma_target_parse. */
18643 if (old_tree == new_tree)
18644 return;
18645
18646 aarch64_previous_fndecl = fndecl;
18647
18648 /* First set the target options. */
18649 cl_target_option_restore (&global_options, &global_options_set,
18650 TREE_TARGET_OPTION (new_tree));
18651
18652 aarch64_save_restore_target_globals (new_tree);
18653 }
18654
18655 /* Enum describing the various ways we can handle attributes.
18656 In many cases we can reuse the generic option handling machinery. */
18657
18658 enum aarch64_attr_opt_type
18659 {
18660 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
18661 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
18662 aarch64_attr_enum, /* Attribute sets an enum variable. */
18663 aarch64_attr_custom /* Attribute requires a custom handling function. */
18664 };
18665
18666 /* All the information needed to handle a target attribute.
18667 NAME is the name of the attribute.
18668 ATTR_TYPE specifies the type of behavior of the attribute as described
18669 in the definition of enum aarch64_attr_opt_type.
18670 ALLOW_NEG is true if the attribute supports a "no-" form.
18671 HANDLER is the function that takes the attribute string as an argument.
18672 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
18673 OPT_NUM is the enum specifying the option that the attribute modifies.
18674 This is needed for attributes that mirror the behavior of a command-line
18675 option, that is, attributes with ATTR_TYPE aarch64_attr_mask,
18676 aarch64_attr_bool or aarch64_attr_enum. */
18677
18678 struct aarch64_attribute_info
18679 {
18680 const char *name;
18681 enum aarch64_attr_opt_type attr_type;
18682 bool allow_neg;
18683 bool (*handler) (const char *);
18684 enum opt_code opt_num;
18685 };
18686
18687 /* Handle the ARCH_STR argument to the arch= target attribute. */
18688
18689 static bool
18690 aarch64_handle_attr_arch (const char *str)
18691 {
18692 const struct processor *tmp_arch = NULL;
18693 std::string invalid_extension;
18694 aarch64_feature_flags tmp_flags;
18695 enum aarch64_parse_opt_result parse_res
18696 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
18697
18698 if (parse_res == AARCH64_PARSE_OK)
18699 {
18700 gcc_assert (tmp_arch);
18701 selected_arch = tmp_arch->arch;
18702 aarch64_set_asm_isa_flags (tmp_flags);
18703 return true;
18704 }
18705
18706 switch (parse_res)
18707 {
18708 case AARCH64_PARSE_MISSING_ARG:
18709 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
18710 break;
18711 case AARCH64_PARSE_INVALID_ARG:
18712 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
18713 aarch64_print_hint_for_arch (str);
18714 break;
18715 case AARCH64_PARSE_INVALID_FEATURE:
18716 error ("invalid feature modifier %qs of value %qs in "
18717 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18718 aarch64_print_hint_for_extensions (invalid_extension);
18719 break;
18720 default:
18721 gcc_unreachable ();
18722 }
18723
18724 return false;
18725 }
18726
18727 /* Handle the argument CPU_STR to the cpu= target attribute. */
18728
18729 static bool
18730 aarch64_handle_attr_cpu (const char *str)
18731 {
18732 const struct processor *tmp_cpu = NULL;
18733 std::string invalid_extension;
18734 aarch64_feature_flags tmp_flags;
18735 enum aarch64_parse_opt_result parse_res
18736 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
18737
18738 if (parse_res == AARCH64_PARSE_OK)
18739 {
18740 gcc_assert (tmp_cpu);
18741 selected_tune = tmp_cpu->ident;
18742 selected_arch = tmp_cpu->arch;
18743 aarch64_set_asm_isa_flags (tmp_flags);
18744 return true;
18745 }
18746
18747 switch (parse_res)
18748 {
18749 case AARCH64_PARSE_MISSING_ARG:
18750 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
18751 break;
18752 case AARCH64_PARSE_INVALID_ARG:
18753 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
18754 aarch64_print_hint_for_core (str);
18755 break;
18756 case AARCH64_PARSE_INVALID_FEATURE:
18757 error ("invalid feature modifier %qs of value %qs in "
18758 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18759 aarch64_print_hint_for_extensions (invalid_extension);
18760 break;
18761 default:
18762 gcc_unreachable ();
18763 }
18764
18765 return false;
18766 }
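
/* Unlike "arch=", a "cpu=" attribute picks both the tuning target and the
   architecture implied by the named core.  As a hypothetical illustration,
   target ("cpu=cortex-a57+nofp") would arrive here with STR equal to
   "cortex-a57+nofp", setting SELECTED_TUNE and SELECTED_ARCH from the
   cortex-a57 entry and removing the FP extension from the ISA flags.  */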
18767
18768 /* Handle the argument STR to the branch-protection= attribute. */
18769
18770 static bool
18771 aarch64_handle_attr_branch_protection (const char* str)
18772 {
18773 char *err_str = (char *) xmalloc (strlen (str) + 1);
18774 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
18775 &err_str);
18776 bool success = false;
18777 switch (res)
18778 {
18779 case AARCH64_PARSE_MISSING_ARG:
18780 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
18781 " attribute");
18782 break;
18783 case AARCH64_PARSE_INVALID_ARG:
18784 error ("invalid protection type %qs in %<target(\"branch-protection"
18785 "=\")%> pragma or attribute", err_str);
18786 break;
18787 case AARCH64_PARSE_OK:
18788 success = true;
18789 /* Fall through. */
18790 case AARCH64_PARSE_INVALID_FEATURE:
18791 break;
18792 default:
18793 gcc_unreachable ();
18794 }
18795 free (err_str);
18796 return success;
18797 }
18798
18799 /* Handle the argument STR to the tune= target attribute. */
18800
18801 static bool
18802 aarch64_handle_attr_tune (const char *str)
18803 {
18804 const struct processor *tmp_tune = NULL;
18805 enum aarch64_parse_opt_result parse_res
18806 = aarch64_parse_tune (str, &tmp_tune);
18807
18808 if (parse_res == AARCH64_PARSE_OK)
18809 {
18810 gcc_assert (tmp_tune);
18811 selected_tune = tmp_tune->ident;
18812 return true;
18813 }
18814
18815 switch (parse_res)
18816 {
18817 case AARCH64_PARSE_INVALID_ARG:
18818 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
18819 aarch64_print_hint_for_core (str);
18820 break;
18821 default:
18822 gcc_unreachable ();
18823 }
18824
18825 return false;
18826 }
18827
18828 /* Parse an architecture extensions target attribute string specified in STR.
18829 For example "+fp+nosimd". Show any errors if needed. Return TRUE
18830 if successful. Update aarch64_isa_flags to reflect the ISA features
18831 modified. */
18832
18833 static bool
18834 aarch64_handle_attr_isa_flags (char *str)
18835 {
18836 enum aarch64_parse_opt_result parse_res;
18837 auto isa_flags = aarch64_asm_isa_flags;
18838
18839 /* We allow "+nothing" at the beginning to clear out all architectural
18840 features if the user wants to handpick specific features. */
18841 if (strncmp ("+nothing", str, 8) == 0)
18842 {
18843 isa_flags = 0;
18844 str += 8;
18845 }
18846
18847 std::string invalid_extension;
18848 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
18849
18850 if (parse_res == AARCH64_PARSE_OK)
18851 {
18852 aarch64_set_asm_isa_flags (isa_flags);
18853 return true;
18854 }
18855
18856 switch (parse_res)
18857 {
18858 case AARCH64_PARSE_MISSING_ARG:
18859 error ("missing value in %<target()%> pragma or attribute");
18860 break;
18861
18862 case AARCH64_PARSE_INVALID_FEATURE:
18863 error ("invalid feature modifier %qs of value %qs in "
18864 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18865 break;
18866
18867 default:
18868 gcc_unreachable ();
18869 }
18870
18871 return false;
18872 }
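
/* For example (hypothetical attribute strings), "+nothing+simd" clears all
   feature bits and then enables only what "+simd" implies, whereas
   "+nosimd" starts from the current aarch64_asm_isa_flags and removes the
   SIMD feature.  */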
18873
18874 /* The target attributes that we support. On top of these we also support just
18875 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
18876 handled explicitly in aarch64_process_one_target_attr. */
18877
18878 static const struct aarch64_attribute_info aarch64_attributes[] =
18879 {
18880 { "general-regs-only", aarch64_attr_mask, false, NULL,
18881 OPT_mgeneral_regs_only },
18882 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
18883 OPT_mfix_cortex_a53_835769 },
18884 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
18885 OPT_mfix_cortex_a53_843419 },
18886 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
18887 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
18888 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
18889 OPT_momit_leaf_frame_pointer },
18890 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
18891 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
18892 OPT_march_ },
18893 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
18894 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
18895 OPT_mtune_ },
18896 { "branch-protection", aarch64_attr_custom, false,
18897 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
18898 { "sign-return-address", aarch64_attr_enum, false, NULL,
18899 OPT_msign_return_address_ },
18900 { "outline-atomics", aarch64_attr_bool, true, NULL,
18901 OPT_moutline_atomics},
18902 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
18903 };
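
/* As an illustrative example (hypothetical attribute string),
   "no-omit-leaf-frame-pointer,cmodel=small,arch=armv8-a" is split on
   commas by aarch64_process_target_attr below; the first token hits the
   aarch64_attr_bool entry for OPT_momit_leaf_frame_pointer in its negated
   form, the second hits the aarch64_attr_enum entry for OPT_mcmodel_, and
   the third is dispatched to the custom handler aarch64_handle_attr_arch.  */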
18904
18905 /* Parse ARG_STR which contains the definition of one target attribute.
18906 Show appropriate errors if any or return true if the attribute is valid. */
18907
18908 static bool
18909 aarch64_process_one_target_attr (char *arg_str)
18910 {
18911 bool invert = false;
18912
18913 size_t len = strlen (arg_str);
18914
18915 if (len == 0)
18916 {
18917 error ("malformed %<target()%> pragma or attribute");
18918 return false;
18919 }
18920
18921 char *str_to_check = (char *) alloca (len + 1);
18922 strcpy (str_to_check, arg_str);
18923
18924 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
18925 It is easier to detect and handle it explicitly here rather than going
18926 through the machinery for the rest of the target attributes in this
18927 function. */
18928 if (*str_to_check == '+')
18929 return aarch64_handle_attr_isa_flags (str_to_check);
18930
18931 if (len > 3 && startswith (str_to_check, "no-"))
18932 {
18933 invert = true;
18934 str_to_check += 3;
18935 }
18936 char *arg = strchr (str_to_check, '=');
18937
18938 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
18939 and point ARG to "foo". */
18940 if (arg)
18941 {
18942 *arg = '\0';
18943 arg++;
18944 }
18945 const struct aarch64_attribute_info *p_attr;
18946 bool found = false;
18947 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
18948 {
18949 /* If the names don't match up, or the user has given an argument
18950 to an attribute that doesn't accept one, or didn't give an argument
18951 to an attribute that expects one, fail to match. */
18952 if (strcmp (str_to_check, p_attr->name) != 0)
18953 continue;
18954
18955 found = true;
18956 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
18957 || p_attr->attr_type == aarch64_attr_enum;
18958
18959 if (attr_need_arg_p ^ (arg != NULL))
18960 {
18961 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
18962 return false;
18963 }
18964
18965 /* If the name matches but the attribute does not allow "no-" versions
18966 then we can't match. */
18967 if (invert && !p_attr->allow_neg)
18968 {
18969 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
18970 return false;
18971 }
18972
18973 switch (p_attr->attr_type)
18974 {
18975 /* Has a custom handler registered.
18976 For example, cpu=, arch=, tune=. */
18977 case aarch64_attr_custom:
18978 gcc_assert (p_attr->handler);
18979 if (!p_attr->handler (arg))
18980 return false;
18981 break;
18982
18983 /* Either set or unset a boolean option. */
18984 case aarch64_attr_bool:
18985 {
18986 struct cl_decoded_option decoded;
18987
18988 generate_option (p_attr->opt_num, NULL, !invert,
18989 CL_TARGET, &decoded);
18990 aarch64_handle_option (&global_options, &global_options_set,
18991 &decoded, input_location);
18992 break;
18993 }
18994 /* Set or unset a bit in the target_flags. aarch64_handle_option
18995 should know what mask to apply given the option number. */
18996 case aarch64_attr_mask:
18997 {
18998 struct cl_decoded_option decoded;
18999 /* We only need to specify the option number.
19000 aarch64_handle_option will know which mask to apply. */
19001 decoded.opt_index = p_attr->opt_num;
19002 decoded.value = !invert;
19003 aarch64_handle_option (&global_options, &global_options_set,
19004 &decoded, input_location);
19005 break;
19006 }
19007 /* Use the option setting machinery to set an option to an enum. */
19008 case aarch64_attr_enum:
19009 {
19010 gcc_assert (arg);
19011 bool valid;
19012 int value;
19013 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
19014 &value, CL_TARGET);
19015 if (valid)
19016 {
19017 set_option (&global_options, NULL, p_attr->opt_num, value,
19018 NULL, DK_UNSPECIFIED, input_location,
19019 global_dc);
19020 }
19021 else
19022 {
19023 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
19024 }
19025 break;
19026 }
19027 default:
19028 gcc_unreachable ();
19029 }
19030 }
19031
19032 /* If we reached here we either have found an attribute and validated
19033 it or didn't match any. If we matched an attribute but its arguments
19034 were malformed we will have returned false already. */
19035 return found;
19036 }
19037
19038 /* Count how many times the character C appears in
19039 NULL-terminated string STR. */
19040
19041 static unsigned int
19042 num_occurrences_in_str (char c, char *str)
19043 {
19044 unsigned int res = 0;
19045 while (*str != '\0')
19046 {
19047 if (*str == c)
19048 res++;
19049
19050 str++;
19051 }
19052
19053 return res;
19054 }
19055
19056 /* Parse the tree in ARGS that contains the target attribute information
19057 and update the global target options space. */
19058
19059 bool
19060 aarch64_process_target_attr (tree args)
19061 {
19062 if (TREE_CODE (args) == TREE_LIST)
19063 {
19064 do
19065 {
19066 tree head = TREE_VALUE (args);
19067 if (head)
19068 {
19069 if (!aarch64_process_target_attr (head))
19070 return false;
19071 }
19072 args = TREE_CHAIN (args);
19073 } while (args);
19074
19075 return true;
19076 }
19077
19078 if (TREE_CODE (args) != STRING_CST)
19079 {
19080 error ("attribute %<target%> argument not a string");
19081 return false;
19082 }
19083
19084 size_t len = strlen (TREE_STRING_POINTER (args));
19085 char *str_to_check = (char *) alloca (len + 1);
19086 strcpy (str_to_check, TREE_STRING_POINTER (args));
19087
19088 if (len == 0)
19089 {
19090 error ("malformed %<target()%> pragma or attribute");
19091 return false;
19092 }
19093
19094 /* Used to catch empty entries between commas, i.e.
19095 attribute ((target ("attr1,,attr2"))). */
19096 unsigned int num_commas = num_occurrences_in_str (',', str_to_check);
19097
19098 /* Handle multiple target attributes separated by ','. */
19099 char *token = strtok_r (str_to_check, ",", &str_to_check);
19100
19101 unsigned int num_attrs = 0;
19102 while (token)
19103 {
19104 num_attrs++;
19105 if (!aarch64_process_one_target_attr (token))
19106 {
19107 /* Check if token is possibly an arch extension without
19108 leading '+'. */
19109 aarch64_feature_flags isa_temp = 0;
19110 auto with_plus = std::string ("+") + token;
19111 enum aarch64_parse_opt_result ext_res
19112 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
19113
19114 if (ext_res == AARCH64_PARSE_OK)
19115 error ("arch extension %<%s%> should be prefixed by %<+%>",
19116 token);
19117 else
19118 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
19119 return false;
19120 }
19121
19122 token = strtok_r (NULL, ",", &str_to_check);
19123 }
19124
19125 if (num_attrs != num_commas + 1)
19126 {
19127 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19128 return false;
19129 }
19130
19131 return true;
19132 }
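
/* For example (hypothetical attribute strings),
   target ("arch=armv8-a,strict-align") is handled as two comma-separated
   attributes.  An empty entry, as in "strict-align,,outline-atomics", is
   caught by the NUM_ATTRS == NUM_COMMAS + 1 check above, and a bare
   extension name such as "crc" (missing its leading '+') produces the
   "should be prefixed by %<+%>" hint.  */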
19133
19134 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19135 process attribute ((target ("..."))). */
19136
19137 static bool
19138 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19139 {
19140 struct cl_target_option cur_target;
19141 bool ret;
19142 tree old_optimize;
19143 tree new_target, new_optimize;
19144 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19145
19146 /* If what we're processing is the current pragma string then the
19147 target option node is already stored in target_option_current_node
19148 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19149 having to re-parse the string. This is especially useful to keep
19150 arm_neon.h compile times down since that header contains a lot
19151 of intrinsics enclosed in pragmas. */
19152 if (!existing_target && args == current_target_pragma)
19153 {
19154 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19155 return true;
19156 }
19157 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19158
19159 old_optimize
19160 = build_optimization_node (&global_options, &global_options_set);
19161 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19162
19163 /* If the function changed the optimization levels as well as setting
19164 target options, start with the optimizations specified. */
19165 if (func_optimize && func_optimize != old_optimize)
19166 cl_optimization_restore (&global_options, &global_options_set,
19167 TREE_OPTIMIZATION (func_optimize));
19168
19169 /* Save the current target options to restore at the end. */
19170 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19171
19172 /* If fndecl already has some target attributes applied to it, unpack
19173 them so that we add this attribute on top of them, rather than
19174 overwriting them. */
19175 if (existing_target)
19176 {
19177 struct cl_target_option *existing_options
19178 = TREE_TARGET_OPTION (existing_target);
19179
19180 if (existing_options)
19181 cl_target_option_restore (&global_options, &global_options_set,
19182 existing_options);
19183 }
19184 else
19185 cl_target_option_restore (&global_options, &global_options_set,
19186 TREE_TARGET_OPTION (target_option_current_node));
19187
19188 ret = aarch64_process_target_attr (args);
19189
19190 /* Set up any additional state. */
19191 if (ret)
19192 {
19193 aarch64_override_options_internal (&global_options);
19194 new_target = build_target_option_node (&global_options,
19195 &global_options_set);
19196 }
19197 else
19198 new_target = NULL;
19199
19200 new_optimize = build_optimization_node (&global_options,
19201 &global_options_set);
19202
19203 if (fndecl && ret)
19204 {
19205 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19206
19207 if (old_optimize != new_optimize)
19208 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19209 }
19210
19211 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19212
19213 if (old_optimize != new_optimize)
19214 cl_optimization_restore (&global_options, &global_options_set,
19215 TREE_OPTIMIZATION (old_optimize));
19216 return ret;
19217 }
19218
19219 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
19220 tri-bool options (yes, no, don't care) and the default value is
19221 DEF, determine whether to reject inlining. */
19222
19223 static bool
19224 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
19225 int dont_care, int def)
19226 {
19227 /* If the callee doesn't care, always allow inlining. */
19228 if (callee == dont_care)
19229 return true;
19230
19231 /* If the caller doesn't care, always allow inlining. */
19232 if (caller == dont_care)
19233 return true;
19234
19235 /* Otherwise, allow inlining if either the callee and caller values
19236 agree, or if the callee is using the default value. */
19237 return (callee == caller || callee == def);
19238 }
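
/* A worked example, based on the -momit-leaf-frame-pointer check below,
   which passes DONT_CARE == 2 and DEF == 1 (assuming the usual encoding in
   which 1 means enabled, 0 disabled and 2 unspecified): a caller built with
   an explicit -momit-leaf-frame-pointer (value 1) may inline a callee that
   left the option unspecified (value 2), but not one built with an explicit
   -mno-omit-leaf-frame-pointer (value 0).  */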
19239
19240 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
19241 to inline CALLEE into CALLER based on target-specific info.
19242 Make sure that the caller and callee have compatible architectural
19243 features. Then go through the other possible target attributes
19244 and see if they can block inlining. Try not to reject always_inline
19245 callees unless they are incompatible architecturally. */
19246
19247 static bool
19248 aarch64_can_inline_p (tree caller, tree callee)
19249 {
19250 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
19251 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
19252
19253 struct cl_target_option *caller_opts
19254 = TREE_TARGET_OPTION (caller_tree ? caller_tree
19255 : target_option_default_node);
19256
19257 struct cl_target_option *callee_opts
19258 = TREE_TARGET_OPTION (callee_tree ? callee_tree
19259 : target_option_default_node);
19260
19261 /* Callee's ISA flags should be a subset of the caller's. */
19262 if ((caller_opts->x_aarch64_asm_isa_flags
19263 & callee_opts->x_aarch64_asm_isa_flags)
19264 != callee_opts->x_aarch64_asm_isa_flags)
19265 return false;
19266 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
19267 != callee_opts->x_aarch64_isa_flags)
19268 return false;
19269
19270 /* Allow non-strict aligned functions inlining into strict
19271 aligned ones. */
19272 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
19273 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
19274 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
19275 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
19276 return false;
19277
19278 bool always_inline = lookup_attribute ("always_inline",
19279 DECL_ATTRIBUTES (callee));
19280
19281 /* If the architectural features match up and the callee is always_inline
19282 then the other attributes don't matter. */
19283 if (always_inline)
19284 return true;
19285
19286 if (caller_opts->x_aarch64_cmodel_var
19287 != callee_opts->x_aarch64_cmodel_var)
19288 return false;
19289
19290 if (caller_opts->x_aarch64_tls_dialect
19291 != callee_opts->x_aarch64_tls_dialect)
19292 return false;
19293
19294 /* Honour explicit requests to workaround errata. */
19295 if (!aarch64_tribools_ok_for_inlining_p (
19296 caller_opts->x_aarch64_fix_a53_err835769,
19297 callee_opts->x_aarch64_fix_a53_err835769,
19298 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
19299 return false;
19300
19301 if (!aarch64_tribools_ok_for_inlining_p (
19302 caller_opts->x_aarch64_fix_a53_err843419,
19303 callee_opts->x_aarch64_fix_a53_err843419,
19304 2, TARGET_FIX_ERR_A53_843419))
19305 return false;
19306
19307 /* If the user explicitly specified -momit-leaf-frame-pointer for the
19308 caller and callee and they don't match up, reject inlining. */
19309 if (!aarch64_tribools_ok_for_inlining_p (
19310 caller_opts->x_flag_omit_leaf_frame_pointer,
19311 callee_opts->x_flag_omit_leaf_frame_pointer,
19312 2, 1))
19313 return false;
19314
19315 /* If the callee has specific tuning overrides, respect them. */
19316 if (callee_opts->x_aarch64_override_tune_string != NULL
19317 && caller_opts->x_aarch64_override_tune_string == NULL)
19318 return false;
19319
19320 /* If the user specified tuning override strings for the
19321 caller and callee and they don't match up, reject inlining.
19322 We just do a string compare here, we don't analyze the meaning
19323 of the string, as it would be too costly for little gain. */
19324 if (callee_opts->x_aarch64_override_tune_string
19325 && caller_opts->x_aarch64_override_tune_string
19326 && (strcmp (callee_opts->x_aarch64_override_tune_string,
19327 caller_opts->x_aarch64_override_tune_string) != 0))
19328 return false;
19329
19330 return true;
19331 }
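
/* A hypothetical example: a callee declared with
   __attribute__ ((target ("+sve"))) cannot be inlined into a caller compiled
   for plain -march=armv8-a, because the callee's ISA flags would not be a
   subset of the caller's; inlining in the opposite direction is allowed,
   subject to the remaining checks above.  */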
19332
19333 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
19334 hasn't been initialized already. */
19335
19336 unsigned int
19337 aarch64_tlsdesc_abi_id ()
19338 {
19339 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
19340 if (!tlsdesc_abi.initialized_p ())
19341 {
19342 HARD_REG_SET full_reg_clobbers;
19343 CLEAR_HARD_REG_SET (full_reg_clobbers);
19344 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
19345 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
19346 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
19347 SET_HARD_REG_BIT (full_reg_clobbers, regno);
19348 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
19349 }
19350 return tlsdesc_abi.id ();
19351 }
19352
19353 /* Return true if SYMBOL_REF X binds locally. */
19354
19355 static bool
19356 aarch64_symbol_binds_local_p (const_rtx x)
19357 {
19358 return (SYMBOL_REF_DECL (x)
19359 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
19360 : SYMBOL_REF_LOCAL_P (x));
19361 }
19362
19363 /* Return true if SYMBOL_REF X is thread local. */
19364 static bool
19365 aarch64_tls_symbol_p (rtx x)
19366 {
19367 if (! TARGET_HAVE_TLS)
19368 return false;
19369
19370 x = strip_salt (x);
19371 if (!SYMBOL_REF_P (x))
19372 return false;
19373
19374 return SYMBOL_REF_TLS_MODEL (x) != 0;
19375 }
19376
19377 /* Classify a TLS symbol into one of the TLS kinds. */
19378 enum aarch64_symbol_type
19379 aarch64_classify_tls_symbol (rtx x)
19380 {
19381 enum tls_model tls_kind = tls_symbolic_operand_type (x);
19382
19383 switch (tls_kind)
19384 {
19385 case TLS_MODEL_GLOBAL_DYNAMIC:
19386 case TLS_MODEL_LOCAL_DYNAMIC:
19387 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
19388
19389 case TLS_MODEL_INITIAL_EXEC:
19390 switch (aarch64_cmodel)
19391 {
19392 case AARCH64_CMODEL_TINY:
19393 case AARCH64_CMODEL_TINY_PIC:
19394 return SYMBOL_TINY_TLSIE;
19395 default:
19396 return SYMBOL_SMALL_TLSIE;
19397 }
19398
19399 case TLS_MODEL_LOCAL_EXEC:
19400 if (aarch64_tls_size == 12)
19401 return SYMBOL_TLSLE12;
19402 else if (aarch64_tls_size == 24)
19403 return SYMBOL_TLSLE24;
19404 else if (aarch64_tls_size == 32)
19405 return SYMBOL_TLSLE32;
19406 else if (aarch64_tls_size == 48)
19407 return SYMBOL_TLSLE48;
19408 else
19409 gcc_unreachable ();
19410
19411 case TLS_MODEL_EMULATED:
19412 case TLS_MODEL_NONE:
19413 return SYMBOL_FORCE_TO_MEM;
19414
19415 default:
19416 gcc_unreachable ();
19417 }
19418 }
19419
19420 /* Return the correct method for accessing X + OFFSET, where X is either
19421 a SYMBOL_REF or LABEL_REF. */
19422
19423 enum aarch64_symbol_type
19424 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
19425 {
19426 x = strip_salt (x);
19427
19428 if (LABEL_REF_P (x))
19429 {
19430 switch (aarch64_cmodel)
19431 {
19432 case AARCH64_CMODEL_LARGE:
19433 return SYMBOL_FORCE_TO_MEM;
19434
19435 case AARCH64_CMODEL_TINY_PIC:
19436 case AARCH64_CMODEL_TINY:
19437 return SYMBOL_TINY_ABSOLUTE;
19438
19439 case AARCH64_CMODEL_SMALL_SPIC:
19440 case AARCH64_CMODEL_SMALL_PIC:
19441 case AARCH64_CMODEL_SMALL:
19442 return SYMBOL_SMALL_ABSOLUTE;
19443
19444 default:
19445 gcc_unreachable ();
19446 }
19447 }
19448
19449 if (SYMBOL_REF_P (x))
19450 {
19451 if (aarch64_tls_symbol_p (x))
19452 return aarch64_classify_tls_symbol (x);
19453
19454 switch (aarch64_cmodel)
19455 {
19456 case AARCH64_CMODEL_TINY_PIC:
19457 case AARCH64_CMODEL_TINY:
19458 /* With -fPIC non-local symbols use the GOT. For orthogonality
19459 always use the GOT for extern weak symbols. */
19460 if ((flag_pic || SYMBOL_REF_WEAK (x))
19461 && !aarch64_symbol_binds_local_p (x))
19462 return SYMBOL_TINY_GOT;
19463
19464 /* When we retrieve symbol + offset address, we have to make sure
19465 the offset does not cause overflow of the final address. But
19466 we have no way of knowing the address of the symbol at compile time,
19467 so we can't accurately say if the distance between the PC and
19468 symbol + offset is outside the addressable range of +/-1MB in the
19469 TINY code model. So we limit the maximum offset to +/-64KB and
19470 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
19471 If offset_within_block_p is true we allow larger offsets. */
19472 if (!(IN_RANGE (offset, -0x10000, 0x10000)
19473 || offset_within_block_p (x, offset)))
19474 return SYMBOL_FORCE_TO_MEM;
19475
19476 return SYMBOL_TINY_ABSOLUTE;
19477
19478
19479 case AARCH64_CMODEL_SMALL_SPIC:
19480 case AARCH64_CMODEL_SMALL_PIC:
19481 case AARCH64_CMODEL_SMALL:
19482 if ((flag_pic || SYMBOL_REF_WEAK (x))
19483 && !aarch64_symbol_binds_local_p (x))
19484 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
19485 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
19486
19487 /* Same reasoning as the tiny code model, but the offset cap here is
19488 1MB, allowing +/-3.9GB for the offset to the symbol. */
19489 if (!(IN_RANGE (offset, -0x100000, 0x100000)
19490 || offset_within_block_p (x, offset)))
19491 return SYMBOL_FORCE_TO_MEM;
19492
19493 return SYMBOL_SMALL_ABSOLUTE;
19494
19495 case AARCH64_CMODEL_LARGE:
19496 /* This is alright even in PIC code as the constant
19497 pool reference is always PC relative and within
19498 the same translation unit. */
19499 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
19500 return SYMBOL_SMALL_ABSOLUTE;
19501 else
19502 return SYMBOL_FORCE_TO_MEM;
19503
19504 default:
19505 gcc_unreachable ();
19506 }
19507 }
19508
19509 /* By default push everything into the constant pool. */
19510 return SYMBOL_FORCE_TO_MEM;
19511 }
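
/* An illustrative case: for a non-weak, locally-binding symbol SYM, an
   access to SYM + 0x20000 under -mcmodel=tiny exceeds the +/-64KB offset
   cap above and is forced to memory (unless offset_within_block_p allows
   it), while under the default small code model the same offset is well
   within the +/-1MB cap and is classified as SYMBOL_SMALL_ABSOLUTE.  */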
19512
19513 bool
19514 aarch64_constant_address_p (rtx x)
19515 {
19516 return (CONSTANT_P (x) && memory_address_p (DImode, x));
19517 }
19518
19519 bool
19520 aarch64_legitimate_pic_operand_p (rtx x)
19521 {
19522 poly_int64 offset;
19523 x = strip_offset_and_salt (x, &offset);
19524 if (SYMBOL_REF_P (x))
19525 return false;
19526
19527 return true;
19528 }
19529
19530 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
19531 that should be rematerialized rather than spilled. */
19532
19533 static bool
19534 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
19535 {
19536 /* Support CSE and rematerialization of common constants. */
19537 if (CONST_INT_P (x)
19538 || CONST_DOUBLE_P (x))
19539 return true;
19540
19541 /* Only accept variable-length vector constants if they can be
19542 handled directly.
19543
19544 ??? It would be possible (but complex) to handle rematerialization
19545 of other constants via secondary reloads. */
19546 if (!GET_MODE_SIZE (mode).is_constant ())
19547 return aarch64_simd_valid_immediate (x, NULL);
19548
19549 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
19550 least be forced to memory and loaded from there. */
19551 if (CONST_VECTOR_P (x))
19552 return !targetm.cannot_force_const_mem (mode, x);
19553
19554 /* Do not allow vector struct mode constants for Advanced SIMD.
19555 We could support 0 and -1 easily, but they need support in
19556 aarch64-simd.md. */
19557 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19558 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
19559 return false;
19560
19561 if (GET_CODE (x) == HIGH)
19562 x = XEXP (x, 0);
19563
19564 /* Accept polynomial constants that can be calculated by using the
19565 destination of a move as the sole temporary. Constants that
19566 require a second temporary cannot be rematerialized (they can't be
19567 forced to memory and also aren't legitimate constants). */
19568 poly_int64 offset;
19569 if (poly_int_rtx_p (x, &offset))
19570 return aarch64_offset_temporaries (false, offset) <= 1;
19571
19572 /* If an offset is being added to something else, we need to allow the
19573 base to be moved into the destination register, meaning that there
19574 are no free temporaries for the offset. */
19575 x = strip_offset_and_salt (x, &offset);
19576 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
19577 return false;
19578
19579 /* Do not allow const (plus (anchor_symbol, const_int)). */
19580 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
19581 return false;
19582
19583 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
19584 so spilling them is better than rematerialization. */
19585 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
19586 return true;
19587
19588 /* Label references are always constant. */
19589 if (LABEL_REF_P (x))
19590 return true;
19591
19592 return false;
19593 }
19594
19595 rtx
19596 aarch64_load_tp (rtx target)
19597 {
19598 if (!target
19599 || GET_MODE (target) != Pmode
19600 || !register_operand (target, Pmode))
19601 target = gen_reg_rtx (Pmode);
19602
19603 /* Can return in any reg. */
19604 emit_insn (gen_aarch64_load_tp_hard (target));
19605 return target;
19606 }
19607
19608 /* On AAPCS systems, this is the "struct __va_list". */
19609 static GTY(()) tree va_list_type;
19610
19611 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
19612 Return the type to use as __builtin_va_list.
19613
19614 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
19615
19616 struct __va_list
19617 {
19618 void *__stack;
19619 void *__gr_top;
19620 void *__vr_top;
19621 int __gr_offs;
19622 int __vr_offs;
19623 }; */
19624
19625 static tree
19626 aarch64_build_builtin_va_list (void)
19627 {
19628 tree va_list_name;
19629 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19630
19631 /* Create the type. */
19632 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
19633 /* Give it the required name. */
19634 va_list_name = build_decl (BUILTINS_LOCATION,
19635 TYPE_DECL,
19636 get_identifier ("__va_list"),
19637 va_list_type);
19638 DECL_ARTIFICIAL (va_list_name) = 1;
19639 TYPE_NAME (va_list_type) = va_list_name;
19640 TYPE_STUB_DECL (va_list_type) = va_list_name;
19641
19642 /* Create the fields. */
19643 f_stack = build_decl (BUILTINS_LOCATION,
19644 FIELD_DECL, get_identifier ("__stack"),
19645 ptr_type_node);
19646 f_grtop = build_decl (BUILTINS_LOCATION,
19647 FIELD_DECL, get_identifier ("__gr_top"),
19648 ptr_type_node);
19649 f_vrtop = build_decl (BUILTINS_LOCATION,
19650 FIELD_DECL, get_identifier ("__vr_top"),
19651 ptr_type_node);
19652 f_groff = build_decl (BUILTINS_LOCATION,
19653 FIELD_DECL, get_identifier ("__gr_offs"),
19654 integer_type_node);
19655 f_vroff = build_decl (BUILTINS_LOCATION,
19656 FIELD_DECL, get_identifier ("__vr_offs"),
19657 integer_type_node);
19658
19659 /* Tell tree-stdarg pass about our internal offset fields.
19660 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
19661 purposes, to identify whether the code is updating the va_list internal
19662 offset fields in an irregular way. */
19663 va_list_gpr_counter_field = f_groff;
19664 va_list_fpr_counter_field = f_vroff;
19665
19666 DECL_ARTIFICIAL (f_stack) = 1;
19667 DECL_ARTIFICIAL (f_grtop) = 1;
19668 DECL_ARTIFICIAL (f_vrtop) = 1;
19669 DECL_ARTIFICIAL (f_groff) = 1;
19670 DECL_ARTIFICIAL (f_vroff) = 1;
19671
19672 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
19673 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
19674 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
19675 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
19676 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
19677
19678 TYPE_FIELDS (va_list_type) = f_stack;
19679 DECL_CHAIN (f_stack) = f_grtop;
19680 DECL_CHAIN (f_grtop) = f_vrtop;
19681 DECL_CHAIN (f_vrtop) = f_groff;
19682 DECL_CHAIN (f_groff) = f_vroff;
19683
19684 /* Compute its layout. */
19685 layout_type (va_list_type);
19686
19687 return va_list_type;
19688 }
19689
19690 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
19691 static void
19692 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
19693 {
19694 const CUMULATIVE_ARGS *cum;
19695 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19696 tree stack, grtop, vrtop, groff, vroff;
19697 tree t;
19698 int gr_save_area_size = cfun->va_list_gpr_size;
19699 int vr_save_area_size = cfun->va_list_fpr_size;
19700 int vr_offset;
19701
19702 cum = &crtl->args.info;
19703 if (cfun->va_list_gpr_size)
19704 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
19705 cfun->va_list_gpr_size);
19706 if (cfun->va_list_fpr_size)
19707 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
19708 * UNITS_PER_VREG, cfun->va_list_fpr_size);
19709
19710 if (!TARGET_FLOAT)
19711 {
19712 gcc_assert (cum->aapcs_nvrn == 0);
19713 vr_save_area_size = 0;
19714 }
19715
19716 f_stack = TYPE_FIELDS (va_list_type_node);
19717 f_grtop = DECL_CHAIN (f_stack);
19718 f_vrtop = DECL_CHAIN (f_grtop);
19719 f_groff = DECL_CHAIN (f_vrtop);
19720 f_vroff = DECL_CHAIN (f_groff);
19721
19722 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
19723 NULL_TREE);
19724 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
19725 NULL_TREE);
19726 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
19727 NULL_TREE);
19728 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
19729 NULL_TREE);
19730 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
19731 NULL_TREE);
19732
19733 /* Emit code to initialize STACK, which points to the next varargs stack
19734 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
19735 by named arguments. STACK is 8-byte aligned. */
19736 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
19737 if (cum->aapcs_stack_size > 0)
19738 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
19739 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
19740 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19741
19742 /* Emit code to initialize GRTOP, the top of the GR save area.
19743 virtual_incoming_args_rtx should have been 16 byte aligned. */
19744 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
19745 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
19746 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19747
19748 /* Emit code to initialize VRTOP, the top of the VR save area.
19749 This address is gr_save_area_bytes below GRTOP, rounded
19750 down to the next 16-byte boundary. */
19751 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
19752 vr_offset = ROUND_UP (gr_save_area_size,
19753 STACK_BOUNDARY / BITS_PER_UNIT);
19754
19755 if (vr_offset)
19756 t = fold_build_pointer_plus_hwi (t, -vr_offset);
19757 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
19758 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19759
19760 /* Emit code to initialize GROFF, the offset from GRTOP of the
19761 next GPR argument. */
19762 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
19763 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
19764 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19765
19766 /* Likewise emit code to initialize VROFF, the offset from VRTOP
19767 of the next VR argument. */
19768 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
19769 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
19770 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19771 }
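
/* A worked example of the layout set up above, assuming the save areas are
   not limited by cfun->va_list_gpr_size/va_list_fpr_size: if three general
   registers and two vector registers remain unnamed, gr_save_area_size is
   24 and vr_save_area_size is 32, so __gr_top points at
   virtual_incoming_args_rtx, __vr_top sits ROUND_UP (24, 16) == 32 bytes
   below it, __gr_offs is -24 and __vr_offs is -32.  */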
19772
19773 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
19774
19775 static tree
19776 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
19777 gimple_seq *post_p ATTRIBUTE_UNUSED)
19778 {
19779 tree addr;
19780 bool indirect_p;
19781 bool is_ha; /* is HFA or HVA. */
19782 bool dw_align; /* double-word align. */
19783 machine_mode ag_mode = VOIDmode;
19784 int nregs;
19785 machine_mode mode;
19786
19787 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19788 tree stack, f_top, f_off, off, arg, roundup, on_stack;
19789 HOST_WIDE_INT size, rsize, adjust, align;
19790 tree t, u, cond1, cond2;
19791
19792 indirect_p = pass_va_arg_by_reference (type);
19793 if (indirect_p)
19794 type = build_pointer_type (type);
19795
19796 mode = TYPE_MODE (type);
19797
19798 f_stack = TYPE_FIELDS (va_list_type_node);
19799 f_grtop = DECL_CHAIN (f_stack);
19800 f_vrtop = DECL_CHAIN (f_grtop);
19801 f_groff = DECL_CHAIN (f_vrtop);
19802 f_vroff = DECL_CHAIN (f_groff);
19803
19804 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
19805 f_stack, NULL_TREE);
19806 size = int_size_in_bytes (type);
19807
19808 unsigned int abi_break;
19809 unsigned int abi_break_packed;
19810 align
19811 = aarch64_function_arg_alignment (mode, type, &abi_break, &abi_break_packed)
19812 / BITS_PER_UNIT;
19813
19814 dw_align = false;
19815 adjust = 0;
19816 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
19817 &is_ha, false))
19818 {
19819 /* No frontends can create types with variable-sized modes, so we
19820 shouldn't be asked to pass or return them. */
19821 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
19822
19823 /* TYPE passed in fp/simd registers. */
19824 if (!TARGET_FLOAT)
19825 aarch64_err_no_fpadvsimd (mode);
19826
19827 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
19828 unshare_expr (valist), f_vrtop, NULL_TREE);
19829 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
19830 unshare_expr (valist), f_vroff, NULL_TREE);
19831
19832 rsize = nregs * UNITS_PER_VREG;
19833
19834 if (is_ha)
19835 {
19836 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
19837 adjust = UNITS_PER_VREG - ag_size;
19838 }
19839 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19840 && size < UNITS_PER_VREG)
19841 {
19842 adjust = UNITS_PER_VREG - size;
19843 }
19844 }
19845 else
19846 {
19847 /* TYPE passed in general registers. */
19848 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
19849 unshare_expr (valist), f_grtop, NULL_TREE);
19850 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
19851 unshare_expr (valist), f_groff, NULL_TREE);
19852 rsize = ROUND_UP (size, UNITS_PER_WORD);
19853 nregs = rsize / UNITS_PER_WORD;
19854
19855 if (align <= 8 && abi_break_packed && warn_psabi)
19856 inform (input_location, "parameter passing for argument of type "
19857 "%qT changed in GCC 13.1", type);
19858
19859 if (align > 8)
19860 {
19861 if (abi_break && warn_psabi)
19862 inform (input_location, "parameter passing for argument of type "
19863 "%qT changed in GCC 9.1", type);
19864 dw_align = true;
19865 }
19866
19867 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19868 && size < UNITS_PER_WORD)
19869 {
19870 adjust = UNITS_PER_WORD - size;
19871 }
19872 }
19873
19874 /* Get a local temporary for the field value. */
19875 off = get_initialized_tmp_var (f_off, pre_p, NULL);
19876
19877 /* Emit code to branch if off >= 0. */
19878 t = build2 (GE_EXPR, boolean_type_node, off,
19879 build_int_cst (TREE_TYPE (off), 0));
19880 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
19881
19882 if (dw_align)
19883 {
19884 /* Emit: offs = (offs + 15) & -16. */
19885 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19886 build_int_cst (TREE_TYPE (off), 15));
19887 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
19888 build_int_cst (TREE_TYPE (off), -16));
19889 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
19890 }
19891 else
19892 roundup = NULL;
19893
19894 /* Update ap.__[g|v]r_offs */
19895 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19896 build_int_cst (TREE_TYPE (off), rsize));
19897 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
19898
19899 /* String up. */
19900 if (roundup)
19901 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19902
19903 /* [cond2] if (ap.__[g|v]r_offs > 0) */
19904 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
19905 build_int_cst (TREE_TYPE (f_off), 0));
19906 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
19907
19908 /* String up: make sure the assignment happens before the use. */
19909 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
19910 COND_EXPR_ELSE (cond1) = t;
19911
19912 /* Prepare the trees handling the argument that is passed on the stack;
19913 the top-level node is stored in ON_STACK. */
19914 arg = get_initialized_tmp_var (stack, pre_p, NULL);
19915 if (align > 8)
19916 {
19917 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
19918 t = fold_build_pointer_plus_hwi (arg, 15);
19919 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19920 build_int_cst (TREE_TYPE (t), -16));
19921 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
19922 }
19923 else
19924 roundup = NULL;
19925 /* Advance ap.__stack */
19926 t = fold_build_pointer_plus_hwi (arg, size + 7);
19927 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19928 build_int_cst (TREE_TYPE (t), -8));
19929 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
19930 /* String up roundup and advance. */
19931 if (roundup)
19932 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19933 /* String up with arg */
19934 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
19935 /* Big-endianness related address adjustment. */
19936 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19937 && size < UNITS_PER_WORD)
19938 {
19939 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
19940 size_int (UNITS_PER_WORD - size));
19941 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
19942 }
19943
19944 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
19945 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
19946
19947 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
19948 t = off;
19949 if (adjust)
19950 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
19951 build_int_cst (TREE_TYPE (off), adjust));
19952
19953 t = fold_convert (sizetype, t);
19954 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
19955
19956 if (is_ha)
19957 {
19958 /* type ha; // treat as "struct {ftype field[n];}"
19959 ... [computing offs]
19960 for (i = 0; i <nregs; ++i, offs += 16)
19961 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
19962 return ha; */
19963 int i;
19964 tree tmp_ha, field_t, field_ptr_t;
19965
19966 /* Declare a local variable. */
19967 tmp_ha = create_tmp_var_raw (type, "ha");
19968 gimple_add_tmp_var (tmp_ha);
19969
19970 /* Establish the base type. */
19971 switch (ag_mode)
19972 {
19973 case E_SFmode:
19974 field_t = float_type_node;
19975 field_ptr_t = float_ptr_type_node;
19976 break;
19977 case E_DFmode:
19978 field_t = double_type_node;
19979 field_ptr_t = double_ptr_type_node;
19980 break;
19981 case E_TFmode:
19982 field_t = long_double_type_node;
19983 field_ptr_t = long_double_ptr_type_node;
19984 break;
19985 case E_SDmode:
19986 field_t = dfloat32_type_node;
19987 field_ptr_t = build_pointer_type (dfloat32_type_node);
19988 break;
19989 case E_DDmode:
19990 field_t = dfloat64_type_node;
19991 field_ptr_t = build_pointer_type (dfloat64_type_node);
19992 break;
19993 case E_TDmode:
19994 field_t = dfloat128_type_node;
19995 field_ptr_t = build_pointer_type (dfloat128_type_node);
19996 break;
19997 case E_HFmode:
19998 field_t = aarch64_fp16_type_node;
19999 field_ptr_t = aarch64_fp16_ptr_type_node;
20000 break;
20001 case E_BFmode:
20002 field_t = aarch64_bf16_type_node;
20003 field_ptr_t = aarch64_bf16_ptr_type_node;
20004 break;
20005 case E_V2SImode:
20006 case E_V4SImode:
20007 {
20008 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
20009 field_t = build_vector_type_for_mode (innertype, ag_mode);
20010 field_ptr_t = build_pointer_type (field_t);
20011 }
20012 break;
20013 default:
20014 gcc_assert (0);
20015 }
20016
20017 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
20018 TREE_ADDRESSABLE (tmp_ha) = 1;
20019 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
20020 addr = t;
20021 t = fold_convert (field_ptr_t, addr);
20022 t = build2 (MODIFY_EXPR, field_t,
20023 build1 (INDIRECT_REF, field_t, tmp_ha),
20024 build1 (INDIRECT_REF, field_t, t));
20025
20026 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
20027 for (i = 1; i < nregs; ++i)
20028 {
20029 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
20030 u = fold_convert (field_ptr_t, addr);
20031 u = build2 (MODIFY_EXPR, field_t,
20032 build2 (MEM_REF, field_t, tmp_ha,
20033 build_int_cst (field_ptr_t,
20034 (i *
20035 int_size_in_bytes (field_t)))),
20036 build1 (INDIRECT_REF, field_t, u));
20037 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
20038 }
20039
20040 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
20041 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
20042 }
20043
20044 COND_EXPR_ELSE (cond2) = t;
20045 addr = fold_convert (build_pointer_type (type), cond1);
20046 addr = build_va_arg_indirect_ref (addr);
20047
20048 if (indirect_p)
20049 addr = build_va_arg_indirect_ref (addr);
20050
20051 return addr;
20052 }
20053
20054 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
20055
20056 static void
20057 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
20058 const function_arg_info &arg,
20059 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
20060 {
20061 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
20062 CUMULATIVE_ARGS local_cum;
20063 int gr_saved = cfun->va_list_gpr_size;
20064 int vr_saved = cfun->va_list_fpr_size;
20065
20066 /* The caller has advanced CUM up to, but not beyond, the last named
20067 argument. Advance a local copy of CUM past the last "real" named
20068 argument, to find out how many registers are left over. */
20069 local_cum = *cum;
20070 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
20071 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
20072
20073 /* Find out how many registers we need to save.
20074 Honor the tree-stdarg analysis results. */
20075 if (cfun->va_list_gpr_size)
20076 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
20077 cfun->va_list_gpr_size / UNITS_PER_WORD);
20078 if (cfun->va_list_fpr_size)
20079 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
20080 cfun->va_list_fpr_size / UNITS_PER_VREG);
20081
20082 if (!TARGET_FLOAT)
20083 {
20084 gcc_assert (local_cum.aapcs_nvrn == 0);
20085 vr_saved = 0;
20086 }
20087
20088 if (!no_rtl)
20089 {
20090 if (gr_saved > 0)
20091 {
20092 rtx ptr, mem;
20093
20094 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
20095 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
20096 - gr_saved * UNITS_PER_WORD);
20097 mem = gen_frame_mem (BLKmode, ptr);
20098 set_mem_alias_set (mem, get_varargs_alias_set ());
20099
20100 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
20101 mem, gr_saved);
20102 }
20103 if (vr_saved > 0)
20104 {
20105 /* We can't use move_block_from_reg, because it will use
20106 the wrong mode, storing D regs only. */
20107 machine_mode mode = TImode;
20108 int off, i, vr_start;
20109
20110 /* Set OFF to the offset from virtual_incoming_args_rtx of
20111 the first vector register. The VR save area lies below
20112 the GR one, and is aligned to 16 bytes. */
20113 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
20114 STACK_BOUNDARY / BITS_PER_UNIT);
20115 off -= vr_saved * UNITS_PER_VREG;
20116
20117 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
20118 for (i = 0; i < vr_saved; ++i)
20119 {
20120 rtx ptr, mem;
20121
20122 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
20123 mem = gen_frame_mem (mode, ptr);
20124 set_mem_alias_set (mem, get_varargs_alias_set ());
20125 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
20126 off += UNITS_PER_VREG;
20127 }
20128 }
20129 }
20130
20131 /* We don't save the size into *PRETEND_SIZE because we want to avoid
20132 any complication of having crtl->args.pretend_args_size changed. */
20133 cfun->machine->frame.saved_varargs_size
20134 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
20135 STACK_BOUNDARY / BITS_PER_UNIT)
20136 + vr_saved * UNITS_PER_VREG);
20137 }
20138
20139 static void
20140 aarch64_conditional_register_usage (void)
20141 {
20142 int i;
20143 if (!TARGET_FLOAT)
20144 {
20145 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
20146 {
20147 fixed_regs[i] = 1;
20148 call_used_regs[i] = 1;
20149 CLEAR_HARD_REG_BIT (operand_reg_set, i);
20150 }
20151 }
20152 if (!TARGET_SVE)
20153 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
20154 {
20155 fixed_regs[i] = 1;
20156 call_used_regs[i] = 1;
20157 }
20158
20159 /* Only allow the FFR and FFRT to be accessed via special patterns. */
20160 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
20161 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
20162
20163 /* When tracking speculation, we need a couple of call-clobbered registers
20164 to track the speculation state. It would be nice to just use
20165 IP0 and IP1, but currently there are numerous places that just
20166 assume these registers are free for other uses (eg pointer
20167 authentication). */
20168 if (aarch64_track_speculation)
20169 {
20170 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
20171 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
20172 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20173 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20174 }
20175 }
20176
20177 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
20178
20179 bool
20180 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
20181 {
20182 /* For records we're passed a FIELD_DECL, for arrays we're passed
20183 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
20184 const_tree type = TREE_TYPE (field_or_array);
20185
20186 /* Assign BLKmode to anything that contains multiple SVE predicates.
20187 For structures, the "multiple" case is indicated by MODE being
20188 VOIDmode. */
20189 unsigned int num_zr, num_pr;
20190 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
20191 {
20192 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
20193 return !simple_cst_equal (TYPE_SIZE (field_or_array),
20194 TYPE_SIZE (type));
20195 return mode == VOIDmode;
20196 }
20197
20198 return default_member_type_forces_blk (field_or_array, mode);
20199 }
20200
20201 /* Bitmasks that indicate whether earlier versions of GCC would have
20202 taken a different path through the ABI logic. This should result in
20203 a -Wpsabi warning if the earlier path led to a different ABI decision.
20204
20205 WARN_PSABI_EMPTY_CXX17_BASE
20206 Indicates that the type includes an artificial empty C++17 base field
20207 that, prior to GCC 10.1, would prevent the type from being treated as
20208 a HFA or HVA. See PR94383 for details.
20209
20210 WARN_PSABI_NO_UNIQUE_ADDRESS
20211 Indicates that the type includes an empty [[no_unique_address]] field
20212 that, prior to GCC 10.1, would prevent the type from being treated as
20213 a HFA or HVA. */
20214 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
20215 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
20216 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
20217
20218 /* Walk down the type tree of TYPE counting consecutive base elements.
20219 If *MODEP is VOIDmode, then set it to the first valid floating point
20220 type. If a non-floating point type is found, or if a floating point
20221 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
20222 otherwise return the count in the sub-tree.
20223
20224 The WARN_PSABI_FLAGS argument allows the caller to check whether this
20225 function has changed its behavior relative to earlier versions of GCC.
20226 Normally the argument should be nonnull and point to a zero-initialized
20227 variable. The function then records whether the ABI decision might
20228 be affected by a known fix to the ABI logic, setting the associated
20229 WARN_PSABI_* bits if so.
20230
20231 When the argument is instead a null pointer, the function tries to
20232 simulate the behavior of GCC before all such ABI fixes were made.
20233 This is useful to check whether the function returns something
20234 different after the ABI fixes. */
20235 static int
20236 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
20237 unsigned int *warn_psabi_flags)
20238 {
20239 machine_mode mode;
20240 HOST_WIDE_INT size;
20241
20242 if (aarch64_sve::builtin_type_p (type))
20243 return -1;
20244
20245 switch (TREE_CODE (type))
20246 {
20247 case REAL_TYPE:
20248 mode = TYPE_MODE (type);
20249 if (mode != DFmode && mode != SFmode
20250 && mode != TFmode && mode != HFmode
20251 && mode != SDmode && mode != DDmode && mode != TDmode)
20252 return -1;
20253
20254 if (*modep == VOIDmode)
20255 *modep = mode;
20256
20257 if (*modep == mode)
20258 return 1;
20259
20260 break;
20261
20262 case COMPLEX_TYPE:
20263 mode = TYPE_MODE (TREE_TYPE (type));
20264 if (mode != DFmode && mode != SFmode
20265 && mode != TFmode && mode != HFmode)
20266 return -1;
20267
20268 if (*modep == VOIDmode)
20269 *modep = mode;
20270
20271 if (*modep == mode)
20272 return 2;
20273
20274 break;
20275
20276 case VECTOR_TYPE:
20277 /* Use V2SImode and V4SImode as representatives of all 64-bit
20278 and 128-bit vector types. */
20279 size = int_size_in_bytes (type);
20280 switch (size)
20281 {
20282 case 8:
20283 mode = V2SImode;
20284 break;
20285 case 16:
20286 mode = V4SImode;
20287 break;
20288 default:
20289 return -1;
20290 }
20291
20292 if (*modep == VOIDmode)
20293 *modep = mode;
20294
20295 /* Vector modes are considered to be opaque: two vectors are
20296 equivalent for the purposes of being homogeneous aggregates
20297 if they are the same size. */
20298 if (*modep == mode)
20299 return 1;
20300
20301 break;
20302
20303 case ARRAY_TYPE:
20304 {
20305 int count;
20306 tree index = TYPE_DOMAIN (type);
20307
20308 /* Can't handle incomplete types nor sizes that are not
20309 fixed. */
20310 if (!COMPLETE_TYPE_P (type)
20311 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20312 return -1;
20313
20314 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
20315 warn_psabi_flags);
20316 if (count == -1
20317 || !index
20318 || !TYPE_MAX_VALUE (index)
20319 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
20320 || !TYPE_MIN_VALUE (index)
20321 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
20322 || count < 0)
20323 return -1;
20324
20325 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
20326 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
20327
20328 /* There must be no padding. */
20329 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20330 count * GET_MODE_BITSIZE (*modep)))
20331 return -1;
20332
20333 return count;
20334 }
20335
20336 case RECORD_TYPE:
20337 {
20338 int count = 0;
20339 int sub_count;
20340 tree field;
20341
20342 /* Can't handle incomplete types nor sizes that are not
20343 fixed. */
20344 if (!COMPLETE_TYPE_P (type)
20345 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20346 return -1;
20347
20348 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20349 {
20350 if (TREE_CODE (field) != FIELD_DECL)
20351 continue;
20352
20353 if (DECL_FIELD_ABI_IGNORED (field))
20354 {
20355 /* See whether this is something that earlier versions of
20356 GCC failed to ignore. */
20357 unsigned int flag;
20358 if (lookup_attribute ("no_unique_address",
20359 DECL_ATTRIBUTES (field)))
20360 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
20361 else if (cxx17_empty_base_field_p (field))
20362 flag = WARN_PSABI_EMPTY_CXX17_BASE;
20363 else
20364 /* No compatibility problem. */
20365 continue;
20366
20367 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
20368 if (warn_psabi_flags)
20369 {
20370 *warn_psabi_flags |= flag;
20371 continue;
20372 }
20373 }
20374 /* A zero-width bitfield may affect layout in some
20375 circumstances, but adds no members. The determination
20376 of whether or not a type is an HFA is performed after
20377 layout is complete, so if the type still looks like an
20378 HFA afterwards, it is still classed as one. This is
20379 potentially an ABI break for the hard-float ABI. */
20380 else if (DECL_BIT_FIELD (field)
20381 && integer_zerop (DECL_SIZE (field)))
20382 {
20383 /* Prior to GCC 12 these fields were stripped early,
20384 hiding them from the back-end entirely and
20385 resulting in the correct behaviour for argument
20386 passing. Simulate that old behaviour without
20387 generating a warning. */
20388 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
20389 continue;
20390 if (warn_psabi_flags)
20391 {
20392 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
20393 continue;
20394 }
20395 }
20396
20397 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20398 warn_psabi_flags);
20399 if (sub_count < 0)
20400 return -1;
20401 count += sub_count;
20402 }
20403
20404 /* There must be no padding. */
20405 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20406 count * GET_MODE_BITSIZE (*modep)))
20407 return -1;
20408
20409 return count;
20410 }
20411
20412 case UNION_TYPE:
20413 case QUAL_UNION_TYPE:
20414 {
20415 /* These aren't very interesting except in a degenerate case. */
20416 int count = 0;
20417 int sub_count;
20418 tree field;
20419
20420 /* Can't handle incomplete types nor sizes that are not
20421 fixed. */
20422 if (!COMPLETE_TYPE_P (type)
20423 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20424 return -1;
20425
20426 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20427 {
20428 if (TREE_CODE (field) != FIELD_DECL)
20429 continue;
20430
20431 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20432 warn_psabi_flags);
20433 if (sub_count < 0)
20434 return -1;
20435 count = count > sub_count ? count : sub_count;
20436 }
20437
20438 /* There must be no padding. */
20439 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20440 count * GET_MODE_BITSIZE (*modep)))
20441 return -1;
20442
20443 return count;
20444 }
20445
20446 default:
20447 break;
20448 }
20449
20450 return -1;
20451 }
20452
20453 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
20454 type as described in AAPCS64 \S 4.1.2.
20455
20456 See the comment above aarch64_composite_type_p for the notes on MODE. */
20457
20458 static bool
20459 aarch64_short_vector_p (const_tree type,
20460 machine_mode mode)
20461 {
20462 poly_int64 size = -1;
20463
20464 if (type && TREE_CODE (type) == VECTOR_TYPE)
20465 {
20466 if (aarch64_sve::builtin_type_p (type))
20467 return false;
20468 size = int_size_in_bytes (type);
20469 }
20470 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
20471 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
20472 {
20473 /* The containing "else if" is too loose: it means that we look at TYPE
20474 if the type is a vector type (good), but that we otherwise ignore TYPE
20475 and look only at the mode. This is wrong because the type describes
20476 the language-level information whereas the mode is purely an internal
20477 GCC concept. We can therefore reach here for types that are not
20478 vectors in the AAPCS64 sense.
20479
20480 We can't "fix" that for the traditional Advanced SIMD vector modes
20481 without breaking backwards compatibility. However, there's no such
20482 baggage for the structure modes, which were introduced in GCC 12. */
20483 if (aarch64_advsimd_struct_mode_p (mode))
20484 return false;
20485
20486 /* For similar reasons, rely only on the type, not the mode, when
20487 processing SVE types. */
20488 if (type && aarch64_some_values_include_pst_objects_p (type))
20489 /* Leave later code to report an error if SVE is disabled. */
20490 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
20491 else
20492 size = GET_MODE_SIZE (mode);
20493 }
20494 if (known_eq (size, 8) || known_eq (size, 16))
20495 {
20496 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
20497 they are being treated as scalable AAPCS64 types. */
20498 gcc_assert (!aarch64_sve_mode_p (mode)
20499 && !aarch64_advsimd_struct_mode_p (mode));
20500 return true;
20501 }
20502 return false;
20503 }
20504
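/* For illustration: an 8-byte or 16-byte Advanced SIMD type such as
   int32x2_t or int32x4_t satisfies this predicate, whereas an SVE ACLE
   type such as svint32_t does not, since SVE builtin types are rejected
   up front and never reach the size check.  */
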
20505 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
20506 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
20507 array types. The C99 floating-point complex types are also considered
20508 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
20509 types, which are GCC extensions and out of the scope of AAPCS64, are
20510 treated as composite types here as well.
20511
20512 Note that MODE itself is not sufficient in determining whether a type
20513 is such a composite type or not. This is because
20514 stor-layout.cc:compute_record_mode may have already changed the MODE
20515 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
20516 structure with only one field may have its MODE set to the mode of the
20517 field. Also an integer mode whose size matches the size of the
20518 RECORD_TYPE type may be used to substitute the original mode
20519 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
20520 solely relied on. */
20521
20522 static bool
20523 aarch64_composite_type_p (const_tree type,
20524 machine_mode mode)
20525 {
20526 if (aarch64_short_vector_p (type, mode))
20527 return false;
20528
20529 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
20530 return true;
20531
20532 if (mode == BLKmode
20533 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
20534 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
20535 return true;
20536
20537 return false;
20538 }
20539
20540 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
20541 shall be passed or returned in simd/fp register(s) (providing these
20542 parameter passing registers are available).
20543
20544 Upon successful return, *COUNT returns the number of needed registers,
20545 *BASE_MODE returns the mode of the individual register and when IS_HA
20546 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
20547 floating-point aggregate or a homogeneous short-vector aggregate.
20548
20549 SILENT_P is true if the function should refrain from reporting any
20550 diagnostics. This should only be used if the caller is certain that
20551 any ABI decisions would eventually come through this function with
20552 SILENT_P set to false. */
20553
20554 static bool
20555 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
20556 const_tree type,
20557 machine_mode *base_mode,
20558 int *count,
20559 bool *is_ha,
20560 bool silent_p)
20561 {
20562 if (is_ha != NULL) *is_ha = false;
20563
20564 machine_mode new_mode = VOIDmode;
20565 bool composite_p = aarch64_composite_type_p (type, mode);
20566
20567 if ((!composite_p
20568 && (GET_MODE_CLASS (mode) == MODE_FLOAT
20569 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
20570 || aarch64_short_vector_p (type, mode))
20571 {
20572 *count = 1;
20573 new_mode = mode;
20574 }
20575 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
20576 {
20577 if (is_ha != NULL) *is_ha = true;
20578 *count = 2;
20579 new_mode = GET_MODE_INNER (mode);
20580 }
20581 else if (type && composite_p)
20582 {
20583 unsigned int warn_psabi_flags = 0;
20584 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
20585 &warn_psabi_flags);
20586 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
20587 {
20588 static unsigned last_reported_type_uid;
20589 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
20590 int alt;
20591 if (!silent_p
20592 && warn_psabi
20593 && warn_psabi_flags
20594 && uid != last_reported_type_uid
20595 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
20596 != ag_count))
20597 {
20598 const char *url10
20599 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
20600 const char *url12
20601 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
20602 gcc_assert (alt == -1);
20603 last_reported_type_uid = uid;
20604 /* Use TYPE_MAIN_VARIANT to strip any redundant const
20605 qualification. */
20606 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
20607 inform (input_location, "parameter passing for argument of "
20608 "type %qT with %<[[no_unique_address]]%> members "
20609 "changed %{in GCC 10.1%}",
20610 TYPE_MAIN_VARIANT (type), url10);
20611 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
20612 inform (input_location, "parameter passing for argument of "
20613 "type %qT when C++17 is enabled changed to match "
20614 "C++14 %{in GCC 10.1%}",
20615 TYPE_MAIN_VARIANT (type), url10);
20616 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
20617 inform (input_location, "parameter passing for argument of "
20618 "type %qT changed %{in GCC 12.1%}",
20619 TYPE_MAIN_VARIANT (type), url12);
20620 }
20621
20622 if (is_ha != NULL) *is_ha = true;
20623 *count = ag_count;
20624 }
20625 else
20626 return false;
20627 }
20628 else
20629 return false;
20630
20631 gcc_assert (!aarch64_sve_mode_p (new_mode));
20632 *base_mode = new_mode;
20633 return true;
20634 }
20635
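/* Worked example: for an argument of type struct { double x; double y; },
   aapcs_vfp_sub_candidate finds two DFmode elements, so the function
   returns true with *BASE_MODE == DFmode, *COUNT == 2 and *IS_HA set,
   making the argument eligible for two FP/SIMD registers.  A _Complex
   float argument instead takes the MODE_COMPLEX_FLOAT path and likewise
   yields *COUNT == 2, with *BASE_MODE == SFmode.  */
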
20636 /* Implement TARGET_STRUCT_VALUE_RTX. */
20637
20638 static rtx
20639 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
20640 int incoming ATTRIBUTE_UNUSED)
20641 {
20642 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
20643 }
20644
20645 /* Implements target hook vector_mode_supported_p. */
20646 static bool
20647 aarch64_vector_mode_supported_p (machine_mode mode)
20648 {
20649 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
20650 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
20651 }
20652
20653 /* Return the full-width SVE vector mode for element mode MODE, if one
20654 exists. */
20655 opt_machine_mode
20656 aarch64_full_sve_mode (scalar_mode mode)
20657 {
20658 switch (mode)
20659 {
20660 case E_DFmode:
20661 return VNx2DFmode;
20662 case E_SFmode:
20663 return VNx4SFmode;
20664 case E_HFmode:
20665 return VNx8HFmode;
20666 case E_BFmode:
20667 return VNx8BFmode;
20668 case E_DImode:
20669 return VNx2DImode;
20670 case E_SImode:
20671 return VNx4SImode;
20672 case E_HImode:
20673 return VNx8HImode;
20674 case E_QImode:
20675 return VNx16QImode;
20676 default:
20677 return opt_machine_mode ();
20678 }
20679 }
20680
20681 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
20682 if it exists. */
20683 opt_machine_mode
20684 aarch64_vq_mode (scalar_mode mode)
20685 {
20686 switch (mode)
20687 {
20688 case E_DFmode:
20689 return V2DFmode;
20690 case E_SFmode:
20691 return V4SFmode;
20692 case E_HFmode:
20693 return V8HFmode;
20694 case E_BFmode:
20695 return V8BFmode;
20696 case E_SImode:
20697 return V4SImode;
20698 case E_HImode:
20699 return V8HImode;
20700 case E_QImode:
20701 return V16QImode;
20702 case E_DImode:
20703 return V2DImode;
20704 default:
20705 return opt_machine_mode ();
20706 }
20707 }
20708
20709 /* Return appropriate SIMD container
20710 for MODE within a vector of WIDTH bits. */
20711 static machine_mode
20712 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
20713 {
20714 if (TARGET_SVE
20715 && maybe_ne (width, 128)
20716 && known_eq (width, BITS_PER_SVE_VECTOR))
20717 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20718
20719 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
20720 if (TARGET_SIMD)
20721 {
20722 if (known_eq (width, 128))
20723 return aarch64_vq_mode (mode).else_mode (word_mode);
20724 else
20725 switch (mode)
20726 {
20727 case E_SFmode:
20728 return V2SFmode;
20729 case E_HFmode:
20730 return V4HFmode;
20731 case E_BFmode:
20732 return V4BFmode;
20733 case E_SImode:
20734 return V2SImode;
20735 case E_HImode:
20736 return V4HImode;
20737 case E_QImode:
20738 return V8QImode;
20739 default:
20740 break;
20741 }
20742 }
20743 return word_mode;
20744 }
20745
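/* For example, SFmode with WIDTH 128 gives V4SFmode and with WIDTH 64
   gives V2SFmode; if SVE is enabled and WIDTH matches BITS_PER_SVE_VECTOR
   without being known to equal 128, SFmode maps to the scalable
   VNx4SFmode container instead.  */
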
20746 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
20747 and return whether the SVE mode should be preferred over the
20748 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
20749 static bool
20750 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
20751 {
20752 /* Take into account the aarch64-autovec-preference param if non-zero. */
20753 bool only_asimd_p = aarch64_autovec_preference == 1;
20754 bool only_sve_p = aarch64_autovec_preference == 2;
20755
20756 if (only_asimd_p)
20757 return false;
20758 if (only_sve_p)
20759 return true;
20760
20761 /* The preference in case of a tie in costs. */
20762 bool prefer_asimd = aarch64_autovec_preference == 3;
20763 bool prefer_sve = aarch64_autovec_preference == 4;
20764
20765 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
20766 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
20767 /* If the CPU information does not have an SVE width registered, use the
20768 generic poly_int comparison that prefers SVE. If a preference is
20769 explicitly requested, avoid this path. */
20770 if (aarch64_tune_params.sve_width == SVE_SCALABLE
20771 && !prefer_asimd
20772 && !prefer_sve)
20773 return maybe_gt (nunits_sve, nunits_asimd);
20774
20775 /* Otherwise estimate the runtime width of the modes involved. */
20776 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
20777 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
20778
20779 /* Preferring SVE means picking it first unless the Advanced SIMD mode
20780 is clearly wider. */
20781 if (prefer_sve)
20782 return est_sve >= est_asimd;
20783 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
20784 is clearly wider. */
20785 if (prefer_asimd)
20786 return est_sve > est_asimd;
20787
20788 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
20789 return est_sve > est_asimd;
20790 }
20791
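/* For example, with a tuning that sets sve_width to 128 and the
   aarch64-autovec-preference param left at its default of 0, VNx4SImode
   and V4SImode both estimate to four elements, so est_sve > est_asimd
   is false and the tie goes to Advanced SIMD.  */
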
20792 /* Return 128-bit container as the preferred SIMD mode for MODE. */
20793 static machine_mode
20794 aarch64_preferred_simd_mode (scalar_mode mode)
20795 {
20796 /* Take into account explicit auto-vectorization ISA preferences through
20797 aarch64_cmp_autovec_modes. */
20798 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
20799 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20800 if (TARGET_SIMD)
20801 return aarch64_vq_mode (mode).else_mode (word_mode);
20802 return word_mode;
20803 }
20804
20805 /* Return a list of possible vector sizes for the vectorizer
20806 to iterate over. */
20807 static unsigned int
20808 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
20809 {
20810 static const machine_mode sve_modes[] = {
20811 /* Try using full vectors for all element types. */
20812 VNx16QImode,
20813
20814 /* Try using 16-bit containers for 8-bit elements and full vectors
20815 for wider elements. */
20816 VNx8QImode,
20817
20818 /* Try using 32-bit containers for 8-bit and 16-bit elements and
20819 full vectors for wider elements. */
20820 VNx4QImode,
20821
20822 /* Try using 64-bit containers for all element types. */
20823 VNx2QImode
20824 };
20825
20826 static const machine_mode advsimd_modes[] = {
20827 /* Try using 128-bit vectors for all element types. */
20828 V16QImode,
20829
20830 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
20831 for wider elements. */
20832 V8QImode,
20833
20834 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
20835 for wider elements.
20836
20837 TODO: We could support a limited form of V4QImode too, so that
20838 we use 32-bit vectors for 8-bit elements. */
20839 V4HImode,
20840
20841 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
20842 for 64-bit elements.
20843
20844 TODO: We could similarly support limited forms of V2QImode and V2HImode
20845 for this case. */
20846 V2SImode
20847 };
20848
20849 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
20850 This is because:
20851
20852 - If we can't use N-byte Advanced SIMD vectors then the placement
20853 doesn't matter; we'll just continue as though the Advanced SIMD
20854 entry didn't exist.
20855
20856 - If an SVE main loop with N bytes ends up being cheaper than an
20857 Advanced SIMD main loop with N bytes then by default we'll replace
20858 the Advanced SIMD version with the SVE one.
20859
20860 - If an Advanced SIMD main loop with N bytes ends up being cheaper
20861 than an SVE main loop with N bytes then by default we'll try to
20862 use the SVE loop to vectorize the epilogue instead. */
20863
20864 bool only_asimd_p = aarch64_autovec_preference == 1;
20865 bool only_sve_p = aarch64_autovec_preference == 2;
20866
20867 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
20868 unsigned int advsimd_i = 0;
20869
20870 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
20871 {
20872 if (sve_i < ARRAY_SIZE (sve_modes)
20873 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
20874 advsimd_modes[advsimd_i]))
20875 modes->safe_push (sve_modes[sve_i++]);
20876 else
20877 modes->safe_push (advsimd_modes[advsimd_i++]);
20878 }
20879 while (sve_i < ARRAY_SIZE (sve_modes))
20880 modes->safe_push (sve_modes[sve_i++]);
20881
20882 unsigned int flags = 0;
20883 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
20884 can compare SVE against Advanced SIMD and so that we can compare
20885 multiple SVE vectorization approaches against each other. There's
20886 not really any point doing this for Advanced SIMD only, since the
20887 first mode that works should always be the best. */
20888 if (TARGET_SVE && aarch64_sve_compare_costs)
20889 flags |= VECT_COMPARE_COSTS;
20890 return flags;
20891 }
20892
20893 /* Implement TARGET_MANGLE_TYPE. */
20894
20895 static const char *
20896 aarch64_mangle_type (const_tree type)
20897 {
20898 /* The AArch64 ABI documents say that "__va_list" has to be
20899 mangled as if it is in the "std" namespace. */
20900 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
20901 return "St9__va_list";
20902
20903 /* Half-precision floating point types. */
20904 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
20905 {
20906 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
20907 return NULL;
20908 if (TYPE_MODE (type) == BFmode)
20909 return "u6__bf16";
20910 else
20911 return "Dh";
20912 }
20913
20914 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
20915 builtin types. */
20916 if (TYPE_NAME (type) != NULL)
20917 {
20918 const char *res;
20919 if ((res = aarch64_general_mangle_builtin_type (type))
20920 || (res = aarch64_sve::mangle_builtin_type (type)))
20921 return res;
20922 }
20923
20924 /* Use the default mangling. */
20925 return NULL;
20926 }
20927
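/* For example, __fp16 mangles as "Dh", __bf16 as "u6__bf16" and
   __va_list as "St9__va_list", while _Float16 (float16_type_node) falls
   through to the default mangling.  */
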
20928 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
20929
20930 static bool
20931 aarch64_verify_type_context (location_t loc, type_context_kind context,
20932 const_tree type, bool silent_p)
20933 {
20934 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
20935 }
20936
20937 /* Find the first rtx_insn before insn that will generate an assembly
20938 instruction. */
20939
20940 static rtx_insn *
20941 aarch64_prev_real_insn (rtx_insn *insn)
20942 {
20943 if (!insn)
20944 return NULL;
20945
20946 do
20947 {
20948 insn = prev_real_insn (insn);
20949 }
20950 while (insn && recog_memoized (insn) < 0);
20951
20952 return insn;
20953 }
20954
20955 static bool
20956 is_madd_op (enum attr_type t1)
20957 {
20958 unsigned int i;
20959 /* A number of these may be AArch32 only. */
20960 enum attr_type mlatypes[] = {
20961 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
20962 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
20963 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
20964 };
20965
20966 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
20967 {
20968 if (t1 == mlatypes[i])
20969 return true;
20970 }
20971
20972 return false;
20973 }
20974
20975 /* Check if there is a register dependency between a load and the insn
20976 for which we hold recog_data. */
20977
20978 static bool
20979 dep_between_memop_and_curr (rtx memop)
20980 {
20981 rtx load_reg;
20982 int opno;
20983
20984 gcc_assert (GET_CODE (memop) == SET);
20985
20986 if (!REG_P (SET_DEST (memop)))
20987 return false;
20988
20989 load_reg = SET_DEST (memop);
20990 for (opno = 1; opno < recog_data.n_operands; opno++)
20991 {
20992 rtx operand = recog_data.operand[opno];
20993 if (REG_P (operand)
20994 && reg_overlap_mentioned_p (load_reg, operand))
20995 return true;
20996
20997 }
20998 return false;
20999 }
21000
21001
21002 /* When working around the Cortex-A53 erratum 835769,
21003 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
21004 instruction and has a preceding memory instruction such that a NOP
21005 should be inserted between them. */
21006
21007 bool
21008 aarch64_madd_needs_nop (rtx_insn* insn)
21009 {
21010 enum attr_type attr_type;
21011 rtx_insn *prev;
21012 rtx body;
21013
21014 if (!TARGET_FIX_ERR_A53_835769)
21015 return false;
21016
21017 if (!INSN_P (insn) || recog_memoized (insn) < 0)
21018 return false;
21019
21020 attr_type = get_attr_type (insn);
21021 if (!is_madd_op (attr_type))
21022 return false;
21023
21024 prev = aarch64_prev_real_insn (insn);
21025 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
21026 Restore recog state to INSN to avoid state corruption. */
21027 extract_constrain_insn_cached (insn);
21028
21029 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
21030 return false;
21031
21032 body = single_set (prev);
21033
21034 /* If the previous insn is a memory op and there is no dependency between
21035 it and the DImode madd, emit a NOP between them. If body is NULL then we
21036 have a complex memory operation, probably a load/store pair.
21037 Be conservative for now and emit a NOP. */
21038 if (GET_MODE (recog_data.operand[0]) == DImode
21039 && (!body || !dep_between_memop_and_curr (body)))
21040 return true;
21041
21042 return false;
21043
21044 }
21045
21046
21047 /* Implement FINAL_PRESCAN_INSN. */
21048
21049 void
21050 aarch64_final_prescan_insn (rtx_insn *insn)
21051 {
21052 if (aarch64_madd_needs_nop (insn))
21053 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
21054 }
21055
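/* Illustrative sketch of the workaround: with -mfix-cortex-a53-835769,
   a sequence along the lines of

	ldr	x1, [x2]
	madd	x0, x3, x4, x5

   is emitted as

	ldr	x1, [x2]
	nop	// between mem op and mult-accumulate
	madd	x0, x3, x4, x5

   because the multiply-accumulate is 64-bit, directly follows a memory
   operation and has no register dependency on the loaded value.  */
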
21056
21057 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
21058 instruction. */
21059
21060 bool
21061 aarch64_sve_index_immediate_p (rtx base_or_step)
21062 {
21063 return (CONST_INT_P (base_or_step)
21064 && IN_RANGE (INTVAL (base_or_step), -16, 15));
21065 }
21066
21067 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
21068 when applied to mode MODE. Negate X first if NEGATE_P is true. */
21069
21070 bool
21071 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
21072 {
21073 rtx elt = unwrap_const_vec_duplicate (x);
21074 if (!CONST_INT_P (elt))
21075 return false;
21076
21077 HOST_WIDE_INT val = INTVAL (elt);
21078 if (negate_p)
21079 val = -val;
21080 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
21081
21082 if (val & 0xff)
21083 return IN_RANGE (val, 0, 0xff);
21084 return IN_RANGE (val, 0, 0xff00);
21085 }
21086
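/* For example, #3 (low byte only) and #0x300 (i.e. #3, LSL #8) are
   accepted, but #0x101 is rejected because it needs both a non-zero low
   byte and bits above the low byte.  */
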
21087 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
21088 instructions when applied to mode MODE. Negate X first if NEGATE_P
21089 is true. */
21090
21091 bool
21092 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
21093 {
21094 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
21095 return false;
21096
21097 /* After the optional negation, the immediate must be nonnegative.
21098 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
21099 instead of SQADD Zn.B, Zn.B, #129. */
21100 rtx elt = unwrap_const_vec_duplicate (x);
21101 return negate_p == (INTVAL (elt) < 0);
21102 }
21103
21104 /* Return true if X is a valid immediate operand for an SVE logical
21105 instruction such as AND. */
21106
21107 bool
21108 aarch64_sve_bitmask_immediate_p (rtx x)
21109 {
21110 rtx elt;
21111
21112 return (const_vec_duplicate_p (x, &elt)
21113 && CONST_INT_P (elt)
21114 && aarch64_bitmask_imm (INTVAL (elt),
21115 GET_MODE_INNER (GET_MODE (x))));
21116 }
21117
21118 /* Return true if X is a valid immediate for the SVE DUP and CPY
21119 instructions. */
21120
21121 bool
21122 aarch64_sve_dup_immediate_p (rtx x)
21123 {
21124 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
21125 if (!CONST_INT_P (x))
21126 return false;
21127
21128 HOST_WIDE_INT val = INTVAL (x);
21129 if (val & 0xff)
21130 return IN_RANGE (val, -0x80, 0x7f);
21131 return IN_RANGE (val, -0x8000, 0x7f00);
21132 }
21133
21134 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
21135 SIGNED_P says whether the operand is signed rather than unsigned. */
21136
21137 bool
21138 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
21139 {
21140 x = unwrap_const_vec_duplicate (x);
21141 return (CONST_INT_P (x)
21142 && (signed_p
21143 ? IN_RANGE (INTVAL (x), -16, 15)
21144 : IN_RANGE (INTVAL (x), 0, 127)));
21145 }
21146
21147 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
21148 instruction. Negate X first if NEGATE_P is true. */
21149
21150 bool
21151 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
21152 {
21153 rtx elt;
21154 REAL_VALUE_TYPE r;
21155
21156 if (!const_vec_duplicate_p (x, &elt)
21157 || !CONST_DOUBLE_P (elt))
21158 return false;
21159
21160 r = *CONST_DOUBLE_REAL_VALUE (elt);
21161
21162 if (negate_p)
21163 r = real_value_negate (&r);
21164
21165 if (real_equal (&r, &dconst1))
21166 return true;
21167 if (real_equal (&r, &dconsthalf))
21168 return true;
21169 return false;
21170 }
21171
21172 /* Return true if X is a valid immediate operand for an SVE FMUL
21173 instruction. */
21174
21175 bool
21176 aarch64_sve_float_mul_immediate_p (rtx x)
21177 {
21178 rtx elt;
21179
21180 return (const_vec_duplicate_p (x, &elt)
21181 && CONST_DOUBLE_P (elt)
21182 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
21183 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
21184 }
21185
21186 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
21187 for the Advanced SIMD operation described by WHICH and INSN. If INFO
21188 is nonnull, use it to describe valid immediates. */
21189 static bool
21190 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
21191 simd_immediate_info *info,
21192 enum simd_immediate_check which,
21193 simd_immediate_info::insn_type insn)
21194 {
21195 /* Try a 4-byte immediate with LSL. */
21196 for (unsigned int shift = 0; shift < 32; shift += 8)
21197 if ((val32 & (0xff << shift)) == val32)
21198 {
21199 if (info)
21200 *info = simd_immediate_info (SImode, val32 >> shift, insn,
21201 simd_immediate_info::LSL, shift);
21202 return true;
21203 }
21204
21205 /* Try a 2-byte immediate with LSL. */
21206 unsigned int imm16 = val32 & 0xffff;
21207 if (imm16 == (val32 >> 16))
21208 for (unsigned int shift = 0; shift < 16; shift += 8)
21209 if ((imm16 & (0xff << shift)) == imm16)
21210 {
21211 if (info)
21212 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
21213 simd_immediate_info::LSL, shift);
21214 return true;
21215 }
21216
21217 /* Try a 4-byte immediate with MSL, except for cases that MVN
21218 can handle. */
21219 if (which == AARCH64_CHECK_MOV)
21220 for (unsigned int shift = 8; shift < 24; shift += 8)
21221 {
21222 unsigned int low = (1 << shift) - 1;
21223 if (((val32 & (0xff << shift)) | low) == val32)
21224 {
21225 if (info)
21226 *info = simd_immediate_info (SImode, val32 >> shift, insn,
21227 simd_immediate_info::MSL, shift);
21228 return true;
21229 }
21230 }
21231
21232 return false;
21233 }
21234
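/* Worked examples: VAL32 == 0x00ab0000 matches the 4-byte LSL case with
   shift 16 (#0xab, LSL #16), whereas VAL32 == 0x00abffff matches only
   the MSL case, tried for AARCH64_CHECK_MOV, with shift 16
   (#0xab, MSL #16).  */
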
21235 /* Return true if replicating VAL64 is a valid immediate for the
21236 Advanced SIMD operation described by WHICH. If INFO is nonnull,
21237 use it to describe valid immediates. */
21238 static bool
21239 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
21240 simd_immediate_info *info,
21241 enum simd_immediate_check which)
21242 {
21243 unsigned int val32 = val64 & 0xffffffff;
21244 unsigned int val16 = val64 & 0xffff;
21245 unsigned int val8 = val64 & 0xff;
21246
21247 if (val32 == (val64 >> 32))
21248 {
21249 if ((which & AARCH64_CHECK_ORR) != 0
21250 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
21251 simd_immediate_info::MOV))
21252 return true;
21253
21254 if ((which & AARCH64_CHECK_BIC) != 0
21255 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
21256 simd_immediate_info::MVN))
21257 return true;
21258
21259 /* Try using a replicated byte. */
21260 if (which == AARCH64_CHECK_MOV
21261 && val16 == (val32 >> 16)
21262 && val8 == (val16 >> 8))
21263 {
21264 if (info)
21265 *info = simd_immediate_info (QImode, val8);
21266 return true;
21267 }
21268 }
21269
21270 /* Try using a bit-to-bytemask. */
21271 if (which == AARCH64_CHECK_MOV)
21272 {
21273 unsigned int i;
21274 for (i = 0; i < 64; i += 8)
21275 {
21276 unsigned char byte = (val64 >> i) & 0xff;
21277 if (byte != 0 && byte != 0xff)
21278 break;
21279 }
21280 if (i == 64)
21281 {
21282 if (info)
21283 *info = simd_immediate_info (DImode, val64);
21284 return true;
21285 }
21286 }
21287 return false;
21288 }
21289
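/* For example, VAL64 == 0x00000000ffffffff has every byte equal to 0x00
   or 0xff, so for AARCH64_CHECK_MOV it is accepted via the
   bit-to-bytemask (64-bit MOVI) case even though its two 32-bit halves
   differ.  */
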
21290 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
21291 instruction. If INFO is nonnull, use it to describe valid immediates. */
21292
21293 static bool
21294 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
21295 simd_immediate_info *info)
21296 {
21297 scalar_int_mode mode = DImode;
21298 unsigned int val32 = val64 & 0xffffffff;
21299 if (val32 == (val64 >> 32))
21300 {
21301 mode = SImode;
21302 unsigned int val16 = val32 & 0xffff;
21303 if (val16 == (val32 >> 16))
21304 {
21305 mode = HImode;
21306 unsigned int val8 = val16 & 0xff;
21307 if (val8 == (val16 >> 8))
21308 mode = QImode;
21309 }
21310 }
21311 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
21312 if (IN_RANGE (val, -0x80, 0x7f))
21313 {
21314 /* DUP with no shift. */
21315 if (info)
21316 *info = simd_immediate_info (mode, val);
21317 return true;
21318 }
21319 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
21320 {
21321 /* DUP with LSL #8. */
21322 if (info)
21323 *info = simd_immediate_info (mode, val);
21324 return true;
21325 }
21326 if (aarch64_bitmask_imm (val64, mode))
21327 {
21328 /* DUPM. */
21329 if (info)
21330 *info = simd_immediate_info (mode, val);
21331 return true;
21332 }
21333 return false;
21334 }
21335
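/* Worked examples: VAL64 == 0x0101010101010101 collapses to QImode
   value 1 and is handled by DUP with no shift, while VAL64 ==
   0x0200020002000200 collapses to HImode value 0x200 and is handled by
   DUP with LSL #8 (#2, LSL #8).  */
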
21336 /* Return true if X is an UNSPEC_PTRUE constant of the form:
21337
21338 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
21339
21340 where PATTERN is the svpattern as a CONST_INT and where ZERO
21341 is a zero constant of the required PTRUE mode (which can have
21342 fewer elements than X's mode, if zero bits are significant).
21343
21344 If so, and if INFO is nonnull, describe the immediate in INFO. */
21345 bool
21346 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
21347 {
21348 if (GET_CODE (x) != CONST)
21349 return false;
21350
21351 x = XEXP (x, 0);
21352 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
21353 return false;
21354
21355 if (info)
21356 {
21357 aarch64_svpattern pattern
21358 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
21359 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
21360 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
21361 *info = simd_immediate_info (int_mode, pattern);
21362 }
21363 return true;
21364 }
21365
21366 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
21367 it to describe valid immediates. */
21368
21369 static bool
21370 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
21371 {
21372 if (aarch64_sve_ptrue_svpattern_p (x, info))
21373 return true;
21374
21375 if (x == CONST0_RTX (GET_MODE (x)))
21376 {
21377 if (info)
21378 *info = simd_immediate_info (DImode, 0);
21379 return true;
21380 }
21381
21382 /* Analyze the value as a VNx16BImode. This should be relatively
21383 efficient, since rtx_vector_builder has enough built-in capacity
21384 to store all VLA predicate constants without needing the heap. */
21385 rtx_vector_builder builder;
21386 if (!aarch64_get_sve_pred_bits (builder, x))
21387 return false;
21388
21389 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
21390 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
21391 {
21392 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
21393 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
21394 if (pattern != AARCH64_NUM_SVPATTERNS)
21395 {
21396 if (info)
21397 {
21398 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
21399 *info = simd_immediate_info (int_mode, pattern);
21400 }
21401 return true;
21402 }
21403 }
21404 return false;
21405 }
21406
21407 /* Return true if OP is a valid SIMD immediate for the operation
21408 described by WHICH. If INFO is nonnull, use it to describe valid
21409 immediates. */
21410 bool
21411 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
21412 enum simd_immediate_check which)
21413 {
21414 machine_mode mode = GET_MODE (op);
21415 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21416 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21417 return false;
21418
21419 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
21420 return false;
21421
21422 if (vec_flags & VEC_SVE_PRED)
21423 return aarch64_sve_pred_valid_immediate (op, info);
21424
21425 scalar_mode elt_mode = GET_MODE_INNER (mode);
21426 rtx base, step;
21427 unsigned int n_elts;
21428 if (CONST_VECTOR_P (op)
21429 && CONST_VECTOR_DUPLICATE_P (op))
21430 n_elts = CONST_VECTOR_NPATTERNS (op);
21431 else if ((vec_flags & VEC_SVE_DATA)
21432 && const_vec_series_p (op, &base, &step))
21433 {
21434 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
21435 if (!aarch64_sve_index_immediate_p (base)
21436 || !aarch64_sve_index_immediate_p (step))
21437 return false;
21438
21439 if (info)
21440 {
21441 /* Get the corresponding container mode. E.g. an INDEX on V2SI
21442 should yield two integer values per 128-bit block, meaning
21443 that we need to treat it in the same way as V2DI and then
21444 ignore the upper 32 bits of each element. */
21445 elt_mode = aarch64_sve_container_int_mode (mode);
21446 *info = simd_immediate_info (elt_mode, base, step);
21447 }
21448 return true;
21449 }
21450 else if (CONST_VECTOR_P (op)
21451 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
21452 /* N_ELTS set above. */;
21453 else
21454 return false;
21455
21456 scalar_float_mode elt_float_mode;
21457 if (n_elts == 1
21458 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
21459 {
21460 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
21461 if (aarch64_float_const_zero_rtx_p (elt)
21462 || aarch64_float_const_representable_p (elt))
21463 {
21464 if (info)
21465 *info = simd_immediate_info (elt_float_mode, elt);
21466 return true;
21467 }
21468 }
21469
21470 /* If all elements in an SVE vector have the same value, we have a free
21471 choice between using the element mode and using the container mode.
21472 Using the element mode means that unused parts of the vector are
21473 duplicates of the used elements, while using the container mode means
21474 that the unused parts are an extension of the used elements. Using the
21475 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
21476 for its container mode VNx4SI while 0x00000101 isn't.
21477
21478 If not all elements in an SVE vector have the same value, we need the
21479 transition from one element to the next to occur at container boundaries.
21480 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
21481 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
21482 scalar_int_mode elt_int_mode;
21483 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
21484 elt_int_mode = aarch64_sve_container_int_mode (mode);
21485 else
21486 elt_int_mode = int_mode_for_mode (elt_mode).require ();
21487
21488 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
21489 if (elt_size > 8)
21490 return false;
21491
21492 /* Expand the vector constant out into a byte vector, with the least
21493 significant byte of the register first. */
21494 auto_vec<unsigned char, 16> bytes;
21495 bytes.reserve (n_elts * elt_size);
21496 for (unsigned int i = 0; i < n_elts; i++)
21497 {
21498 /* The vector is provided in gcc endian-neutral fashion.
21499 For aarch64_be Advanced SIMD, it must be laid out in the vector
21500 register in reverse order. */
21501 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
21502 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
21503
21504 if (elt_mode != elt_int_mode)
21505 elt = gen_lowpart (elt_int_mode, elt);
21506
21507 if (!CONST_INT_P (elt))
21508 return false;
21509
21510 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
21511 for (unsigned int byte = 0; byte < elt_size; byte++)
21512 {
21513 bytes.quick_push (elt_val & 0xff);
21514 elt_val >>= BITS_PER_UNIT;
21515 }
21516 }
21517
21518 /* The immediate must repeat every eight bytes. */
21519 unsigned int nbytes = bytes.length ();
21520 for (unsigned i = 8; i < nbytes; ++i)
21521 if (bytes[i] != bytes[i - 8])
21522 return false;
21523
21524 /* Get the repeating 8-byte value as an integer. No endian correction
21525 is needed here because bytes is already in lsb-first order. */
21526 unsigned HOST_WIDE_INT val64 = 0;
21527 for (unsigned int i = 0; i < 8; i++)
21528 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
21529 << (i * BITS_PER_UNIT));
21530
21531 if (vec_flags & VEC_SVE_DATA)
21532 return aarch64_sve_valid_immediate (val64, info);
21533 else
21534 return aarch64_advsimd_valid_immediate (val64, info, which);
21535 }
21536
21537 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
21538 has a step in the range of INDEX. Return the index expression if so,
21539 otherwise return null. */
21540 rtx
21541 aarch64_check_zero_based_sve_index_immediate (rtx x)
21542 {
21543 rtx base, step;
21544 if (const_vec_series_p (x, &base, &step)
21545 && base == const0_rtx
21546 && aarch64_sve_index_immediate_p (step))
21547 return step;
21548 return NULL_RTX;
21549 }
21550
21551 /* Check that immediate shift constants are within range. */
21552 bool
21553 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
21554 {
21555 x = unwrap_const_vec_duplicate (x);
21556 if (!CONST_INT_P (x))
21557 return false;
21558 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
21559 if (left)
21560 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
21561 else
21562 return IN_RANGE (INTVAL (x), 1, bit_width);
21563 }
21564
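/* For example, with 32-bit elements, left-shift immediates 0..31 and
   right-shift immediates 1..32 are accepted.  */
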
21565 /* Return the bitmask CONST_INT to select the bits required by a zero extract
21566 operation of width WIDTH at bit position POS. */
21567
21568 rtx
21569 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
21570 {
21571 gcc_assert (CONST_INT_P (width));
21572 gcc_assert (CONST_INT_P (pos));
21573
21574 unsigned HOST_WIDE_INT mask
21575 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
21576 return GEN_INT (mask << UINTVAL (pos));
21577 }
21578
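/* For example, WIDTH == 8 and POS == 16 give the mask 0xff0000.  */
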
21579 bool
21580 aarch64_mov_operand_p (rtx x, machine_mode mode)
21581 {
21582 if (GET_CODE (x) == HIGH
21583 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
21584 return true;
21585
21586 if (CONST_INT_P (x))
21587 return true;
21588
21589 if (VECTOR_MODE_P (GET_MODE (x)))
21590 {
21591 /* Require predicate constants to be VNx16BI before RA, so that we
21592 force everything to have a canonical form. */
21593 if (!lra_in_progress
21594 && !reload_completed
21595 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
21596 && GET_MODE (x) != VNx16BImode)
21597 return false;
21598
21599 return aarch64_simd_valid_immediate (x, NULL);
21600 }
21601
21602 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
21603 x = strip_salt (x);
21604
21605 /* GOT accesses are valid moves. */
21606 if (SYMBOL_REF_P (x)
21607 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
21608 return true;
21609
21610 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
21611 return true;
21612
21613 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
21614 return true;
21615
21616 return aarch64_classify_symbolic_expression (x)
21617 == SYMBOL_TINY_ABSOLUTE;
21618 }
21619
21620 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
21621 the constant creation. */
21622
21623 rtx
21624 aarch64_gen_shareable_zero (machine_mode mode)
21625 {
21626 machine_mode zmode = V4SImode;
21627 rtx tmp = gen_reg_rtx (zmode);
21628 emit_move_insn (tmp, CONST0_RTX (zmode));
21629 return lowpart_subreg (mode, tmp, zmode);
21630 }
21631
21632 /* Return a const_int vector of VAL. */
21633 rtx
21634 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
21635 {
21636 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
21637 return gen_const_vec_duplicate (mode, c);
21638 }
21639
21640 /* Check OP is a legal scalar immediate for the MOVI instruction. */
21641
21642 bool
21643 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
21644 {
21645 machine_mode vmode;
21646
21647 vmode = aarch64_simd_container_mode (mode, 64);
21648 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
21649 return aarch64_simd_valid_immediate (op_v, NULL);
21650 }
21651
21652 /* Construct and return a PARALLEL RTX vector with elements numbering the
21653 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
21654 the vector - from the perspective of the architecture. This does not
21655 line up with GCC's perspective on lane numbers, so we end up with
21656 different masks depending on our target endian-ness. The diagram
21657 below may help. We must draw the distinction when building masks
21658 which select one half of the vector. An instruction selecting
21659 architectural low-lanes for a big-endian target must be described using
21660 a mask selecting GCC high-lanes.
21661
21662 Big-Endian Little-Endian
21663
21664 GCC 0 1 2 3 3 2 1 0
21665 | x | x | x | x | | x | x | x | x |
21666 Architecture 3 2 1 0 3 2 1 0
21667
21668 Low Mask: { 2, 3 } { 0, 1 }
21669 High Mask: { 0, 1 } { 2, 3 }
21670
21671 MODE Is the mode of the vector and NUNITS is the number of units in it. */
21672
21673 rtx
21674 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
21675 {
21676 rtvec v = rtvec_alloc (nunits / 2);
21677 int high_base = nunits / 2;
21678 int low_base = 0;
21679 int base;
21680 rtx t1;
21681 int i;
21682
21683 if (BYTES_BIG_ENDIAN)
21684 base = high ? low_base : high_base;
21685 else
21686 base = high ? high_base : low_base;
21687
21688 for (i = 0; i < nunits / 2; i++)
21689 RTVEC_ELT (v, i) = GEN_INT (base + i);
21690
21691 t1 = gen_rtx_PARALLEL (mode, v);
21692 return t1;
21693 }
21694
21695 /* Check OP for validity as a PARALLEL RTX vector with elements
21696 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
21697 from the perspective of the architecture. See the diagram above
21698 aarch64_simd_vect_par_cnst_half for more details. */
21699
21700 bool
21701 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
21702 bool high)
21703 {
21704 int nelts;
21705 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
21706 return false;
21707
21708 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
21709 HOST_WIDE_INT count_op = XVECLEN (op, 0);
21710 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
21711 int i = 0;
21712
21713 if (count_op != count_ideal)
21714 return false;
21715
21716 for (i = 0; i < count_ideal; i++)
21717 {
21718 rtx elt_op = XVECEXP (op, 0, i);
21719 rtx elt_ideal = XVECEXP (ideal, 0, i);
21720
21721 if (!CONST_INT_P (elt_op)
21722 || INTVAL (elt_ideal) != INTVAL (elt_op))
21723 return false;
21724 }
21725 return true;
21726 }
21727
21728 /* Return a PARALLEL containing NELTS elements, with element I equal
21729 to BASE + I * STEP. */
21730
21731 rtx
21732 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
21733 {
21734 rtvec vec = rtvec_alloc (nelts);
21735 for (unsigned int i = 0; i < nelts; ++i)
21736 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
21737 return gen_rtx_PARALLEL (VOIDmode, vec);
21738 }
21739
21740 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
21741 series with step STEP. */
21742
21743 bool
21744 aarch64_stepped_int_parallel_p (rtx op, int step)
21745 {
21746 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
21747 return false;
21748
21749 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
21750 for (int i = 1; i < XVECLEN (op, 0); ++i)
21751 if (!CONST_INT_P (XVECEXP (op, 0, i))
21752 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
21753 return false;
21754
21755 return true;
21756 }
21757
21758 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
21759 HIGH (exclusive). */
21760 void
21761 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
21762 const_tree exp)
21763 {
21764 HOST_WIDE_INT lane;
21765 gcc_assert (CONST_INT_P (operand));
21766 lane = INTVAL (operand);
21767
21768 if (lane < low || lane >= high)
21769 {
21770 if (exp)
21771 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
21772 lane, low, high - 1);
21773 else
21774 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
21775 }
21776 }
21777
21778 /* Perform endian correction on lane number N, which indexes a vector
21779 of mode MODE, and return the result as an SImode rtx. */
21780
21781 rtx
21782 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
21783 {
21784 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
21785 }
21786
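/* For example, given the usual ENDIAN_LANE_N definition
   (NUNITS - 1 - N for big-endian, N otherwise), lane 0 of a V4SImode
   vector maps to 0 for little-endian and to 3 for big-endian.  */
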
21787 /* Return TRUE if OP is a valid vector addressing mode. */
21788
21789 bool
21790 aarch64_simd_mem_operand_p (rtx op)
21791 {
21792 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
21793 || REG_P (XEXP (op, 0)));
21794 }
21795
21796 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
21797
21798 bool
21799 aarch64_sve_ld1r_operand_p (rtx op)
21800 {
21801 struct aarch64_address_info addr;
21802 scalar_mode mode;
21803
21804 return (MEM_P (op)
21805 && is_a <scalar_mode> (GET_MODE (op), &mode)
21806 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
21807 && addr.type == ADDRESS_REG_IMM
21808 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
21809 }
21810
21811 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
21812 where the size of the read data is specified by `mode` and the size of the
21813 vector elements is specified by `elem_mode`. */
21814 bool
21815 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
21816 scalar_mode elem_mode)
21817 {
21818 struct aarch64_address_info addr;
21819 if (!MEM_P (op)
21820 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
21821 return false;
21822
21823 if (addr.type == ADDRESS_REG_IMM)
21824 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
21825
21826 if (addr.type == ADDRESS_REG_REG)
21827 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
21828
21829 return false;
21830 }
21831
21832 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
21833 bool
21834 aarch64_sve_ld1rq_operand_p (rtx op)
21835 {
21836 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
21837 GET_MODE_INNER (GET_MODE (op)));
21838 }
21839
21840 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
21841 accessing a vector where the element size is specified by `elem_mode`. */
21842 bool
21843 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
21844 {
21845 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
21846 }
21847
21848 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
21849 bool
21850 aarch64_sve_ldff1_operand_p (rtx op)
21851 {
21852 if (!MEM_P (op))
21853 return false;
21854
21855 struct aarch64_address_info addr;
21856 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
21857 return false;
21858
21859 if (addr.type == ADDRESS_REG_IMM)
21860 return known_eq (addr.const_offset, 0);
21861
21862 return addr.type == ADDRESS_REG_REG;
21863 }
21864
21865 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
21866 bool
21867 aarch64_sve_ldnf1_operand_p (rtx op)
21868 {
21869 struct aarch64_address_info addr;
21870
21871 return (MEM_P (op)
21872 && aarch64_classify_address (&addr, XEXP (op, 0),
21873 GET_MODE (op), false)
21874 && addr.type == ADDRESS_REG_IMM);
21875 }
21876
21877 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
21878 The conditions for STR are the same. */
21879 bool
21880 aarch64_sve_ldr_operand_p (rtx op)
21881 {
21882 struct aarch64_address_info addr;
21883
21884 return (MEM_P (op)
21885 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
21886 false, ADDR_QUERY_ANY)
21887 && addr.type == ADDRESS_REG_IMM);
21888 }
21889
21890 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
21891 addressing memory of mode MODE. */
21892 bool
21893 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
21894 {
21895 struct aarch64_address_info addr;
21896 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
21897 return false;
21898
21899 if (addr.type == ADDRESS_REG_IMM)
21900 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
21901
21902 return addr.type == ADDRESS_REG_REG;
21903 }
21904
21905 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
21906 We need to be able to access the individual pieces, so the range
21907 is different from LD[234] and ST[234]. */
21908 bool
21909 aarch64_sve_struct_memory_operand_p (rtx op)
21910 {
21911 if (!MEM_P (op))
21912 return false;
21913
21914 machine_mode mode = GET_MODE (op);
21915 struct aarch64_address_info addr;
21916 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
21917 ADDR_QUERY_ANY)
21918 || addr.type != ADDRESS_REG_IMM)
21919 return false;
21920
21921 poly_int64 first = addr.const_offset;
21922 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
21923 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
21924 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
21925 }
21926
21927 /* Emit a register copy from operand to operand, taking care not to
21928 early-clobber source registers in the process.
21929
21930 COUNT is the number of components into which the copy needs to be
21931 decomposed. */
21932 void
21933 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
21934 unsigned int count)
21935 {
21936 unsigned int i;
21937 int rdest = REGNO (operands[0]);
21938 int rsrc = REGNO (operands[1]);
21939
21940 if (!reg_overlap_mentioned_p (operands[0], operands[1])
21941 || rdest < rsrc)
21942 for (i = 0; i < count; i++)
21943 emit_move_insn (gen_rtx_REG (mode, rdest + i),
21944 gen_rtx_REG (mode, rsrc + i));
21945 else
21946 for (i = 0; i < count; i++)
21947 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
21948 gen_rtx_REG (mode, rsrc + count - i - 1));
21949 }
21950
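/* For example, copying an OImode value (COUNT == 2) from {v1, v2} to
   {v2, v3} overlaps with RDEST > RSRC, so the moves are emitted in
   reverse order (v3 <- v2, then v2 <- v1) to avoid clobbering v2 before
   it has been read.  */
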
21951 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
21952 one of VSTRUCT modes: OI, CI, or XI. */
21953 int
21954 aarch64_simd_attr_length_rglist (machine_mode mode)
21955 {
21956 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
21957 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
21958 }
21959
21960 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
21961 alignment of a vector to 128 bits. SVE predicates have an alignment of
21962 16 bits. */
21963 static HOST_WIDE_INT
21964 aarch64_simd_vector_alignment (const_tree type)
21965 {
21966 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
21967 be set for non-predicate vectors of booleans. Modes are the most
21968 direct way we have of identifying real SVE predicate types. */
21969 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
21970 return 16;
21971 widest_int min_size
21972 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
21973 return wi::umin (min_size, 128).to_uhwi ();
21974 }
21975
21976 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
21977 static poly_uint64
21978 aarch64_vectorize_preferred_vector_alignment (const_tree type)
21979 {
21980 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
21981 {
21982 /* If the length of the vector is a fixed power of 2, try to align
21983 to that length, otherwise don't try to align at all. */
21984 HOST_WIDE_INT result;
21985 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
21986 || !pow2p_hwi (result))
21987 result = TYPE_ALIGN (TREE_TYPE (type));
21988 return result;
21989 }
21990 return TYPE_ALIGN (type);
21991 }
21992
21993 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
21994 static bool
21995 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
21996 {
21997 if (is_packed)
21998 return false;
21999
22000 /* For fixed-length vectors, check that the vectorizer will aim for
22001 full-vector alignment. This isn't true for generic GCC vectors
22002 that are wider than the ABI maximum of 128 bits. */
22003 poly_uint64 preferred_alignment =
22004 aarch64_vectorize_preferred_vector_alignment (type);
22005 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
22006 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
22007 preferred_alignment))
22008 return false;
22009
22010 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
22011 return true;
22012 }
22013
22014 /* Return true if the vector misalignment factor is supported by the
22015 target. */
22016 static bool
22017 aarch64_builtin_support_vector_misalignment (machine_mode mode,
22018 const_tree type, int misalignment,
22019 bool is_packed)
22020 {
22021 if (TARGET_SIMD && STRICT_ALIGNMENT)
22022 {
22023 /* Return false if the movmisalign pattern is not supported for this mode. */
22024 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
22025 return false;
22026
22027 /* Misalignment factor is unknown at compile time. */
22028 if (misalignment == -1)
22029 return false;
22030 }
22031 return default_builtin_support_vector_misalignment (mode, type, misalignment,
22032 is_packed);
22033 }
22034
22035 /* If VALS is a vector constant that can be loaded into a register
22036 using DUP, generate instructions to do so and return an RTX to
22037 assign to the register. Otherwise return NULL_RTX. */
22038 static rtx
22039 aarch64_simd_dup_constant (rtx vals)
22040 {
22041 machine_mode mode = GET_MODE (vals);
22042 machine_mode inner_mode = GET_MODE_INNER (mode);
22043 rtx x;
22044
22045 if (!const_vec_duplicate_p (vals, &x))
22046 return NULL_RTX;
22047
22048 /* We can load this constant by using DUP and a constant in a
22049 single ARM register. This will be cheaper than a vector
22050 load. */
22051 x = copy_to_mode_reg (inner_mode, x);
22052 return gen_vec_duplicate (mode, x);
22053 }
22054
22055
22056 /* Generate code to load VALS, which is a PARALLEL containing only
22057 constants (for vec_init) or CONST_VECTOR, efficiently into a
22058 register. Returns an RTX to copy into the register, or NULL_RTX
22059 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
22060 static rtx
22061 aarch64_simd_make_constant (rtx vals)
22062 {
22063 machine_mode mode = GET_MODE (vals);
22064 rtx const_dup;
22065 rtx const_vec = NULL_RTX;
22066 int n_const = 0;
22067 int i;
22068
22069 if (CONST_VECTOR_P (vals))
22070 const_vec = vals;
22071 else if (GET_CODE (vals) == PARALLEL)
22072 {
22073 /* A CONST_VECTOR must contain only CONST_INTs and
22074 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
22075 Only store valid constants in a CONST_VECTOR. */
22076 int n_elts = XVECLEN (vals, 0);
22077 for (i = 0; i < n_elts; ++i)
22078 {
22079 rtx x = XVECEXP (vals, 0, i);
22080 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22081 n_const++;
22082 }
22083 if (n_const == n_elts)
22084 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
22085 }
22086 else
22087 gcc_unreachable ();
22088
22089 if (const_vec != NULL_RTX
22090 && aarch64_simd_valid_immediate (const_vec, NULL))
22091 /* Load using MOVI/MVNI. */
22092 return const_vec;
22093 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
22094 /* Loaded using DUP. */
22095 return const_dup;
22096 else if (const_vec != NULL_RTX)
22097 /* Load from constant pool. We cannot take advantage of single-cycle
22098 LD1 because we need a PC-relative addressing mode. */
22099 return const_vec;
22100 else
22101 /* A PARALLEL containing something not valid inside CONST_VECTOR.
22102 We cannot construct an initializer. */
22103 return NULL_RTX;
22104 }
22105
22106 /* Expand a vector initialisation sequence, such that TARGET is
22107 initialised to contain VALS. */
22108
22109 void
22110 aarch64_expand_vector_init (rtx target, rtx vals)
22111 {
22112 machine_mode mode = GET_MODE (target);
22113 scalar_mode inner_mode = GET_MODE_INNER (mode);
22114 /* The number of vector elements. */
22115 int n_elts = XVECLEN (vals, 0);
22116 /* The number of vector elements which are not constant. */
22117 int n_var = 0;
22118 rtx any_const = NULL_RTX;
22119 /* The first element of vals. */
22120 rtx v0 = XVECEXP (vals, 0, 0);
22121 bool all_same = true;
22122
22123 /* This is a special vec_init<M><N> where N is not an element mode but a
22124 vector mode with half the elements of M. We expect to find two entries
22125 of mode N in VALS and we must put their concatenation into TARGET. */
22126 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
22127 {
22128 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
22129 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
22130 && known_eq (GET_MODE_SIZE (mode),
22131 2 * GET_MODE_SIZE (narrow_mode)));
22132 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
22133 XVECEXP (vals, 0, 0),
22134 XVECEXP (vals, 0, 1)));
22135 return;
22136 }
22137
22138 /* Count the number of variable elements to initialise. */
22139 for (int i = 0; i < n_elts; ++i)
22140 {
22141 rtx x = XVECEXP (vals, 0, i);
22142 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
22143 ++n_var;
22144 else
22145 any_const = x;
22146
22147 all_same &= rtx_equal_p (x, v0);
22148 }
22149
22150 /* No variable elements, hand off to aarch64_simd_make_constant which knows
22151 how best to handle this. */
22152 if (n_var == 0)
22153 {
22154 rtx constant = aarch64_simd_make_constant (vals);
22155 if (constant != NULL_RTX)
22156 {
22157 emit_move_insn (target, constant);
22158 return;
22159 }
22160 }
22161
22162 /* Splat a single non-constant element if we can. */
22163 if (all_same)
22164 {
22165 rtx x = copy_to_mode_reg (inner_mode, v0);
22166 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22167 return;
22168 }
22169
22170 /* Check for interleaving case.
22171 For example, if the initializer is (int16x8_t) {x, y, x, y, x, y, x, y},
22172 generate the following code:
22173 dup v0.h, x
22174 dup v1.h, y
22175 zip1 v0.h, v0.h, v1.h
22176 for a "large enough" initializer. */
22177
22178 if (n_elts >= 8)
22179 {
22180 int i;
22181 for (i = 2; i < n_elts; i++)
22182 if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
22183 break;
22184
22185 if (i == n_elts)
22186 {
22187 machine_mode mode = GET_MODE (target);
22188 rtx dest[2];
22189
22190 for (int i = 0; i < 2; i++)
22191 {
22192 rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
22193 dest[i] = force_reg (mode, x);
22194 }
22195
22196 rtvec v = gen_rtvec (2, dest[0], dest[1]);
22197 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22198 return;
22199 }
22200 }
22201
22202 enum insn_code icode = optab_handler (vec_set_optab, mode);
22203 gcc_assert (icode != CODE_FOR_nothing);
22204
22205 /* If there are only variable elements, try to optimize
22206 the insertion using dup for the most common element
22207 followed by insertions. */
22208
22209 /* The algorithm will fill matches[*][0] with the earliest matching element,
22210 and matches[X][1] with the count of duplicate elements (if X is the
22211 earliest element which has duplicates). */
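
/* For an illustrative (hypothetical) input such as {x, y, x, x}, the loop
   below records matches[0][1] == 3, so MAXELEMENT becomes 0: x is duplicated
   across the whole register and only lane 1 still needs an explicit
   insertion of y. */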
22212
22213 if (n_var == n_elts && n_elts <= 16)
22214 {
22215 int matches[16][2] = {0};
22216 for (int i = 0; i < n_elts; i++)
22217 {
22218 for (int j = 0; j <= i; j++)
22219 {
22220 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
22221 {
22222 matches[i][0] = j;
22223 matches[j][1]++;
22224 break;
22225 }
22226 }
22227 }
22228 int maxelement = 0;
22229 int maxv = 0;
22230 for (int i = 0; i < n_elts; i++)
22231 if (matches[i][1] > maxv)
22232 {
22233 maxelement = i;
22234 maxv = matches[i][1];
22235 }
22236
22237 /* Create a duplicate of the most common element, unless all elements
22238 are equally useless to us, in which case just immediately set the
22239 vector register using the first element. */
22240
22241 if (maxv == 1)
22242 {
22243 /* For vectors of two 64-bit elements, we can do even better. */
22244 if (n_elts == 2
22245 && (inner_mode == E_DImode
22246 || inner_mode == E_DFmode))
22247
22248 {
22249 rtx x0 = XVECEXP (vals, 0, 0);
22250 rtx x1 = XVECEXP (vals, 0, 1);
22251 /* Combine can pick up this case, but handling it directly
22252 here leaves clearer RTL.
22253
22254 This is load_pair_lanes<mode>, and also gives us a clean-up
22255 for store_pair_lanes<mode>. */
22256 if (memory_operand (x0, inner_mode)
22257 && memory_operand (x1, inner_mode)
22258 && aarch64_mergeable_load_pair_p (mode, x0, x1))
22259 {
22260 rtx t;
22261 if (inner_mode == DFmode)
22262 t = gen_load_pair_lanesdf (target, x0, x1);
22263 else
22264 t = gen_load_pair_lanesdi (target, x0, x1);
22265 emit_insn (t);
22266 return;
22267 }
22268 }
22269 /* The subreg-move sequence below will move into lane zero of the
22270 vector register. For big-endian we want that position to hold
22271 the last element of VALS. */
22272 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
22273 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22274 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
22275 }
22276 else
22277 {
22278 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22279 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22280 }
22281
22282 /* Insert the rest. */
22283 for (int i = 0; i < n_elts; i++)
22284 {
22285 rtx x = XVECEXP (vals, 0, i);
22286 if (matches[i][0] == maxelement)
22287 continue;
22288 x = copy_to_mode_reg (inner_mode, x);
22289 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22290 }
22291 return;
22292 }
22293
22294 /* Initialise a vector which is part-variable. We want to first try
22295 to build those lanes which are constant in the most efficient way we
22296 can. */
22297 if (n_var != n_elts)
22298 {
22299 rtx copy = copy_rtx (vals);
22300
22301 /* Load constant part of vector. We really don't care what goes into the
22302 parts we will overwrite, but we're more likely to be able to load the
22303 constant efficiently if it has fewer, larger, repeating parts
22304 (see aarch64_simd_valid_immediate). */
22305 for (int i = 0; i < n_elts; i++)
22306 {
22307 rtx x = XVECEXP (vals, 0, i);
22308 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22309 continue;
22310 rtx subst = any_const;
22311 for (int bit = n_elts / 2; bit > 0; bit /= 2)
22312 {
22313 /* Look in the copied vector, as more elements are const. */
22314 rtx test = XVECEXP (copy, 0, i ^ bit);
22315 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
22316 {
22317 subst = test;
22318 break;
22319 }
22320 }
22321 XVECEXP (copy, 0, i) = subst;
22322 }
22323 aarch64_expand_vector_init (target, copy);
22324 }
22325
22326 /* Insert the variable lanes directly. */
22327 for (int i = 0; i < n_elts; i++)
22328 {
22329 rtx x = XVECEXP (vals, 0, i);
22330 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22331 continue;
22332 x = copy_to_mode_reg (inner_mode, x);
22333 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22334 }
22335 }
22336
22337 /* Emit RTL corresponding to:
22338 insr TARGET, ELEM. */
22339
22340 static void
22341 emit_insr (rtx target, rtx elem)
22342 {
22343 machine_mode mode = GET_MODE (target);
22344 scalar_mode elem_mode = GET_MODE_INNER (mode);
22345 elem = force_reg (elem_mode, elem);
22346
22347 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
22348 gcc_assert (icode != CODE_FOR_nothing);
22349 emit_insn (GEN_FCN (icode) (target, target, elem));
22350 }
22351
22352 /* Subroutine of aarch64_sve_expand_vector_init for handling
22353 trailing constants.
22354 This function works as follows:
22355 (a) Create a new vector consisting of trailing constants.
22356 (b) Initialize TARGET with the constant vector using emit_move_insn.
22357 (c) Insert remaining elements in TARGET using insr.
22358 NELTS is the total number of elements in the original vector, while
22359 NELTS_REQD is the number of elements that are actually
22360 significant.
22361
22362 ??? The heuristic used is to do the above only if the number of trailing
22363 constants is at least half the total number of elements. May need fine-tuning. */
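
/* As a rough illustration, for a 4-element group {a, b, 1, 2} this path
   first loads a constant vector whose leading lanes are {1, 2, ...} and then
   issues "insr" of b followed by "insr" of a, leaving {a, b, 1, 2, ...}
   in TARGET. */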
22364
22365 static bool
22366 aarch64_sve_expand_vector_init_handle_trailing_constants
22367 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
22368 {
22369 machine_mode mode = GET_MODE (target);
22370 scalar_mode elem_mode = GET_MODE_INNER (mode);
22371 int n_trailing_constants = 0;
22372
22373 for (int i = nelts_reqd - 1;
22374 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
22375 i--)
22376 n_trailing_constants++;
22377
22378 if (n_trailing_constants >= nelts_reqd / 2)
22379 {
22380 /* Try to use the natural pattern of BUILDER to extend the trailing
22381 constant elements to a full vector. Replace any variables in the
22382 extra elements with zeros.
22383
22384 ??? It would be better if the builders supported "don't care"
22385 elements, with the builder filling in whichever elements
22386 give the most compact encoding. */
22387 rtx_vector_builder v (mode, nelts, 1);
22388 for (int i = 0; i < nelts; i++)
22389 {
22390 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
22391 if (!valid_for_const_vector_p (elem_mode, x))
22392 x = CONST0_RTX (elem_mode);
22393 v.quick_push (x);
22394 }
22395 rtx const_vec = v.build ();
22396 emit_move_insn (target, const_vec);
22397
22398 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
22399 emit_insr (target, builder.elt (i));
22400
22401 return true;
22402 }
22403
22404 return false;
22405 }
22406
22407 /* Subroutine of aarch64_sve_expand_vector_init.
22408 Works as follows:
22409 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
22410 (b) Skip trailing elements from BUILDER, which are the same as
22411 element NELTS_REQD - 1.
22412 (c) Insert earlier elements in reverse order in TARGET using insr. */
22413
22414 static void
22415 aarch64_sve_expand_vector_init_insert_elems (rtx target,
22416 const rtx_vector_builder &builder,
22417 int nelts_reqd)
22418 {
22419 machine_mode mode = GET_MODE (target);
22420 scalar_mode elem_mode = GET_MODE_INNER (mode);
22421
22422 struct expand_operand ops[2];
22423 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
22424 gcc_assert (icode != CODE_FOR_nothing);
22425
22426 create_output_operand (&ops[0], target, mode);
22427 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
22428 expand_insn (icode, 2, ops);
22429
22430 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22431 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
22432 emit_insr (target, builder.elt (i));
22433 }
22434
22435 /* Subroutine of aarch64_sve_expand_vector_init to handle case
22436 when all trailing elements of builder are same.
22437 This works as follows:
22438 (a) Use expand_insn interface to broadcast last vector element in TARGET.
22439 (b) Insert remaining elements in TARGET using insr.
22440
22441 ??? The heuristic used is to do the above if the number of identical
22442 trailing elements is at least 3/4 of the total number of elements,
22443 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
22444
22445 static bool
22446 aarch64_sve_expand_vector_init_handle_trailing_same_elem
22447 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
22448 {
22449 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22450 if (ndups >= (3 * nelts_reqd) / 4)
22451 {
22452 aarch64_sve_expand_vector_init_insert_elems (target, builder,
22453 nelts_reqd - ndups + 1);
22454 return true;
22455 }
22456
22457 return false;
22458 }
22459
22460 /* Initialize register TARGET from BUILDER. NELTS is the constant number
22461 of elements in BUILDER.
22462
22463 The function tries to initialize TARGET from BUILDER if it fits one
22464 of the special cases outlined below.
22465
22466 Failing that, the function divides BUILDER into two sub-vectors:
22467 v_even = even elements of BUILDER;
22468 v_odd = odd elements of BUILDER;
22469
22470 and recursively calls itself with v_even and v_odd.
22471
22472 if (recursive call succeeded for v_even or v_odd)
22473 TARGET = zip (v_even, v_odd)
22474
22475 The function returns true if it managed to build TARGET from BUILDER
22476 with one of the special cases, false otherwise.
22477
22478 Example: {a, 1, b, 2, c, 3, d, 4}
22479
22480 The vector gets divided into:
22481 v_even = {a, b, c, d}
22482 v_odd = {1, 2, 3, 4}
22483
22484 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
22485 initializes tmp2 from the constant vector v_odd using emit_move_insn.
22486
22487 aarch64_sve_expand_vector_init(v_even) fails since v_even contains only
22488 variable elements (and just 4 of them), so we construct tmp1 from v_even using insr:
22489 tmp1 = dup(d)
22490 insr tmp1, c
22491 insr tmp1, b
22492 insr tmp1, a
22493
22494 And finally:
22495 TARGET = zip (tmp1, tmp2)
22496 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
22497
22498 static bool
22499 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
22500 int nelts, int nelts_reqd)
22501 {
22502 machine_mode mode = GET_MODE (target);
22503
22504 /* Case 1: Vector contains trailing constants. */
22505
22506 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22507 (target, builder, nelts, nelts_reqd))
22508 return true;
22509
22510 /* Case 2: Vector contains leading constants. */
22511
22512 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
22513 for (int i = 0; i < nelts_reqd; i++)
22514 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
22515 rev_builder.finalize ();
22516
22517 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22518 (target, rev_builder, nelts, nelts_reqd))
22519 {
22520 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22521 return true;
22522 }
22523
22524 /* Case 3: Vector contains trailing same element. */
22525
22526 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22527 (target, builder, nelts_reqd))
22528 return true;
22529
22530 /* Case 4: Vector contains leading same element. */
22531
22532 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22533 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
22534 {
22535 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22536 return true;
22537 }
22538
22539 /* Avoid recursing below 4 elements.
22540 ??? The threshold 4 may need fine-tuning. */
22541
22542 if (nelts_reqd <= 4)
22543 return false;
22544
22545 rtx_vector_builder v_even (mode, nelts, 1);
22546 rtx_vector_builder v_odd (mode, nelts, 1);
22547
22548 for (int i = 0; i < nelts * 2; i += 2)
22549 {
22550 v_even.quick_push (builder.elt (i));
22551 v_odd.quick_push (builder.elt (i + 1));
22552 }
22553
22554 v_even.finalize ();
22555 v_odd.finalize ();
22556
22557 rtx tmp1 = gen_reg_rtx (mode);
22558 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
22559 nelts, nelts_reqd / 2);
22560
22561 rtx tmp2 = gen_reg_rtx (mode);
22562 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
22563 nelts, nelts_reqd / 2);
22564
22565 if (!did_even_p && !did_odd_p)
22566 return false;
22567
22568 /* For whichever of v_even and v_odd did not match a special case,
22569 initialize its temporary using INSR, then zip the two together. */
22570
22571 if (!did_even_p)
22572 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
22573
22574 if (!did_odd_p)
22575 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
22576
22577 rtvec v = gen_rtvec (2, tmp1, tmp2);
22578 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22579 return true;
22580 }
22581
22582 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
22583
22584 void
22585 aarch64_sve_expand_vector_init (rtx target, rtx vals)
22586 {
22587 machine_mode mode = GET_MODE (target);
22588 int nelts = XVECLEN (vals, 0);
22589
22590 rtx_vector_builder v (mode, nelts, 1);
22591 for (int i = 0; i < nelts; i++)
22592 v.quick_push (XVECEXP (vals, 0, i));
22593 v.finalize ();
22594
22595 /* If neither sub-vector of v could be initialized specially,
22596 then use INSR to insert all elements from v into TARGET.
22597 ??? This might not be optimal for vectors with large
22598 initializers of 16 elements or more.
22599 For nelts < 4, it probably isn't useful to handle specially. */
22600
22601 if (nelts < 4
22602 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
22603 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
22604 }
22605
22606 /* Check whether VALUE is a vector constant in which every element
22607 is either a power of 2 or a negated power of 2. If so, return
22608 a constant vector of log2s, and flip CODE between PLUS and MINUS
22609 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
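
/* For example (values chosen for illustration): {8, 8, 8, 8} is converted
   to the shift-amount vector {3, 3, 3, 3} with CODE unchanged, while
   {-4, -4, -4, -4} is converted to {2, 2, 2, 2} and CODE is flipped
   between PLUS and MINUS. */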
22610
22611 static rtx
22612 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
22613 {
22614 if (!CONST_VECTOR_P (value))
22615 return NULL_RTX;
22616
22617 rtx_vector_builder builder;
22618 if (!builder.new_unary_operation (GET_MODE (value), value, false))
22619 return NULL_RTX;
22620
22621 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
22622 /* 1 if the result of the multiplication must be negated,
22623 0 if it mustn't, or -1 if we don't yet care. */
22624 int negate = -1;
22625 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
22626 for (unsigned int i = 0; i < encoded_nelts; ++i)
22627 {
22628 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
22629 if (!CONST_SCALAR_INT_P (elt))
22630 return NULL_RTX;
22631 rtx_mode_t val (elt, int_mode);
22632 wide_int pow2 = wi::neg (val);
22633 if (val != pow2)
22634 {
22635 /* It matters whether we negate or not. Make that choice,
22636 and make sure that it's consistent with previous elements. */
22637 if (negate == !wi::neg_p (val))
22638 return NULL_RTX;
22639 negate = wi::neg_p (val);
22640 if (!negate)
22641 pow2 = val;
22642 }
22643 /* POW2 is now the value that we want to be a power of 2. */
22644 int shift = wi::exact_log2 (pow2);
22645 if (shift < 0)
22646 return NULL_RTX;
22647 builder.quick_push (gen_int_mode (shift, int_mode));
22648 }
22649 if (negate == -1)
22650 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
22651 code = PLUS;
22652 else if (negate == 1)
22653 code = code == PLUS ? MINUS : PLUS;
22654 return builder.build ();
22655 }
22656
22657 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
22658 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
22659 operands array, in the same order as for fma_optab. Return true if
22660 the function emitted all the necessary instructions, false if the caller
22661 should generate the pattern normally with the new OPERANDS array. */
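
/* As an illustration, a multiply-add whose multiplier operand is the
   constant vector {4, 4, ...} is emitted here as a vector shift left by 2
   followed by an ordinary add, avoiding the multiply altogether. */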
22662
22663 bool
22664 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
22665 {
22666 machine_mode mode = GET_MODE (operands[0]);
22667 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
22668 {
22669 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
22670 NULL_RTX, true, OPTAB_DIRECT);
22671 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
22672 operands[3], product, operands[0], true,
22673 OPTAB_DIRECT);
22674 return true;
22675 }
22676 operands[2] = force_reg (mode, operands[2]);
22677 return false;
22678 }
22679
22680 /* Likewise, but for a conditional pattern. */
22681
22682 bool
22683 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
22684 {
22685 machine_mode mode = GET_MODE (operands[0]);
22686 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
22687 {
22688 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
22689 NULL_RTX, true, OPTAB_DIRECT);
22690 emit_insn (gen_cond (code, mode, operands[0], operands[1],
22691 operands[4], product, operands[5]));
22692 return true;
22693 }
22694 operands[3] = force_reg (mode, operands[3]);
22695 return false;
22696 }
22697
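/* Return the mask of bits actually used by a shift count in MODE for the
   shift truncation mask target hook: zero when shift counts are not known
   to be truncated (SHIFT_COUNT_TRUNCATED is false or MODE is a vector
   data mode), otherwise one less than the element bit size. */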
22698 static unsigned HOST_WIDE_INT
22699 aarch64_shift_truncation_mask (machine_mode mode)
22700 {
22701 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
22702 return 0;
22703 return GET_MODE_UNIT_BITSIZE (mode) - 1;
22704 }
22705
22706 /* Select a format to encode pointers in exception handling data. */
22707 int
22708 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
22709 {
22710 int type;
22711 switch (aarch64_cmodel)
22712 {
22713 case AARCH64_CMODEL_TINY:
22714 case AARCH64_CMODEL_TINY_PIC:
22715 case AARCH64_CMODEL_SMALL:
22716 case AARCH64_CMODEL_SMALL_PIC:
22717 case AARCH64_CMODEL_SMALL_SPIC:
22718 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
22719 for everything. */
22720 type = DW_EH_PE_sdata4;
22721 break;
22722 default:
22723 /* No assumptions here. 8-byte relocs required. */
22724 type = DW_EH_PE_sdata8;
22725 break;
22726 }
22727 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
22728 }
22729
22730 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
22731
22732 static void
22733 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
22734 {
22735 if (TREE_CODE (decl) == FUNCTION_DECL)
22736 {
22737 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
22738 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
22739 {
22740 fprintf (stream, "\t.variant_pcs\t");
22741 assemble_name (stream, name);
22742 fprintf (stream, "\n");
22743 }
22744 }
22745 }
22746
22747 /* The last .arch and .tune assembly strings that we printed. */
22748 static std::string aarch64_last_printed_arch_string;
22749 static std::string aarch64_last_printed_tune_string;
22750
22751 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
22752 by the function fndecl. */
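
/* For instance (hypothetical options), switching to a function compiled
   with different target options might cause a directive such as
   ".arch armv8-a+lse" to be printed just before its label; consecutive
   identical .arch strings are suppressed. */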
22753
22754 void
22755 aarch64_declare_function_name (FILE *stream, const char* name,
22756 tree fndecl)
22757 {
22758 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
22759
22760 struct cl_target_option *targ_options;
22761 if (target_parts)
22762 targ_options = TREE_TARGET_OPTION (target_parts);
22763 else
22764 targ_options = TREE_TARGET_OPTION (target_option_current_node);
22765 gcc_assert (targ_options);
22766
22767 const struct processor *this_arch
22768 = aarch64_get_arch (targ_options->x_selected_arch);
22769
22770 auto isa_flags = targ_options->x_aarch64_asm_isa_flags;
22771 std::string extension
22772 = aarch64_get_extension_string_for_isa_flags (isa_flags,
22773 this_arch->flags);
22774 /* Only update the assembler .arch string if it is distinct from the last
22775 such string we printed. */
22776 std::string to_print = this_arch->name + extension;
22777 if (to_print != aarch64_last_printed_arch_string)
22778 {
22779 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
22780 aarch64_last_printed_arch_string = to_print;
22781 }
22782
22783 /* Print the cpu name we're tuning for in the comments; it might be
22784 useful to readers of the generated asm. Do it only when it changes
22785 from function to function and verbose assembly is requested. */
22786 const struct processor *this_tune
22787 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
22788
22789 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
22790 {
22791 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
22792 this_tune->name);
22793 aarch64_last_printed_tune_string = this_tune->name;
22794 }
22795
22796 aarch64_asm_output_variant_pcs (stream, fndecl, name);
22797
22798 /* Don't forget the type directive for ELF. */
22799 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
22800 ASM_OUTPUT_LABEL (stream, name);
22801
22802 cfun->machine->label_is_assembled = true;
22803 }
22804
22805 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
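
/* A rough illustration: with BTI enabled and a function that may be called
   indirectly, -fpatchable-function-entry=2 produces an entry of the form
   (illustrative)
   foo:
   bti c
   nop
   nop
   i.e. the patch area is placed after the BTI landing pad rather than
   before it. */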
22806
22807 void
22808 aarch64_print_patchable_function_entry (FILE *file,
22809 unsigned HOST_WIDE_INT patch_area_size,
22810 bool record_p)
22811 {
22812 if (!cfun->machine->label_is_assembled)
22813 {
22814 /* Emit the patching area before the entry label, if any. */
22815 default_print_patchable_function_entry (file, patch_area_size,
22816 record_p);
22817 return;
22818 }
22819
22820 rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
22821 GEN_INT (record_p));
22822 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
22823
22824 if (!aarch64_bti_enabled ()
22825 || cgraph_node::get (cfun->decl)->only_called_directly_p ())
22826 {
22827 /* Emit the patchable_area at the beginning of the function. */
22828 rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
22829 INSN_ADDRESSES_NEW (insn, -1);
22830 return;
22831 }
22832
22833 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
22834 if (!insn
22835 || !INSN_P (insn)
22836 || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
22837 || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
22838 {
22839 /* Emit a BTI_C. */
22840 insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
22841 }
22842
22843 /* Emit the patchable_area after BTI_C. */
22844 insn = emit_insn_after (pa, insn);
22845 INSN_ADDRESSES_NEW (insn, -1);
22846 }
22847
22848 /* Output patchable area. */
22849
22850 void
22851 aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
22852 {
22853 default_print_patchable_function_entry (asm_out_file, patch_area_size,
22854 record_p);
22855 }
22856
22857 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
22858
22859 void
22860 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
22861 {
22862 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
22863 const char *value = IDENTIFIER_POINTER (target);
22864 aarch64_asm_output_variant_pcs (stream, decl, name);
22865 ASM_OUTPUT_DEF (stream, name, value);
22866 }
22867
22868 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
22869 function symbol references. */
22870
22871 void
22872 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
22873 {
22874 default_elf_asm_output_external (stream, decl, name);
22875 aarch64_asm_output_variant_pcs (stream, decl, name);
22876 }
22877
22878 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
22879 Used to output the .cfi_b_key_frame directive when signing the current
22880 function with the B key. */
22881
22882 void
22883 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
22884 {
22885 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
22886 && aarch64_ra_sign_key == AARCH64_KEY_B)
22887 asm_fprintf (f, "\t.cfi_b_key_frame\n");
22888 }
22889
22890 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
22891
22892 static void
22893 aarch64_start_file (void)
22894 {
22895 struct cl_target_option *default_options
22896 = TREE_TARGET_OPTION (target_option_default_node);
22897
22898 const struct processor *default_arch
22899 = aarch64_get_arch (default_options->x_selected_arch);
22900 auto default_isa_flags = default_options->x_aarch64_asm_isa_flags;
22901 std::string extension
22902 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
22903 default_arch->flags);
22904
22905 aarch64_last_printed_arch_string = default_arch->name + extension;
22906 aarch64_last_printed_tune_string = "";
22907 asm_fprintf (asm_out_file, "\t.arch %s\n",
22908 aarch64_last_printed_arch_string.c_str ());
22909
22910 default_file_start ();
22911 }
22912
22913 /* Emit load exclusive. */
22914
22915 static void
22916 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
22917 rtx mem, rtx model_rtx)
22918 {
22919 if (mode == TImode)
22920 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
22921 gen_highpart (DImode, rval),
22922 mem, model_rtx));
22923 else
22924 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
22925 }
22926
22927 /* Emit store exclusive. */
22928
22929 static void
22930 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
22931 rtx mem, rtx rval, rtx model_rtx)
22932 {
22933 if (mode == TImode)
22934 emit_insn (gen_aarch64_store_exclusive_pair
22935 (bval, mem, operand_subword (rval, 0, 0, TImode),
22936 operand_subword (rval, 1, 0, TImode), model_rtx));
22937 else
22938 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
22939 }
22940
22941 /* Mark the previous jump instruction as unlikely. */
22942
22943 static void
22944 aarch64_emit_unlikely_jump (rtx insn)
22945 {
22946 rtx_insn *jump = emit_jump_insn (insn);
22947 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
22948 }
22949
22950 /* We store the names of the various atomic helpers in a 5x5 array.
22951 Return the libcall function given MODE, MODEL and NAMES. */
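
/* For example, an SImode compare-and-swap with an acquire memory model
   selects mode index 2 and model index 1, i.e. the "__aarch64_cas4_acq"
   helper named by the DEF0/DEF5 tables below. */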
22952
22953 rtx
22954 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
22955 const atomic_ool_names *names)
22956 {
22957 memmodel model = memmodel_from_int (INTVAL (model_rtx));
22958 int mode_idx, model_idx;
22959
22960 switch (mode)
22961 {
22962 case E_QImode:
22963 mode_idx = 0;
22964 break;
22965 case E_HImode:
22966 mode_idx = 1;
22967 break;
22968 case E_SImode:
22969 mode_idx = 2;
22970 break;
22971 case E_DImode:
22972 mode_idx = 3;
22973 break;
22974 case E_TImode:
22975 mode_idx = 4;
22976 break;
22977 default:
22978 gcc_unreachable ();
22979 }
22980
22981 switch (model)
22982 {
22983 case MEMMODEL_RELAXED:
22984 model_idx = 0;
22985 break;
22986 case MEMMODEL_CONSUME:
22987 case MEMMODEL_ACQUIRE:
22988 model_idx = 1;
22989 break;
22990 case MEMMODEL_RELEASE:
22991 model_idx = 2;
22992 break;
22993 case MEMMODEL_ACQ_REL:
22994 case MEMMODEL_SEQ_CST:
22995 model_idx = 3;
22996 break;
22997 case MEMMODEL_SYNC_ACQUIRE:
22998 case MEMMODEL_SYNC_RELEASE:
22999 case MEMMODEL_SYNC_SEQ_CST:
23000 model_idx = 4;
23001 break;
23002 default:
23003 gcc_unreachable ();
23004 }
23005
23006 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
23007 VISIBILITY_HIDDEN);
23008 }
23009
23010 #define DEF0(B, N) \
23011 { "__aarch64_" #B #N "_relax", \
23012 "__aarch64_" #B #N "_acq", \
23013 "__aarch64_" #B #N "_rel", \
23014 "__aarch64_" #B #N "_acq_rel", \
23015 "__aarch64_" #B #N "_sync" }
23016
23017 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
23018 { NULL, NULL, NULL, NULL }
23019 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
23020
23021 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
23022 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
23023 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
23024 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
23025 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
23026 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
23027
23028 #undef DEF0
23029 #undef DEF4
23030 #undef DEF5
23031
23032 /* Expand a compare and swap pattern. */
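
/* Three strategies are used below: a single CAS instruction when LSE is
   available, a call to an out-of-line "__aarch64_casN_*" helper when
   outline atomics are enabled, and otherwise a load/store-exclusive loop
   expanded via code_for_aarch64_compare_and_swap. */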
23033
23034 void
23035 aarch64_expand_compare_and_swap (rtx operands[])
23036 {
23037 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
23038 machine_mode mode, r_mode;
23039
23040 bval = operands[0];
23041 rval = operands[1];
23042 mem = operands[2];
23043 oldval = operands[3];
23044 newval = operands[4];
23045 is_weak = operands[5];
23046 mod_s = operands[6];
23047 mod_f = operands[7];
23048 mode = GET_MODE (mem);
23049
23050 /* Normally the succ memory model must be stronger than fail, but in the
23051 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
23052 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
23053 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
23054 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
23055 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
23056
23057 r_mode = mode;
23058 if (mode == QImode || mode == HImode)
23059 {
23060 r_mode = SImode;
23061 rval = gen_reg_rtx (r_mode);
23062 }
23063
23064 if (TARGET_LSE)
23065 {
23066 /* The CAS insn requires oldval and rval overlap, but we need to
23067 have a copy of oldval saved across the operation to tell if
23068 the operation is successful. */
23069 if (reg_overlap_mentioned_p (rval, oldval))
23070 rval = copy_to_mode_reg (r_mode, oldval);
23071 else
23072 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
23073
23074 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
23075 newval, mod_s));
23076 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23077 }
23078 else if (TARGET_OUTLINE_ATOMICS)
23079 {
23080 /* Oldval must satisfy compare afterward. */
23081 if (!aarch64_plus_operand (oldval, mode))
23082 oldval = force_reg (mode, oldval);
23083 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
23084 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
23085 oldval, mode, newval, mode,
23086 XEXP (mem, 0), Pmode);
23087 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23088 }
23089 else
23090 {
23091 /* The oldval predicate varies by mode. Test it and force to reg. */
23092 insn_code code = code_for_aarch64_compare_and_swap (mode);
23093 if (!insn_data[code].operand[2].predicate (oldval, mode))
23094 oldval = force_reg (mode, oldval);
23095
23096 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
23097 is_weak, mod_s, mod_f));
23098 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
23099 }
23100
23101 if (r_mode != mode)
23102 rval = gen_lowpart (mode, rval);
23103 emit_move_insn (operands[1], rval);
23104
23105 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
23106 emit_insn (gen_rtx_SET (bval, x));
23107 }
23108
23109 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
23110 sequence implementing an atomic operation. */
23111
23112 static void
23113 aarch64_emit_post_barrier (enum memmodel model)
23114 {
23115 const enum memmodel base_model = memmodel_base (model);
23116
23117 if (is_mm_sync (model)
23118 && (base_model == MEMMODEL_ACQUIRE
23119 || base_model == MEMMODEL_ACQ_REL
23120 || base_model == MEMMODEL_SEQ_CST))
23121 {
23122 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
23123 }
23124 }
23125
23126 /* Split a compare and swap pattern. */
23127
23128 void
23129 aarch64_split_compare_and_swap (rtx operands[])
23130 {
23131 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
23132 gcc_assert (epilogue_completed);
23133
23134 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
23135 machine_mode mode;
23136 bool is_weak;
23137 rtx_code_label *label1, *label2;
23138 enum memmodel model;
23139
23140 rval = operands[0];
23141 mem = operands[1];
23142 oldval = operands[2];
23143 newval = operands[3];
23144 is_weak = (operands[4] != const0_rtx);
23145 model_rtx = operands[5];
23146 scratch = operands[7];
23147 mode = GET_MODE (mem);
23148 model = memmodel_from_int (INTVAL (model_rtx));
23149
23150 /* When OLDVAL is zero and we want the strong version we can emit a tighter
23151 loop:
23152 .label1:
23153 LD[A]XR rval, [mem]
23154 CBNZ rval, .label2
23155 ST[L]XR scratch, newval, [mem]
23156 CBNZ scratch, .label1
23157 .label2:
23158 CMP rval, 0. */
23159 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
23160 oldval == const0_rtx && mode != TImode);
23161
23162 label1 = NULL;
23163 if (!is_weak)
23164 {
23165 label1 = gen_label_rtx ();
23166 emit_label (label1);
23167 }
23168 label2 = gen_label_rtx ();
23169
23170 /* The initial load can be relaxed for a __sync operation since a final
23171 barrier will be emitted to stop code hoisting. */
23172 if (is_mm_sync (model))
23173 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
23174 else
23175 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
23176
23177 if (strong_zero_p)
23178 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
23179 else
23180 {
23181 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23182 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
23183 }
23184 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23185 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
23186 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23187
23188 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
23189
23190 if (!is_weak)
23191 {
23192 if (aarch64_track_speculation)
23193 {
23194 /* Emit an explicit compare instruction, so that we can correctly
23195 track the condition codes. */
23196 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23197 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23198 }
23199 else
23200 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
23201
23202 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23203 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
23204 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23205 }
23206 else
23207 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23208
23209 emit_label (label2);
23210
23211 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
23212 to set the condition flags. If this is not used it will be removed by
23213 later passes. */
23214 if (strong_zero_p)
23215 aarch64_gen_compare_reg (NE, rval, const0_rtx);
23216
23217 /* Emit any final barrier needed for a __sync operation. */
23218 if (is_mm_sync (model))
23219 aarch64_emit_post_barrier (model);
23220 }
23221
23222 /* Split an atomic operation. */
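
/* As a minimal illustration (register numbers are arbitrary), an SImode
   atomic add with a relaxed memory model and no LSE splits into a loop
   of the form:
   .L1: ldxr w0, [x1]
   add w2, w0, w3
   stxr w4, w2, [x1]
   cbnz w4, .L1 */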
23223
23224 void
23225 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
23226 rtx value, rtx model_rtx, rtx cond)
23227 {
23228 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
23229 gcc_assert (epilogue_completed);
23230
23231 machine_mode mode = GET_MODE (mem);
23232 machine_mode wmode = (mode == DImode ? DImode : SImode);
23233 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
23234 const bool is_sync = is_mm_sync (model);
23235 rtx_code_label *label;
23236 rtx x;
23237
23238 /* Split the atomic operation into a sequence. */
23239 label = gen_label_rtx ();
23240 emit_label (label);
23241
23242 if (new_out)
23243 new_out = gen_lowpart (wmode, new_out);
23244 if (old_out)
23245 old_out = gen_lowpart (wmode, old_out);
23246 else
23247 old_out = new_out;
23248 value = simplify_gen_subreg (wmode, value, mode, 0);
23249
23250 /* The initial load can be relaxed for a __sync operation since a final
23251 barrier will be emitted to stop code hoisting. */
23252 if (is_sync)
23253 aarch64_emit_load_exclusive (mode, old_out, mem,
23254 GEN_INT (MEMMODEL_RELAXED));
23255 else
23256 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
23257
23258 switch (code)
23259 {
23260 case SET:
23261 new_out = value;
23262 break;
23263
23264 case NOT:
23265 x = gen_rtx_AND (wmode, old_out, value);
23266 emit_insn (gen_rtx_SET (new_out, x));
23267 x = gen_rtx_NOT (wmode, new_out);
23268 emit_insn (gen_rtx_SET (new_out, x));
23269 break;
23270
23271 case MINUS:
23272 if (CONST_INT_P (value))
23273 {
23274 value = GEN_INT (-UINTVAL (value));
23275 code = PLUS;
23276 }
23277 /* Fall through. */
23278
23279 default:
23280 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
23281 emit_insn (gen_rtx_SET (new_out, x));
23282 break;
23283 }
23284
23285 aarch64_emit_store_exclusive (mode, cond, mem,
23286 gen_lowpart (mode, new_out), model_rtx);
23287
23288 if (aarch64_track_speculation)
23289 {
23290 /* Emit an explicit compare instruction, so that we can correctly
23291 track the condition codes. */
23292 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
23293 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23294 }
23295 else
23296 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
23297
23298 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23299 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
23300 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23301
23302 /* Emit any final barrier needed for a __sync operation. */
23303 if (is_sync)
23304 aarch64_emit_post_barrier (model);
23305 }
23306
23307 static void
23308 aarch64_init_libfuncs (void)
23309 {
23310 /* Half-precision float operations. The compiler handles all operations
23311 with NULL libfuncs by converting to SFmode. */
23312
23313 /* Conversions. */
23314 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
23315 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
23316
23317 /* Arithmetic. */
23318 set_optab_libfunc (add_optab, HFmode, NULL);
23319 set_optab_libfunc (sdiv_optab, HFmode, NULL);
23320 set_optab_libfunc (smul_optab, HFmode, NULL);
23321 set_optab_libfunc (neg_optab, HFmode, NULL);
23322 set_optab_libfunc (sub_optab, HFmode, NULL);
23323
23324 /* Comparisons. */
23325 set_optab_libfunc (eq_optab, HFmode, NULL);
23326 set_optab_libfunc (ne_optab, HFmode, NULL);
23327 set_optab_libfunc (lt_optab, HFmode, NULL);
23328 set_optab_libfunc (le_optab, HFmode, NULL);
23329 set_optab_libfunc (ge_optab, HFmode, NULL);
23330 set_optab_libfunc (gt_optab, HFmode, NULL);
23331 set_optab_libfunc (unord_optab, HFmode, NULL);
23332 }
23333
23334 /* Target hook for c_mode_for_suffix. */
23335 static machine_mode
23336 aarch64_c_mode_for_suffix (char suffix)
23337 {
23338 if (suffix == 'q')
23339 return TFmode;
23340
23341 return VOIDmode;
23342 }
23343
23344 /* We can only represent floating point constants which will fit in
23345 "quarter-precision" values. These values are characterised by
23346 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
23347 by:
23348
23349 (-1)^s * (n/16) * 2^r
23350
23351 Where:
23352 's' is the sign bit.
23353 'n' is an integer in the range 16 <= n <= 31.
23354 'r' is an integer in the range -3 <= r <= 4. */
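
/* For example, 1.25 is representable as (20/16) * 2^0 and 0.5 as
   (16/16) * 2^-1, whereas a value such as 1/3 has no representation of
   this form and is rejected. */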
23355
23356 /* Return true iff X can be represented by a quarter-precision
23357 floating point immediate operand. Note, we cannot represent 0.0. */
23358 bool
23359 aarch64_float_const_representable_p (rtx x)
23360 {
23361 /* This represents our current view of how many bits
23362 make up the mantissa. */
23363 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
23364 int exponent;
23365 unsigned HOST_WIDE_INT mantissa, mask;
23366 REAL_VALUE_TYPE r, m;
23367 bool fail;
23368
23369 x = unwrap_const_vec_duplicate (x);
23370 if (!CONST_DOUBLE_P (x))
23371 return false;
23372
23373 if (GET_MODE (x) == VOIDmode
23374 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
23375 return false;
23376
23377 r = *CONST_DOUBLE_REAL_VALUE (x);
23378
23379 /* We cannot represent infinities, NaNs or +/-zero. We won't
23380 know if we have +zero until we analyse the mantissa, but we
23381 can reject the other invalid values. */
23382 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
23383 || REAL_VALUE_MINUS_ZERO (r))
23384 return false;
23385
23386 /* Extract exponent. */
23387 r = real_value_abs (&r);
23388 exponent = REAL_EXP (&r);
23389
23390 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
23391 highest (sign) bit, with a fixed binary point at bit point_pos.
23392 m1 holds the low part of the mantissa, m2 the high part.
23393 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
23394 bits for the mantissa, this can fail (low bits will be lost). */
23395 real_ldexp (&m, &r, point_pos - exponent);
23396 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
23397
23398 /* If the low part of the mantissa has bits set we cannot represent
23399 the value. */
23400 if (w.ulow () != 0)
23401 return false;
23402 /* We have rejected the lower HOST_WIDE_INT, so update our
23403 understanding of how many bits lie in the mantissa and
23404 look only at the high HOST_WIDE_INT. */
23405 mantissa = w.elt (1);
23406 point_pos -= HOST_BITS_PER_WIDE_INT;
23407
23408 /* We can only represent values with a mantissa of the form 1.xxxx. */
23409 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
23410 if ((mantissa & mask) != 0)
23411 return false;
23412
23413 /* Having filtered unrepresentable values, we may now remove all
23414 but the highest 5 bits. */
23415 mantissa >>= point_pos - 5;
23416
23417 /* We cannot represent the value 0.0, so reject it. This is handled
23418 elsewhere. */
23419 if (mantissa == 0)
23420 return false;
23421
23422 /* Then, as bit 4 is always set, we can mask it off, leaving
23423 the mantissa in the range [0, 15]. */
23424 mantissa &= ~(1 << 4);
23425 gcc_assert (mantissa <= 15);
23426
23427 /* GCC internally does not use IEEE754-like encoding (where normalized
23428 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
23429 Our mantissa values are shifted 4 places to the left relative to
23430 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
23431 by 5 places to correct for GCC's representation. */
23432 exponent = 5 - exponent;
23433
23434 return (exponent >= 0 && exponent <= 7);
23435 }
23436
23437 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
23438 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
23439 output MOVI/MVNI, ORR or BIC immediate. */
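
/* For instance (illustrative operand numbers), a V4SI constant with every
   element equal to 0x100 yields the template "movi\t%0.4s, 0x1, lsl 8",
   and a floating-point zero vector is redirected through the integer
   MOVI path below. */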
23440 char*
23441 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
23442 enum simd_immediate_check which)
23443 {
23444 bool is_valid;
23445 static char templ[40];
23446 const char *mnemonic;
23447 const char *shift_op;
23448 unsigned int lane_count = 0;
23449 char element_char;
23450
23451 struct simd_immediate_info info;
23452
23453 /* This will return true to show const_vector is legal for use as either
23454 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
23455 It will also update INFO to show how the immediate should be generated.
23456 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
23457 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
23458 gcc_assert (is_valid);
23459
23460 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23461 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
23462
23463 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23464 {
23465 gcc_assert (info.insn == simd_immediate_info::MOV
23466 && info.u.mov.shift == 0);
23467 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
23468 move immediate path. */
23469 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23470 info.u.mov.value = GEN_INT (0);
23471 else
23472 {
23473 const unsigned int buf_size = 20;
23474 char float_buf[buf_size] = {'\0'};
23475 real_to_decimal_for_mode (float_buf,
23476 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23477 buf_size, buf_size, 1, info.elt_mode);
23478
23479 if (lane_count == 1)
23480 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
23481 else
23482 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
23483 lane_count, element_char, float_buf);
23484 return templ;
23485 }
23486 }
23487
23488 gcc_assert (CONST_INT_P (info.u.mov.value));
23489
23490 if (which == AARCH64_CHECK_MOV)
23491 {
23492 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
23493 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
23494 ? "msl" : "lsl");
23495 if (lane_count == 1)
23496 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
23497 mnemonic, UINTVAL (info.u.mov.value));
23498 else if (info.u.mov.shift)
23499 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23500 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
23501 element_char, UINTVAL (info.u.mov.value), shift_op,
23502 info.u.mov.shift);
23503 else
23504 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23505 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
23506 element_char, UINTVAL (info.u.mov.value));
23507 }
23508 else
23509 {
23510 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
23511 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
23512 if (info.u.mov.shift)
23513 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23514 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
23515 element_char, UINTVAL (info.u.mov.value), "lsl",
23516 info.u.mov.shift);
23517 else
23518 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23519 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
23520 element_char, UINTVAL (info.u.mov.value));
23521 }
23522 return templ;
23523 }
23524
23525 char*
23526 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
23527 {
23528
23529 /* If a floating point number was passed and we desire to use it in an
23530 integer mode, do the conversion to integer. */
23531 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
23532 {
23533 unsigned HOST_WIDE_INT ival;
23534 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
23535 gcc_unreachable ();
23536 immediate = gen_int_mode (ival, mode);
23537 }
23538
23539 machine_mode vmode;
23540 /* Use a 64-bit mode for everything except for DI/DF/DD mode, where we use
23541 a 128-bit vector mode. */
23542 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
23543
23544 vmode = aarch64_simd_container_mode (mode, width);
23545 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
23546 return aarch64_output_simd_mov_immediate (v_op, width);
23547 }
23548
23549 /* Return the output string to use for moving immediate CONST_VECTOR
23550 into an SVE register. */
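
/* Typical templates returned below (illustrative): "mov\t%0.s, #1" for a
   splatted integer, "index\t%0.s, #0, #1" for the linear series
   0, 1, 2, ..., and ptrue/pfalse forms for predicate constants. */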
23551
23552 char *
23553 aarch64_output_sve_mov_immediate (rtx const_vector)
23554 {
23555 static char templ[40];
23556 struct simd_immediate_info info;
23557 char element_char;
23558
23559 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
23560 gcc_assert (is_valid);
23561
23562 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23563
23564 machine_mode vec_mode = GET_MODE (const_vector);
23565 if (aarch64_sve_pred_mode_p (vec_mode))
23566 {
23567 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
23568 if (info.insn == simd_immediate_info::MOV)
23569 {
23570 gcc_assert (info.u.mov.value == const0_rtx);
23571 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
23572 }
23573 else
23574 {
23575 gcc_assert (info.insn == simd_immediate_info::PTRUE);
23576 unsigned int total_bytes;
23577 if (info.u.pattern == AARCH64_SV_ALL
23578 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
23579 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
23580 total_bytes / GET_MODE_SIZE (info.elt_mode));
23581 else
23582 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
23583 svpattern_token (info.u.pattern));
23584 }
23585 return buf;
23586 }
23587
23588 if (info.insn == simd_immediate_info::INDEX)
23589 {
23590 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
23591 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
23592 element_char, INTVAL (info.u.index.base),
23593 INTVAL (info.u.index.step));
23594 return templ;
23595 }
23596
23597 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23598 {
23599 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23600 info.u.mov.value = GEN_INT (0);
23601 else
23602 {
23603 const int buf_size = 20;
23604 char float_buf[buf_size] = {};
23605 real_to_decimal_for_mode (float_buf,
23606 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23607 buf_size, buf_size, 1, info.elt_mode);
23608
23609 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
23610 element_char, float_buf);
23611 return templ;
23612 }
23613 }
23614
23615 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
23616 element_char, INTVAL (info.u.mov.value));
23617 return templ;
23618 }
23619
23620 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
23621 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
23622 pattern. */
23623
23624 char *
23625 aarch64_output_sve_ptrues (rtx const_unspec)
23626 {
23627 static char templ[40];
23628
23629 struct simd_immediate_info info;
23630 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
23631 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
23632
23633 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23634 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
23635 svpattern_token (info.u.pattern));
23636 return templ;
23637 }
23638
23639 /* Split operands into moves from op[1] + op[2] into op[0]. */
23640
23641 void
23642 aarch64_split_combinev16qi (rtx operands[3])
23643 {
23644 unsigned int dest = REGNO (operands[0]);
23645 unsigned int src1 = REGNO (operands[1]);
23646 unsigned int src2 = REGNO (operands[2]);
23647 machine_mode halfmode = GET_MODE (operands[1]);
23648 unsigned int halfregs = REG_NREGS (operands[1]);
23649 rtx destlo, desthi;
23650
23651 gcc_assert (halfmode == V16QImode);
23652
23653 if (src1 == dest && src2 == dest + halfregs)
23654 {
23655 /* No-op move. Can't split to nothing; emit something. */
23656 emit_note (NOTE_INSN_DELETED);
23657 return;
23658 }
23659
23660 /* Preserve register attributes for variable tracking. */
23661 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
23662 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
23663 GET_MODE_SIZE (halfmode));
23664
23665 /* Special case of reversed high/low parts. */
23666 if (reg_overlap_mentioned_p (operands[2], destlo)
23667 && reg_overlap_mentioned_p (operands[1], desthi))
23668 {
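/* Swap the two source registers with the classic three-XOR trick,
   avoiding the need for a scratch register. */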
23669 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23670 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
23671 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23672 }
23673 else if (!reg_overlap_mentioned_p (operands[2], destlo))
23674 {
23675 /* Try to avoid unnecessary moves if part of the result
23676 is in the right place already. */
23677 if (src1 != dest)
23678 emit_move_insn (destlo, operands[1]);
23679 if (src2 != dest + halfregs)
23680 emit_move_insn (desthi, operands[2]);
23681 }
23682 else
23683 {
23684 if (src2 != dest + halfregs)
23685 emit_move_insn (desthi, operands[2]);
23686 if (src1 != dest)
23687 emit_move_insn (destlo, operands[1]);
23688 }
23689 }
23690
23691 /* vec_perm support. */
23692
23693 struct expand_vec_perm_d
23694 {
23695 rtx target, op0, op1;
23696 vec_perm_indices perm;
23697 machine_mode vmode;
23698 machine_mode op_mode;
23699 unsigned int vec_flags;
23700 unsigned int op_vec_flags;
23701 bool one_vector_p;
23702 bool testing_p;
23703 };
23704
23705 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
23706
23707 /* Generate a variable permutation. */
23708
23709 static void
23710 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
23711 {
23712 machine_mode vmode = GET_MODE (target);
23713 bool one_vector_p = rtx_equal_p (op0, op1);
23714
23715 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
23716 gcc_checking_assert (GET_MODE (op0) == vmode);
23717 gcc_checking_assert (GET_MODE (op1) == vmode);
23718 gcc_checking_assert (GET_MODE (sel) == vmode);
23719 gcc_checking_assert (TARGET_SIMD);
23720
23721 if (one_vector_p)
23722 {
23723 if (vmode == V8QImode)
23724 {
23725 /* Expand the argument to a V16QI mode by duplicating it. */
23726 rtx pair = gen_reg_rtx (V16QImode);
23727 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
23728 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23729 }
23730 else
23731 {
23732 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
23733 }
23734 }
23735 else
23736 {
23737 rtx pair;
23738
23739 if (vmode == V8QImode)
23740 {
23741 pair = gen_reg_rtx (V16QImode);
23742 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
23743 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23744 }
23745 else
23746 {
23747 pair = gen_reg_rtx (V2x16QImode);
23748 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
23749 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
23750 }
23751 }
23752 }
23753
23754 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
23755 NELT is the number of elements in the vector. */
23756
23757 void
23758 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
23759 unsigned int nelt)
23760 {
23761 machine_mode vmode = GET_MODE (target);
23762 bool one_vector_p = rtx_equal_p (op0, op1);
23763 rtx mask;
23764
23765 /* The TBL instruction does not use a modulo index, so we must take care
23766 of that ourselves. */
23767 mask = aarch64_simd_gen_const_vector_dup (vmode,
23768 one_vector_p ? nelt - 1 : 2 * nelt - 1);
23769 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
23770
23771 /* For big-endian, we also need to reverse the index within the vector
23772 (but not which vector). */
23773 if (BYTES_BIG_ENDIAN)
23774 {
23775 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
23776 if (!one_vector_p)
23777 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
23778 sel = expand_simple_binop (vmode, XOR, sel, mask,
23779 NULL, 0, OPTAB_LIB_WIDEN);
23780 }
23781 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
23782 }
23783
23784 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
23785
23786 static void
23787 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
23788 {
23789 emit_insn (gen_rtx_SET (target,
23790 gen_rtx_UNSPEC (GET_MODE (target),
23791 gen_rtvec (2, op0, op1), code)));
23792 }
23793
23794 /* Expand an SVE vec_perm with the given operands. */
23795
23796 void
23797 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
23798 {
23799 machine_mode data_mode = GET_MODE (target);
23800 machine_mode sel_mode = GET_MODE (sel);
23801 /* Enforced by the pattern condition. */
23802 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
23803
23804 /* Note: vec_perm indices are supposed to wrap when they go beyond the
23805 size of the two value vectors, i.e. the upper bits of the indices
23806 are effectively ignored. SVE TBL instead produces 0 for any
23807 out-of-range indices, so we need to modulo all the vec_perm indices
23808 to ensure they are all in range. */
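/* E.g. (hypothetical) with 4-element vectors and SEL = {1, 5, 2, 7}:
   TBL on OP0 gives {op0[1], 0, op0[2], 0}, TBL on OP1 with SEL minus
   the element count gives {0, op1[1], 0, op1[3]}, and ORing the two
   results produces the requested permutation. */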
23809 rtx sel_reg = force_reg (sel_mode, sel);
23810
23811 /* Check if the sel only references the first values vector. */
23812 if (CONST_VECTOR_P (sel)
23813 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
23814 {
23815 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
23816 return;
23817 }
23818
23819 /* Check if the two values vectors are the same. */
23820 if (rtx_equal_p (op0, op1))
23821 {
23822 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
23823 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23824 NULL, 0, OPTAB_DIRECT);
23825 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
23826 return;
23827 }
23828
23829 /* Run TBL for each value vector and combine the results. */
23830
23831 rtx res0 = gen_reg_rtx (data_mode);
23832 rtx res1 = gen_reg_rtx (data_mode);
23833 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
23834 if (!CONST_VECTOR_P (sel)
23835 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
23836 {
23837 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
23838 2 * nunits - 1);
23839 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23840 NULL, 0, OPTAB_DIRECT);
23841 }
23842 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
23843 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
23844 NULL, 0, OPTAB_DIRECT);
23845 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
23846 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
23847 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
23848 else
23849 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
23850 }
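/* Illustrative sketch only (not part of GCC): a standalone byte-level model
   of the two-TBL-plus-OR sequence emitted above for a two-input SVE permute.
   The element values and vector length are arbitrary example choices.  */
#if 0
#include <cassert>
#include <cstdint>
#include <vector>

/* SVE TBL semantics: out-of-range indices select zero.  */
static std::vector<uint8_t>
sve_tbl (const std::vector<uint8_t> &op, const std::vector<uint8_t> &sel)
{
  std::vector<uint8_t> res (sel.size ());
  for (size_t i = 0; i < sel.size (); ++i)
    res[i] = sel[i] < op.size () ? op[sel[i]] : 0;
  return res;
}

int
main ()
{
  const unsigned nunits = 4;
  std::vector<uint8_t> op0 = { 10, 11, 12, 13 };
  std::vector<uint8_t> op1 = { 20, 21, 22, 23 };
  /* Indices 0..3 select from op0, 4..7 select from op1.  */
  std::vector<uint8_t> sel = { 6, 1, 4, 3 };

  /* Subtracting NUNITS wraps indices that referred to op0 out of range.  */
  std::vector<uint8_t> sel_sub (nunits);
  for (unsigned i = 0; i < nunits; ++i)
    sel_sub[i] = (uint8_t) (sel[i] - nunits);

  std::vector<uint8_t> res0 = sve_tbl (op0, sel);
  std::vector<uint8_t> res1 = sve_tbl (op1, sel_sub);

  /* For each lane at least one of res0/res1 is zero, so IOR recovers the
     selected value.  */
  std::vector<uint8_t> res (nunits);
  for (unsigned i = 0; i < nunits; ++i)
    res[i] = res0[i] | res1[i];

  std::vector<uint8_t> expected = { 22, 11, 20, 13 };
  assert (res == expected);
  return 0;
}
#endif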
23851
23852 /* Recognize patterns suitable for the TRN instructions. */
23853 static bool
23854 aarch64_evpc_trn (struct expand_vec_perm_d *d)
23855 {
23856 HOST_WIDE_INT odd;
23857 poly_uint64 nelt = d->perm.length ();
23858 rtx out, in0, in1;
23859 machine_mode vmode = d->vmode;
23860
23861 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23862 return false;
23863
23864 /* Note that these are little-endian tests.
23865 We correct for big-endian later. */
23866 if (!d->perm[0].is_constant (&odd)
23867 || (odd != 0 && odd != 1)
23868 || !d->perm.series_p (0, 2, odd, 2)
23869 || !d->perm.series_p (1, 2, nelt + odd, 2))
23870 return false;
23871
23872 /* Success! */
23873 if (d->testing_p)
23874 return true;
23875
23876 in0 = d->op0;
23877 in1 = d->op1;
23878 /* We don't need a big-endian lane correction for SVE; see the comment
23879 at the head of aarch64-sve.md for details. */
23880 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23881 {
23882 std::swap (in0, in1);
23883 odd = !odd;
23884 }
23885 out = d->target;
23886
23887 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23888 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
23889 return true;
23890 }
23891
23892 /* Try to re-encode the PERM constant so it combines odd and even elements.
23893 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
23894 We retry with this new constant using the full suite of patterns. */
23895 static bool
23896 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
23897 {
23898 expand_vec_perm_d newd;
23899 unsigned HOST_WIDE_INT nelt;
23900
23901 if (d->vec_flags != VEC_ADVSIMD)
23902 return false;
23903
23904 /* Get the new mode. Always twice the size of the inner mode
23905 and half the number of elements. */
23906 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
23907 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
23908 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
23909 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
23910
23911 if (new_mode == word_mode)
23912 return false;
23913
23914 /* to_constant is safe since this routine is specific to Advanced SIMD
23915 vectors. */
23916 nelt = d->perm.length ().to_constant ();
23917
23918 vec_perm_builder newpermconst;
23919 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
23920
23921 /* Convert the perm constant if we can. Require pairs (even, even + 1). */
23922 for (unsigned int i = 0; i < nelt; i += 2)
23923 {
23924 poly_int64 elt0 = d->perm[i];
23925 poly_int64 elt1 = d->perm[i + 1];
23926 poly_int64 newelt;
23927 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
23928 return false;
23929 newpermconst.quick_push (newelt.to_constant ());
23930 }
23931 newpermconst.finalize ();
23932
23933 newd.vmode = new_mode;
23934 newd.vec_flags = VEC_ADVSIMD;
23935 newd.op_mode = newd.vmode;
23936 newd.op_vec_flags = newd.vec_flags;
23937 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
23938 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
23939 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
23940 newd.testing_p = d->testing_p;
23941 newd.one_vector_p = d->one_vector_p;
23942
23943 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
23944 return aarch64_expand_vec_perm_const_1 (&newd);
23945 }
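/* Illustrative sketch only (not part of GCC): the index re-encoding
   performed by aarch64_evpc_reencode above, where each (even, even + 1)
   pair of narrow-element indices becomes one index into a vector of
   double-width elements, e.g. {0, 1, 4, 5} -> {0, 2}.  */
#if 0
#include <cassert>
#include <optional>
#include <vector>

static std::optional<std::vector<long>>
reencode_perm (const std::vector<long> &perm)
{
  std::vector<long> wide;
  for (size_t i = 0; i + 1 < perm.size (); i += 2)
    {
      /* Require the pair to be (even, even + 1); anything else cannot be
         expressed as a permute of double-width elements.  */
      if (perm[i] % 2 != 0 || perm[i + 1] != perm[i] + 1)
        return std::nullopt;
      wide.push_back (perm[i] / 2);
    }
  return wide;
}

int
main ()
{
  assert (reencode_perm ({ 0, 1, 4, 5 }) == std::vector<long> ({ 0, 2 }));
  assert (!reencode_perm ({ 0, 2, 4, 6 }).has_value ());
  return 0;
}
#endif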
23946
23947 /* Recognize patterns suitable for the UZP instructions. */
23948 static bool
23949 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
23950 {
23951 HOST_WIDE_INT odd;
23952 rtx out, in0, in1;
23953 machine_mode vmode = d->vmode;
23954
23955 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23956 return false;
23957
23958 /* Note that these are little-endian tests.
23959 We correct for big-endian later. */
23960 if (!d->perm[0].is_constant (&odd)
23961 || (odd != 0 && odd != 1)
23962 || !d->perm.series_p (0, 1, odd, 2))
23963 return false;
23964
23965 /* Success! */
23966 if (d->testing_p)
23967 return true;
23968
23969 in0 = d->op0;
23970 in1 = d->op1;
23971 /* We don't need a big-endian lane correction for SVE; see the comment
23972 at the head of aarch64-sve.md for details. */
23973 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23974 {
23975 std::swap (in0, in1);
23976 odd = !odd;
23977 }
23978 out = d->target;
23979
23980 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23981 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
23982 return true;
23983 }
23984
23985 /* Recognize patterns suitable for the ZIP instructions. */
23986 static bool
23987 aarch64_evpc_zip (struct expand_vec_perm_d *d)
23988 {
23989 unsigned int high;
23990 poly_uint64 nelt = d->perm.length ();
23991 rtx out, in0, in1;
23992 machine_mode vmode = d->vmode;
23993
23994 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23995 return false;
23996
23997 /* Note that these are little-endian tests.
23998 We correct for big-endian later. */
23999 poly_uint64 first = d->perm[0];
24000 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
24001 || !d->perm.series_p (0, 2, first, 1)
24002 || !d->perm.series_p (1, 2, first + nelt, 1))
24003 return false;
24004 high = maybe_ne (first, 0U);
24005
24006 /* Success! */
24007 if (d->testing_p)
24008 return true;
24009
24010 in0 = d->op0;
24011 in1 = d->op1;
24012 /* We don't need a big-endian lane correction for SVE; see the comment
24013 at the head of aarch64-sve.md for details. */
24014 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
24015 {
24016 std::swap (in0, in1);
24017 high = !high;
24018 }
24019 out = d->target;
24020
24021 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
24022 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
24023 return true;
24024 }
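/* Illustrative sketch only (not part of GCC): the index patterns that the
   TRN, UZP and ZIP recognizers above accept, spelt out for a 4-element
   vector.  Indices 0..3 select from op0 and 4..7 select from op1.  */
#if 0
#include <cassert>
#include <vector>

int
main ()
{
  const unsigned nelt = 4;
  std::vector<unsigned> trn1 = { 0, 4, 2, 6 };	/* odd == 0.  */
  std::vector<unsigned> trn2 = { 1, 5, 3, 7 };	/* odd == 1.  */
  std::vector<unsigned> uzp1 = { 0, 2, 4, 6 };	/* odd == 0.  */
  std::vector<unsigned> uzp2 = { 1, 3, 5, 7 };	/* odd == 1.  */
  std::vector<unsigned> zip1 = { 0, 4, 1, 5 };	/* "high" == 0.  */
  std::vector<unsigned> zip2 = { 2, 6, 3, 7 };	/* "high" == 1.  */

  /* TRN: even lanes step by 2 from ODD, odd lanes step by 2 from
     NELT + ODD.  */
  for (unsigned i = 0; i < nelt; i += 2)
    assert (trn1[i] == i && trn1[i + 1] == nelt + i
            && trn2[i] == 1 + i && trn2[i + 1] == nelt + 1 + i);

  /* UZP: a single series starting at ODD and stepping by 2.  */
  for (unsigned i = 0; i < nelt; ++i)
    assert (uzp1[i] == 2 * i && uzp2[i] == 2 * i + 1);

  /* ZIP: interleave the low halves (ZIP1) or the high halves (ZIP2).  */
  for (unsigned i = 0; i < nelt / 2; ++i)
    assert (zip1[2 * i] == i && zip1[2 * i + 1] == nelt + i
            && zip2[2 * i] == nelt / 2 + i
            && zip2[2 * i + 1] == nelt + nelt / 2 + i);
  return 0;
}
#endif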
24025
24026 /* Recognize patterns for the EXT insn. */
24027
24028 static bool
24029 aarch64_evpc_ext (struct expand_vec_perm_d *d)
24030 {
24031 HOST_WIDE_INT location;
24032 rtx offset;
24033
24034 /* The first element always refers to the first vector.
24035 Check if the extracted indices are increasing by one. */
24036 if (d->vec_flags == VEC_SVE_PRED
24037 || !d->perm[0].is_constant (&location)
24038 || !d->perm.series_p (0, 1, location, 1))
24039 return false;
24040
24041 /* Success! */
24042 if (d->testing_p)
24043 return true;
24044
24045 /* The case where (location == 0) is a no-op for both big- and little-endian,
24046 and is removed by the mid-end at optimization levels -O1 and higher.
24047
24048 We don't need a big-endian lane correction for SVE; see the comment
24049 at the head of aarch64-sve.md for details. */
24050 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
24051 {
24052 /* After setup, we want the high elements of the first vector (stored
24053 at the LSB end of the register), and the low elements of the second
24054 vector (stored at the MSB end of the register). So swap. */
24055 std::swap (d->op0, d->op1);
24056 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
24057 to_constant () is safe since this is restricted to Advanced SIMD
24058 vectors. */
24059 location = d->perm.length ().to_constant () - location;
24060 }
24061
24062 offset = GEN_INT (location);
24063 emit_set_insn (d->target,
24064 gen_rtx_UNSPEC (d->vmode,
24065 gen_rtvec (3, d->op0, d->op1, offset),
24066 UNSPEC_EXT));
24067 return true;
24068 }
24069
24070 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
24071 within each 64-bit, 32-bit or 16-bit granule. */
24072
24073 static bool
24074 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
24075 {
24076 HOST_WIDE_INT diff;
24077 unsigned int i, size, unspec;
24078 machine_mode pred_mode;
24079
24080 if (d->vec_flags == VEC_SVE_PRED
24081 || !d->one_vector_p
24082 || !d->perm[0].is_constant (&diff)
24083 || !diff)
24084 return false;
24085
24086 if (d->vec_flags & VEC_SVE_DATA)
24087 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
24088 else
24089 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
24090 if (size == 64)
24091 {
24092 unspec = UNSPEC_REV64;
24093 pred_mode = VNx2BImode;
24094 }
24095 else if (size == 32)
24096 {
24097 unspec = UNSPEC_REV32;
24098 pred_mode = VNx4BImode;
24099 }
24100 else if (size == 16)
24101 {
24102 unspec = UNSPEC_REV16;
24103 pred_mode = VNx8BImode;
24104 }
24105 else
24106 return false;
24107
24108 unsigned int step = diff + 1;
24109 for (i = 0; i < step; ++i)
24110 if (!d->perm.series_p (i, step, diff - i, step))
24111 return false;
24112
24113 /* Success! */
24114 if (d->testing_p)
24115 return true;
24116
24117 if (d->vec_flags & VEC_SVE_DATA)
24118 {
24119 rtx pred = aarch64_ptrue_reg (pred_mode);
24120 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
24121 d->target, pred, d->op0));
24122 return true;
24123 }
24124 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
24125 emit_set_insn (d->target, src);
24126 return true;
24127 }
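/* Illustrative sketch only (not part of GCC): the permutation shape
   accepted by aarch64_evpc_rev_local above.  Taking 8-bit elements and
   DIFF == 3 (i.e. REV32), the elements inside every 4-element granule are
   reversed, which is exactly the series check used in the function.  */
#if 0
#include <cassert>
#include <vector>

int
main ()
{
  const unsigned nelt = 16, diff = 3;
  const unsigned step = diff + 1;

  /* Build the REV32-on-bytes permutation explicitly.  */
  std::vector<unsigned> perm (nelt);
  for (unsigned g = 0; g < nelt; g += step)
    for (unsigned i = 0; i < step; ++i)
      perm[g + i] = g + (diff - i);

  /* Lane I, I + STEP, I + 2 * STEP, ... starts at DIFF - I and advances
     by STEP, matching d->perm.series_p (i, step, diff - i, step).  */
  for (unsigned i = 0; i < step; ++i)
    for (unsigned j = i; j < nelt; j += step)
      assert (perm[j] == (diff - i) + (j - i));
  return 0;
}
#endif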
24128
24129 /* Recognize patterns for the REV insn, which reverses elements within
24130 a full vector. */
24131
24132 static bool
24133 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
24134 {
24135 poly_uint64 nelt = d->perm.length ();
24136
24137 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
24138 return false;
24139
24140 if (!d->perm.series_p (0, 1, nelt - 1, -1))
24141 return false;
24142
24143 /* Success! */
24144 if (d->testing_p)
24145 return true;
24146
24147 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
24148 emit_set_insn (d->target, src);
24149 return true;
24150 }
24151
24152 static bool
24153 aarch64_evpc_dup (struct expand_vec_perm_d *d)
24154 {
24155 rtx out = d->target;
24156 rtx in0;
24157 HOST_WIDE_INT elt;
24158 machine_mode vmode = d->vmode;
24159 rtx lane;
24160
24161 if (d->vec_flags == VEC_SVE_PRED
24162 || d->perm.encoding ().encoded_nelts () != 1
24163 || !d->perm[0].is_constant (&elt))
24164 return false;
24165
24166 if ((d->vec_flags & VEC_SVE_DATA)
24167 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
24168 return false;
24169
24170 /* Success! */
24171 if (d->testing_p)
24172 return true;
24173
24174 /* The generic preparation in aarch64_expand_vec_perm_const_1
24175 swaps the operand order and the permute indices if it finds
24176 d->perm[0] to be in the second operand. Thus, we can always
24177 use d->op0 and need not do any extra arithmetic to get the
24178 correct lane number. */
24179 in0 = d->op0;
24180 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
24181
24182 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
24183 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
24184 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
24185 return true;
24186 }
24187
24188 static bool
24189 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
24190 {
24191 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
24192 machine_mode vmode = d->vmode;
24193
24194 /* Make sure that the indices are constant. */
24195 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
24196 for (unsigned int i = 0; i < encoded_nelts; ++i)
24197 if (!d->perm[i].is_constant ())
24198 return false;
24199
24200 if (d->testing_p)
24201 return true;
24202
24203 /* Generic code will try constant permutation twice: once with the
24204 original mode and again with the elements lowered to QImode.
24205 So wait and don't do the selector expansion ourselves. */
24206 if (vmode != V8QImode && vmode != V16QImode)
24207 return false;
24208
24209 /* to_constant is safe since this routine is specific to Advanced SIMD
24210 vectors. */
24211 unsigned int nelt = d->perm.length ().to_constant ();
24212 for (unsigned int i = 0; i < nelt; ++i)
24213 /* If big-endian and using two vectors, we end up with a weird mixed-endian
24214 mode on NEON. Reverse the index within each word but not the word
24215 itself. to_constant is safe because we checked is_constant above. */
24216 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
24217 ? d->perm[i].to_constant () ^ (nelt - 1)
24218 : d->perm[i].to_constant ());
24219
24220 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
24221 sel = force_reg (vmode, sel);
24222
24223 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
24224 return true;
24225 }
24226
24227 /* Try to implement D using an SVE TBL instruction. */
24228
24229 static bool
24230 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
24231 {
24232 unsigned HOST_WIDE_INT nelt;
24233
24234 /* Permuting two variable-length vectors could overflow the
24235 index range. */
24236 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
24237 return false;
24238
24239 if (d->testing_p)
24240 return true;
24241
24242 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
24243 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
24244 if (d->one_vector_p)
24245 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
24246 else
24247 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
24248 return true;
24249 }
24250
24251 /* Try to implement D using SVE dup instruction. */
24252
24253 static bool
24254 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
24255 {
24256 if (BYTES_BIG_ENDIAN
24257 || !d->one_vector_p
24258 || d->vec_flags != VEC_SVE_DATA
24259 || d->op_vec_flags != VEC_ADVSIMD
24260 || d->perm.encoding ().nelts_per_pattern () != 1
24261 || !known_eq (d->perm.encoding ().npatterns (),
24262 GET_MODE_NUNITS (d->op_mode))
24263 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
24264 return false;
24265
24266 int npatterns = d->perm.encoding ().npatterns ();
24267 for (int i = 0; i < npatterns; i++)
24268 if (!known_eq (d->perm[i], i))
24269 return false;
24270
24271 if (d->testing_p)
24272 return true;
24273
24274 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
24275 return true;
24276 }
24277
24278 /* Try to implement D using SVE SEL instruction. */
24279
24280 static bool
24281 aarch64_evpc_sel (struct expand_vec_perm_d *d)
24282 {
24283 machine_mode vmode = d->vmode;
24284 int unit_size = GET_MODE_UNIT_SIZE (vmode);
24285
24286 if (d->vec_flags != VEC_SVE_DATA
24287 || unit_size > 8)
24288 return false;
24289
24290 int n_patterns = d->perm.encoding ().npatterns ();
24291 poly_int64 vec_len = d->perm.length ();
24292
24293 for (int i = 0; i < n_patterns; ++i)
24294 if (!known_eq (d->perm[i], i)
24295 && !known_eq (d->perm[i], vec_len + i))
24296 return false;
24297
24298 for (int i = n_patterns; i < n_patterns * 2; i++)
24299 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
24300 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
24301 return false;
24302
24303 if (d->testing_p)
24304 return true;
24305
24306 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
24307
24308 /* Build a predicate that is true when op0 elements should be used. */
24309 rtx_vector_builder builder (pred_mode, n_patterns, 2);
24310 for (int i = 0; i < n_patterns * 2; i++)
24311 {
24312 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
24313 : CONST0_RTX (BImode);
24314 builder.quick_push (elem);
24315 }
24316
24317 rtx const_vec = builder.build ();
24318 rtx pred = force_reg (pred_mode, const_vec);
24319 /* TARGET = PRED ? OP0 : OP1. */
24320 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
24321 return true;
24322 }
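/* Illustrative sketch only (not part of GCC): the SEL-style permute handled
   above.  When every index selects lane I from either op0 (index I) or op1
   (index LEN + I), the permute degenerates to a lane-wise select under a
   constant predicate.  Values below are arbitrary example data.  */
#if 0
#include <cassert>
#include <vector>

int
main ()
{
  const int len = 4;
  std::vector<int> perm = { 0, 5, 2, 7 };	/* op0, op1, op0, op1.  */
  std::vector<int> op0 = { 10, 11, 12, 13 };
  std::vector<int> op1 = { 20, 21, 22, 23 };

  std::vector<int> target (len);
  for (int i = 0; i < len; i++)
    {
      assert (perm[i] == i || perm[i] == len + i);
      bool use_op0 = perm[i] == i;		/* The predicate bit.  */
      target[i] = use_op0 ? op0[i] : op1[i];
    }

  std::vector<int> expected = { 10, 21, 12, 23 };
  assert (target == expected);
  return 0;
}
#endif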
24323
24324 /* Recognize patterns suitable for the INS instructions. */
24325 static bool
24326 aarch64_evpc_ins (struct expand_vec_perm_d *d)
24327 {
24328 machine_mode mode = d->vmode;
24329 unsigned HOST_WIDE_INT nelt;
24330
24331 if (d->vec_flags != VEC_ADVSIMD)
24332 return false;
24333
24334 /* to_constant is safe since this routine is specific to Advanced SIMD
24335 vectors. */
24336 nelt = d->perm.length ().to_constant ();
24337 rtx insv = d->op0;
24338
24339 HOST_WIDE_INT idx = -1;
24340
24341 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24342 {
24343 HOST_WIDE_INT elt;
24344 if (!d->perm[i].is_constant (&elt))
24345 return false;
24346 if (elt == (HOST_WIDE_INT) i)
24347 continue;
24348 if (idx != -1)
24349 {
24350 idx = -1;
24351 break;
24352 }
24353 idx = i;
24354 }
24355
24356 if (idx == -1)
24357 {
24358 insv = d->op1;
24359 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24360 {
24361 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
24362 continue;
24363 if (idx != -1)
24364 return false;
24365 idx = i;
24366 }
24367
24368 if (idx == -1)
24369 return false;
24370 }
24371
24372 if (d->testing_p)
24373 return true;
24374
24375 gcc_assert (idx != -1);
24376
24377 unsigned extractindex = d->perm[idx].to_constant ();
24378 rtx extractv = d->op0;
24379 if (extractindex >= nelt)
24380 {
24381 extractv = d->op1;
24382 extractindex -= nelt;
24383 }
24384 gcc_assert (extractindex < nelt);
24385
24386 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
24387 expand_operand ops[5];
24388 create_output_operand (&ops[0], d->target, mode);
24389 create_input_operand (&ops[1], insv, mode);
24390 create_integer_operand (&ops[2], 1 << idx);
24391 create_input_operand (&ops[3], extractv, mode);
24392 create_integer_operand (&ops[4], extractindex);
24393 expand_insn (icode, 5, ops);
24394
24395 return true;
24396 }
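/* Illustrative sketch only (not part of GCC): the shape test used by
   aarch64_evpc_ins above, reduced to the op0-based case.  The permutation
   must be the identity on one input except for a single lane, which may be
   filled from any lane of either input.  The real function also retries
   with op1 as the base when this test fails.  */
#if 0
#include <cassert>
#include <vector>

/* Return the index of the single non-identity lane, or -1 if more than one
   lane differs from the identity on op0.  */
static int
single_modified_lane (const std::vector<int> &perm)
{
  int idx = -1;
  for (int i = 0; i < (int) perm.size (); ++i)
    if (perm[i] != i)
      {
        if (idx != -1)
          return -1;
        idx = i;
      }
  return idx;
}

int
main ()
{
  /* V4SI-style example: lanes 0..3 are op0, lanes 4..7 are op1.
     {0, 5, 2, 3} is op0 with lane 1 replaced by lane 1 of op1 -> INS.  */
  assert (single_modified_lane ({ 0, 5, 2, 3 }) == 1);
  /* Two lanes differ, so a single INS cannot implement it.  */
  assert (single_modified_lane ({ 0, 5, 6, 3 }) == -1);
  return 0;
}
#endif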
24397
24398 static bool
24399 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
24400 {
24401 gcc_assert (d->op_mode != E_VOIDmode);
24402
24403 /* The pattern matching functions above are written to look for a small
24404 number to begin the sequence (0, 1, N/2). If we begin with an index
24405 from the second operand, we can swap the operands. */
24406 poly_int64 nelt = d->perm.length ();
24407 if (known_ge (d->perm[0], nelt))
24408 {
24409 d->perm.rotate_inputs (1);
24410 std::swap (d->op0, d->op1);
24411 }
24412
24413 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
24414 || d->vec_flags == VEC_SVE_DATA
24415 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
24416 || d->vec_flags == VEC_SVE_PRED)
24417 && known_gt (nelt, 1))
24418 {
24419 if (d->vmode == d->op_mode)
24420 {
24421 if (aarch64_evpc_rev_local (d))
24422 return true;
24423 else if (aarch64_evpc_rev_global (d))
24424 return true;
24425 else if (aarch64_evpc_ext (d))
24426 return true;
24427 else if (aarch64_evpc_dup (d))
24428 return true;
24429 else if (aarch64_evpc_zip (d))
24430 return true;
24431 else if (aarch64_evpc_uzp (d))
24432 return true;
24433 else if (aarch64_evpc_trn (d))
24434 return true;
24435 else if (aarch64_evpc_sel (d))
24436 return true;
24437 else if (aarch64_evpc_ins (d))
24438 return true;
24439 else if (aarch64_evpc_reencode (d))
24440 return true;
24441
24442 if (d->vec_flags == VEC_SVE_DATA)
24443 return aarch64_evpc_sve_tbl (d);
24444 else if (d->vec_flags == VEC_ADVSIMD)
24445 return aarch64_evpc_tbl (d);
24446 }
24447 else
24448 {
24449 if (aarch64_evpc_sve_dup (d))
24450 return true;
24451 }
24452 }
24453 return false;
24454 }
24455
24456 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
24457
24458 static bool
24459 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
24460 rtx target, rtx op0, rtx op1,
24461 const vec_perm_indices &sel)
24462 {
24463 struct expand_vec_perm_d d;
24464
24465 /* Check whether the mask can be applied to a single vector. */
24466 if (sel.ninputs () == 1
24467 || (op0 && rtx_equal_p (op0, op1)))
24468 d.one_vector_p = true;
24469 else if (sel.all_from_input_p (0))
24470 {
24471 d.one_vector_p = true;
24472 op1 = op0;
24473 }
24474 else if (sel.all_from_input_p (1))
24475 {
24476 d.one_vector_p = true;
24477 op0 = op1;
24478 }
24479 else
24480 d.one_vector_p = false;
24481
24482 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
24483 sel.nelts_per_input ());
24484 d.vmode = vmode;
24485 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
24486 d.op_mode = op_mode;
24487 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
24488 d.target = target;
24489 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
24490 if (op0 == op1)
24491 d.op1 = d.op0;
24492 else
24493 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
24494 d.testing_p = !target;
24495
24496 if (!d.testing_p)
24497 return aarch64_expand_vec_perm_const_1 (&d);
24498
24499 rtx_insn *last = get_last_insn ();
24500 bool ret = aarch64_expand_vec_perm_const_1 (&d);
24501 gcc_assert (last == get_last_insn ());
24502
24503 return ret;
24504 }
24505
24506 /* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */
24507
24508 bool
24509 aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
24510 tree vectype, wide_int cst,
24511 rtx *output, rtx in0, rtx in1)
24512 {
24513 if (code != TRUNC_DIV_EXPR
24514 || !TYPE_UNSIGNED (vectype))
24515 return false;
24516
24517 machine_mode mode = TYPE_MODE (vectype);
24518 unsigned int flags = aarch64_classify_vector_mode (mode);
24519 if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
24520 return false;
24521
24522 int pow = wi::exact_log2 (cst + 1);
24523 auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype));
24524 /* SVE actually has a div operator, we may have gotten here through
24525 that route. */
24526 if (pow != (int) (element_precision (vectype) / 2)
24527 || insn_code == CODE_FOR_nothing)
24528 return false;
24529
24530 /* We can use the optimized pattern. */
24531 if (in0 == NULL_RTX && in1 == NULL_RTX)
24532 return true;
24533
24534 gcc_assert (output);
24535
24536 expand_operand ops[3];
24537 create_output_operand (&ops[0], *output, mode);
24538 create_input_operand (&ops[1], in0, mode);
24539 create_fixed_operand (&ops[2], in1);
24540 expand_insn (insn_code, 3, ops);
24541 *output = ops[0].value;
24542 return true;
24543 }
24544
24545 /* Generate a byte permute mask for a register of mode MODE,
24546 which has NUNITS units. */
24547
24548 rtx
24549 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
24550 {
24551 /* We have to reverse each vector because we don't have
24552 a permuted load that can reverse-load according to ABI rules. */
24553 rtx mask;
24554 rtvec v = rtvec_alloc (16);
24555 unsigned int i, j;
24556 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
24557
24558 gcc_assert (BYTES_BIG_ENDIAN);
24559 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
24560
24561 for (i = 0; i < nunits; i++)
24562 for (j = 0; j < usize; j++)
24563 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
24564 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
24565 return force_reg (V16QImode, mask);
24566 }
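/* Illustrative sketch only (not part of GCC): the byte permute mask built
   by aarch64_reverse_mask above, shown for 8 units of 2 bytes each.  Byte J
   of unit I is taken from byte (I + 1) * USIZE - 1 - J, i.e. the bytes of
   each unit are reversed in place.  */
#if 0
#include <cassert>

int
main ()
{
  const unsigned nunits = 8, usize = 2;
  unsigned char mask[16];
  for (unsigned i = 0; i < nunits; i++)
    for (unsigned j = 0; j < usize; j++)
      mask[i * usize + j] = (i + 1) * usize - 1 - j;

  /* First unit selects bytes {1, 0}, second {3, 2}, ..., last {15, 14}.  */
  assert (mask[0] == 1 && mask[1] == 0);
  assert (mask[2] == 3 && mask[3] == 2);
  assert (mask[14] == 15 && mask[15] == 14);
  return 0;
}
#endif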
24567
24568 /* Expand an SVE integer comparison using the SVE equivalent of:
24569
24570 (set TARGET (CODE OP0 OP1)). */
24571
24572 void
24573 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
24574 {
24575 machine_mode pred_mode = GET_MODE (target);
24576 machine_mode data_mode = GET_MODE (op0);
24577 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
24578 op0, op1);
24579 if (!rtx_equal_p (target, res))
24580 emit_move_insn (target, res);
24581 }
24582
24583 /* Return the UNSPEC_COND_* code for comparison CODE. */
24584
24585 static unsigned int
24586 aarch64_unspec_cond_code (rtx_code code)
24587 {
24588 switch (code)
24589 {
24590 case NE:
24591 return UNSPEC_COND_FCMNE;
24592 case EQ:
24593 return UNSPEC_COND_FCMEQ;
24594 case LT:
24595 return UNSPEC_COND_FCMLT;
24596 case GT:
24597 return UNSPEC_COND_FCMGT;
24598 case LE:
24599 return UNSPEC_COND_FCMLE;
24600 case GE:
24601 return UNSPEC_COND_FCMGE;
24602 case UNORDERED:
24603 return UNSPEC_COND_FCMUO;
24604 default:
24605 gcc_unreachable ();
24606 }
24607 }
24608
24609 /* Emit:
24610
24611 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24612
24613 where <X> is the operation associated with comparison CODE.
24614 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24615
24616 static void
24617 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
24618 bool known_ptrue_p, rtx op0, rtx op1)
24619 {
24620 rtx flag = gen_int_mode (known_ptrue_p, SImode);
24621 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
24622 gen_rtvec (4, pred, flag, op0, op1),
24623 aarch64_unspec_cond_code (code));
24624 emit_set_insn (target, unspec);
24625 }
24626
24627 /* Emit the SVE equivalent of:
24628
24629 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
24630 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
24631 (set TARGET (ior:PRED_MODE TMP1 TMP2))
24632
24633 where <Xi> is the operation associated with comparison CODEi.
24634 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24635
24636 static void
24637 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
24638 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
24639 {
24640 machine_mode pred_mode = GET_MODE (pred);
24641 rtx tmp1 = gen_reg_rtx (pred_mode);
24642 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
24643 rtx tmp2 = gen_reg_rtx (pred_mode);
24644 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
24645 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
24646 }
24647
24648 /* Emit the SVE equivalent of:
24649
24650 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24651 (set TARGET (not TMP))
24652
24653 where <X> is the operation associated with comparison CODE.
24654 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24655
24656 static void
24657 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
24658 bool known_ptrue_p, rtx op0, rtx op1)
24659 {
24660 machine_mode pred_mode = GET_MODE (pred);
24661 rtx tmp = gen_reg_rtx (pred_mode);
24662 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
24663 aarch64_emit_unop (target, one_cmpl_optab, tmp);
24664 }
24665
24666 /* Expand an SVE floating-point comparison using the SVE equivalent of:
24667
24668 (set TARGET (CODE OP0 OP1))
24669
24670 If CAN_INVERT_P is true, the caller can also handle inverted results;
24671 return true if the result is in fact inverted. */
24672
24673 bool
24674 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
24675 rtx op0, rtx op1, bool can_invert_p)
24676 {
24677 machine_mode pred_mode = GET_MODE (target);
24678 machine_mode data_mode = GET_MODE (op0);
24679
24680 rtx ptrue = aarch64_ptrue_reg (pred_mode);
24681 switch (code)
24682 {
24683 case UNORDERED:
24684 /* UNORDERED has no immediate form. */
24685 op1 = force_reg (data_mode, op1);
24686 /* fall through */
24687 case LT:
24688 case LE:
24689 case GT:
24690 case GE:
24691 case EQ:
24692 case NE:
24693 {
24694 /* There is native support for the comparison. */
24695 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24696 return false;
24697 }
24698
24699 case LTGT:
24700 /* This is a trapping operation (LT or GT). */
24701 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
24702 return false;
24703
24704 case UNEQ:
24705 if (!flag_trapping_math)
24706 {
24707 /* This would trap for signaling NaNs. */
24708 op1 = force_reg (data_mode, op1);
24709 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
24710 ptrue, true, op0, op1);
24711 return false;
24712 }
24713 /* fall through */
24714 case UNLT:
24715 case UNLE:
24716 case UNGT:
24717 case UNGE:
24718 if (flag_trapping_math)
24719 {
24720 /* Work out which elements are ordered. */
24721 rtx ordered = gen_reg_rtx (pred_mode);
24722 op1 = force_reg (data_mode, op1);
24723 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
24724 ptrue, true, op0, op1);
24725
24726 /* Test the opposite condition for the ordered elements,
24727 then invert the result. */
24728 if (code == UNEQ)
24729 code = NE;
24730 else
24731 code = reverse_condition_maybe_unordered (code);
24732 if (can_invert_p)
24733 {
24734 aarch64_emit_sve_fp_cond (target, code,
24735 ordered, false, op0, op1);
24736 return true;
24737 }
24738 aarch64_emit_sve_invert_fp_cond (target, code,
24739 ordered, false, op0, op1);
24740 return false;
24741 }
24742 break;
24743
24744 case ORDERED:
24745 /* ORDERED has no immediate form. */
24746 op1 = force_reg (data_mode, op1);
24747 break;
24748
24749 default:
24750 gcc_unreachable ();
24751 }
24752
24753 /* There is native support for the inverse comparison. */
24754 code = reverse_condition_maybe_unordered (code);
24755 if (can_invert_p)
24756 {
24757 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24758 return true;
24759 }
24760 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
24761 return false;
24762 }
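/* Illustrative sketch only (not part of GCC): a scalar model of the
   flag_trapping_math path above for the "unordered or ..." comparisons.
   The ordered lanes are computed first, the opposite comparison is then
   applied under that predicate only (so the inactive lanes never reach it),
   and the result is inverted.  UNGE is used as the example.  */
#if 0
#include <cassert>
#include <cmath>

static bool
unge_via_inverted_lt (double x, double y)
{
  bool ordered = !std::isnan (x) && !std::isnan (y);
  /* FCMLT governed by the "ordered" predicate: inactive lanes are false.  */
  bool lt_on_ordered = ordered && x < y;
  return !lt_on_ordered;
}

int
main ()
{
  assert (unge_via_inverted_lt (2.0, 1.0));	/* 2 >= 1.  */
  assert (!unge_via_inverted_lt (1.0, 2.0));	/* !(1 >= 2).  */
  assert (unge_via_inverted_lt (NAN, 1.0));	/* Unordered, so UNGE holds.  */
  return 0;
}
#endif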
24763
24764 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
24765 of the data being selected and CMP_MODE is the mode of the values being
24766 compared. */
24767
24768 void
24769 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
24770 rtx *ops)
24771 {
24772 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
24773 rtx pred = gen_reg_rtx (pred_mode);
24774 if (FLOAT_MODE_P (cmp_mode))
24775 {
24776 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
24777 ops[4], ops[5], true))
24778 std::swap (ops[1], ops[2]);
24779 }
24780 else
24781 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
24782
24783 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
24784 ops[1] = force_reg (data_mode, ops[1]);
24785 /* The "false" value can only be zero if the "true" value is a constant. */
24786 if (register_operand (ops[1], data_mode)
24787 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
24788 ops[2] = force_reg (data_mode, ops[2]);
24789
24790 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
24791 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
24792 }
24793
24794 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
24795 true. However due to issues with register allocation it is preferable
24796 to avoid tying integer scalar and FP scalar modes. Executing integer
24797 operations in general registers is better than treating them as scalar
24798 vector operations. This reduces latency and avoids redundant int<->FP
24799 moves. So tie modes if they are either the same class, or vector modes
24800 with other vector modes, vector structs or any scalar mode. */
24801
24802 static bool
24803 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
24804 {
24805 if ((aarch64_advsimd_partial_struct_mode_p (mode1)
24806 != aarch64_advsimd_partial_struct_mode_p (mode2))
24807 && maybe_gt (GET_MODE_SIZE (mode1), 8)
24808 && maybe_gt (GET_MODE_SIZE (mode2), 8))
24809 return false;
24810
24811 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
24812 return true;
24813
24814 /* We specifically want to allow elements of "structure" modes to
24815 be tieable to the structure. This more general condition allows
24816 other rarer situations too. The reason we don't extend this to
24817 predicate modes is that there are no predicate structure modes
24818 nor any specific instructions for extracting part of a predicate
24819 register. */
24820 if (aarch64_vector_data_mode_p (mode1)
24821 && aarch64_vector_data_mode_p (mode2))
24822 return true;
24823
24824 /* Also allow any scalar modes with vectors. */
24825 if (aarch64_vector_mode_supported_p (mode1)
24826 || aarch64_vector_mode_supported_p (mode2))
24827 return true;
24828
24829 return false;
24830 }
24831
24832 /* Return a new RTX holding the result of moving POINTER forward by
24833 AMOUNT bytes. */
24834
24835 static rtx
24836 aarch64_move_pointer (rtx pointer, poly_int64 amount)
24837 {
24838 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
24839
24840 return adjust_automodify_address (pointer, GET_MODE (pointer),
24841 next, amount);
24842 }
24843
24844 /* Return a new RTX holding the result of moving POINTER forward by the
24845 size of the mode it points to. */
24846
24847 static rtx
24848 aarch64_progress_pointer (rtx pointer)
24849 {
24850 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
24851 }
24852
24853 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
24854 MODE bytes. */
24855
24856 static void
24857 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
24858 machine_mode mode)
24859 {
24860 /* Handle 256-bit memcpy separately. We do this by making two adjacent
24861 128-bit memory copies using V4SImode so that we can use Q registers. */
24862 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24863 {
24864 mode = V4SImode;
24865 rtx reg1 = gen_reg_rtx (mode);
24866 rtx reg2 = gen_reg_rtx (mode);
24867 /* "Cast" the pointers to the correct mode. */
24868 *src = adjust_address (*src, mode, 0);
24869 *dst = adjust_address (*dst, mode, 0);
24870 /* Emit the memcpy. */
24871 emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
24872 aarch64_progress_pointer (*src)));
24873 emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
24874 aarch64_progress_pointer (*dst), reg2));
24875 /* Move the pointers forward. */
24876 *src = aarch64_move_pointer (*src, 32);
24877 *dst = aarch64_move_pointer (*dst, 32);
24878 return;
24879 }
24880
24881 rtx reg = gen_reg_rtx (mode);
24882
24883 /* "Cast" the pointers to the correct mode. */
24884 *src = adjust_address (*src, mode, 0);
24885 *dst = adjust_address (*dst, mode, 0);
24886 /* Emit the memcpy. */
24887 emit_move_insn (reg, *src);
24888 emit_move_insn (*dst, reg);
24889 /* Move the pointers forward. */
24890 *src = aarch64_progress_pointer (*src);
24891 *dst = aarch64_progress_pointer (*dst);
24892 }
24893
24894 /* Expand a cpymem using the MOPS extension. OPERANDS are taken
24895 from the cpymem pattern. Return true iff we succeeded. */
24896 static bool
24897 aarch64_expand_cpymem_mops (rtx *operands)
24898 {
24899 if (!TARGET_MOPS)
24900 return false;
24901
24902 /* All three registers are changed by the instruction, so each one
24903 must be a fresh pseudo. */
24904 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24905 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
24906 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24907 rtx src_mem = replace_equiv_address (operands[1], src_addr);
24908 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
24909 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
24910
24911 return true;
24912 }
24913
24914 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
24915 we succeed, otherwise return false, indicating that a libcall to
24916 memcpy should be emitted. */
24917
24918 bool
24919 aarch64_expand_cpymem (rtx *operands)
24920 {
24921 int mode_bits;
24922 rtx dst = operands[0];
24923 rtx src = operands[1];
24924 rtx base;
24925 machine_mode cur_mode = BLKmode;
24926
24927 /* Variable-sized memcpy can go through the MOPS expansion if available. */
24928 if (!CONST_INT_P (operands[2]))
24929 return aarch64_expand_cpymem_mops (operands);
24930
24931 unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
24932
24933 /* Try to inline up to 256 bytes or use the MOPS threshold if available. */
24934 unsigned HOST_WIDE_INT max_copy_size
24935 = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
24936
24937 bool size_p = optimize_function_for_size_p (cfun);
24938
24939 /* Large constant-sized cpymem should go through MOPS when possible.
24940 It should be a win even for size optimization in the general case.
24941 For speed optimization the choice between MOPS and the SIMD sequence
24942 depends on the size of the copy, rather than number of instructions,
24943 alignment etc. */
24944 if (size > max_copy_size)
24945 return aarch64_expand_cpymem_mops (operands);
24946
24947 int copy_bits = 256;
24948
24949 /* Default to 256-bit LDP/STP on large copies; however, for small copies,
24950 lack of SIMD support, or slow 256-bit LDP/STP, fall back to 128-bit chunks. */
24951 if (size <= 24
24952 || !TARGET_SIMD
24953 || (aarch64_tune_params.extra_tuning_flags
24954 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
24955 copy_bits = 128;
24956
24957 /* Emit an inline load+store sequence and count the number of operations
24958 involved. We use a simple count of just the loads and stores emitted
24959 rather than rtx_insn count as all the pointer adjustments and reg copying
24960 in this function will get optimized away later in the pipeline. */
24961 start_sequence ();
24962 unsigned nops = 0;
24963
24964 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
24965 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
24966
24967 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
24968 src = adjust_automodify_address (src, VOIDmode, base, 0);
24969
24970 /* Convert size to bits to make the rest of the code simpler. */
24971 int n = size * BITS_PER_UNIT;
24972
24973 while (n > 0)
24974 {
24975 /* Find the largest mode in which to do the copy without reading or
24976 writing past the end. */
24977 opt_scalar_int_mode mode_iter;
24978 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
24979 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
24980 cur_mode = mode_iter.require ();
24981
24982 gcc_assert (cur_mode != BLKmode);
24983
24984 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
24985
24986 /* Prefer Q-register accesses for the last bytes. */
24987 if (mode_bits == 128 && copy_bits == 256)
24988 cur_mode = V4SImode;
24989
24990 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
24991 /* A single block copy is 1 load + 1 store. */
24992 nops += 2;
24993 n -= mode_bits;
24994
24995 /* Emit trailing copies using overlapping unaligned accesses
24996 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
24997 if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
24998 {
24999 machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
25000 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
25001 gcc_assert (n_bits <= mode_bits);
25002 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
25003 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
25004 n = n_bits;
25005 }
25006 }
25007 rtx_insn *seq = get_insns ();
25008 end_sequence ();
25009 /* The MOPS sequence requires 3 instructions for the memory copying + 1 to move
25010 the constant size into a register. */
25011 unsigned mops_cost = 3 + 1;
25012
25013 /* If MOPS is available at this point we don't consider the libcall as it's
25014 not a win even on code size. At this point only consider MOPS if
25015 optimizing for size. For speed optimizations we will have chosen between
25016 the two based on copy size already. */
25017 if (TARGET_MOPS)
25018 {
25019 if (size_p && mops_cost < nops)
25020 return aarch64_expand_cpymem_mops (operands);
25021 emit_insn (seq);
25022 return true;
25023 }
25024
25025 /* A memcpy libcall in the worst case takes 3 instructions to prepare the
25026 arguments + 1 for the call. When MOPS is not available and we're
25027 optimizing for size a libcall may be preferable. */
25028 unsigned libcall_cost = 4;
25029 if (size_p && libcall_cost < nops)
25030 return false;
25031
25032 emit_insn (seq);
25033 return true;
25034 }
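/* Illustrative sketch only (not part of GCC): how the loop above decomposes
   a constant copy size into chunks, including the overlapping trailing
   access used when !STRICT_ALIGNMENT.  Sizes are in bytes here, whereas the
   code above works in bits, and the MOPS path, cost checks and the V4SImode
   upgrade of 128-bit chunks are all ignored.  */
#if 0
#include <cassert>
#include <utility>
#include <vector>

/* Return (offset, size) pairs, one per load/store.  */
static std::vector<std::pair<int, int>>
model_copy_chunks (int size, int copy_limit, bool strict_align)
{
  static const int modes[] = { 1, 2, 4, 8, 16, 32 };
  std::vector<std::pair<int, int>> chunks;
  int offset = 0, n = size;
  while (n > 0)
    {
      /* Largest chunk that fits both the remaining size and the limit.  */
      int cur = 1;
      for (int m : modes)
        if (m <= n && m <= copy_limit)
          cur = m;
      chunks.emplace_back (offset, cur);
      offset += cur;
      n -= cur;

      /* Cover a small tail with one access that overlaps the previous
         chunk instead of emitting several smaller accesses.  */
      if (n > 0 && n < copy_limit / 2 && !strict_align)
        {
          int tail = 0;
          for (int m : modes)
            if (m >= n)
              {
                tail = m;
                break;
              }
          offset += n - tail;
          n = tail;
        }
    }
  return chunks;
}

int
main ()
{
  /* A 15-byte copy with a 16-byte chunk limit: an 8-byte access at offset 0
     followed by an overlapping 8-byte access at offset 7.  */
  auto chunks = model_copy_chunks (15, 16, false);
  assert (chunks.size () == 2);
  assert (chunks[0] == std::make_pair (0, 8));
  assert (chunks[1] == std::make_pair (7, 8));
  return 0;
}
#endif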
25035
25036 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
25037 SRC is a register we have created with the duplicated value to be set. */
25038 static void
25039 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
25040 machine_mode mode)
25041 {
25042 /* If we are copying 128 bits or 256 bits, we can do that straight from
25043 the SIMD register we prepared. */
25044 if (known_eq (GET_MODE_BITSIZE (mode), 256))
25045 {
25046 mode = GET_MODE (src);
25047 /* "Cast" the *dst to the correct mode. */
25048 *dst = adjust_address (*dst, mode, 0);
25049 /* Emit the memset. */
25050 emit_insn (aarch64_gen_store_pair (mode, *dst, src,
25051 aarch64_progress_pointer (*dst), src));
25052
25053 /* Move the pointers forward. */
25054 *dst = aarch64_move_pointer (*dst, 32);
25055 return;
25056 }
25057 if (known_eq (GET_MODE_BITSIZE (mode), 128))
25058 {
25059 /* "Cast" the *dst to the correct mode. */
25060 *dst = adjust_address (*dst, GET_MODE (src), 0);
25061 /* Emit the memset. */
25062 emit_move_insn (*dst, src);
25063 /* Move the pointers forward. */
25064 *dst = aarch64_move_pointer (*dst, 16);
25065 return;
25066 }
25067 /* For anything smaller, we have to extract the right amount from src. */
25068 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
25069
25070 /* "Cast" the *dst to the correct mode. */
25071 *dst = adjust_address (*dst, mode, 0);
25072 /* Emit the memset. */
25073 emit_move_insn (*dst, reg);
25074 /* Move the pointer forward. */
25075 *dst = aarch64_progress_pointer (*dst);
25076 }
25077
25078 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
25079 as for the setmem pattern. Return true iff we succeed. */
25080 static bool
25081 aarch64_expand_setmem_mops (rtx *operands)
25082 {
25083 if (!TARGET_MOPS)
25084 return false;
25085
25086 /* The first two registers are changed by the instruction, so both
25087 of them must be a fresh pseudo. */
25088 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
25089 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
25090 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
25091 rtx val = operands[2];
25092 if (val != CONST0_RTX (QImode))
25093 val = force_reg (QImode, val);
25094 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
25095 return true;
25096 }
25097
25098 /* Expand setmem, as if from a __builtin_memset. Return true if
25099 we succeed, otherwise return false. */
25100
25101 bool
25102 aarch64_expand_setmem (rtx *operands)
25103 {
25104 int n, mode_bits;
25105 unsigned HOST_WIDE_INT len;
25106 rtx dst = operands[0];
25107 rtx val = operands[2], src;
25108 rtx base;
25109 machine_mode cur_mode = BLKmode, next_mode;
25110
25111 /* If we don't have SIMD registers or the size is variable use the MOPS
25112 inlined sequence if possible. */
25113 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
25114 return aarch64_expand_setmem_mops (operands);
25115
25116 bool size_p = optimize_function_for_size_p (cfun);
25117
25118 /* Default the maximum to 256 bytes when considering only a libcall vs
25119 the SIMD broadcast sequence. */
25120 unsigned max_set_size = 256;
25121
25122 len = INTVAL (operands[1]);
25123 if (len > max_set_size && !TARGET_MOPS)
25124 return false;
25125
25126 int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
25127 /* The MOPS sequence takes:
25128 3 instructions for the memory storing
25129 + 1 to move the constant size into a reg
25130 + 1 if VAL is a non-zero constant to move into a reg
25131 (zero constants can use XZR directly). */
25132 unsigned mops_cost = 3 + 1 + cst_val;
25133 /* A libcall to memset in the worst case takes 3 instructions to prepare
25134 the arguments + 1 for the call. */
25135 unsigned libcall_cost = 4;
25136
25137 /* Upper bound check. For large constant-sized setmem use the MOPS sequence
25138 when available. */
25139 if (TARGET_MOPS
25140 && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
25141 return aarch64_expand_setmem_mops (operands);
25142
25143 /* Attempt a sequence with a vector broadcast followed by stores.
25144 Count the number of operations involved to see if it's worth it
25145 against the alternatives. A simple counter simd_ops on the
25146 algorithmically-relevant operations is used rather than an rtx_insn count
25147 as all the pointer adjustments and mode reinterprets will be optimized
25148 away later. */
25149 start_sequence ();
25150 unsigned simd_ops = 0;
25151
25152 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
25153 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
25154
25155 /* Prepare the val using a DUP/MOVI v0.16B, val. */
25156 src = expand_vector_broadcast (V16QImode, val);
25157 src = force_reg (V16QImode, src);
25158 simd_ops++;
25159 /* Convert len to bits to make the rest of the code simpler. */
25160 n = len * BITS_PER_UNIT;
25161
25162 /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
25163 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
25164 const int copy_limit = (aarch64_tune_params.extra_tuning_flags
25165 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
25166 ? GET_MODE_BITSIZE (TImode) : 256;
25167
25168 while (n > 0)
25169 {
25170 /* Find the largest mode in which to do the copy without
25171 writing past the end. */
25172 opt_scalar_int_mode mode_iter;
25173 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
25174 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
25175 cur_mode = mode_iter.require ();
25176
25177 gcc_assert (cur_mode != BLKmode);
25178
25179 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
25180 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
25181 simd_ops++;
25182 n -= mode_bits;
25183
25184 /* Do certain trailing copies as overlapping if that is going to be
25185 cheaper, i.e. use fewer instructions. For instance, for a 15
25186 byte copy it is more efficient to do two overlapping 8 byte copies than
25187 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */
25188 if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
25189 {
25190 next_mode = smallest_mode_for_size (n, MODE_INT);
25191 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
25192 gcc_assert (n_bits <= mode_bits);
25193 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
25194 n = n_bits;
25195 }
25196 }
25197 rtx_insn *seq = get_insns ();
25198 end_sequence ();
25199
25200 if (size_p)
25201 {
25202 /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
25203 a call to memset, or the MOPS expansion. */
25204 if (TARGET_MOPS
25205 && mops_cost <= libcall_cost
25206 && mops_cost <= simd_ops)
25207 return aarch64_expand_setmem_mops (operands);
25208 /* If MOPS is not available or is not shorter, pick a libcall if the SIMD
25209 sequence is too long. */
25210 else if (libcall_cost < simd_ops)
25211 return false;
25212 emit_insn (seq);
25213 return true;
25214 }
25215
25216 /* At this point the SIMD broadcast sequence is the best choice when
25217 optimizing for speed. */
25218 emit_insn (seq);
25219 return true;
25220 }
25221
25222
25223 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
25224 SImode stores. Handle the case when the constant has identical
25225 bottom and top halves. This is beneficial when the two stores can be
25226 merged into an STP and we avoid synthesising potentially expensive
25227 immediates twice. Return true if such a split is possible. */
25228
25229 bool
25230 aarch64_split_dimode_const_store (rtx dst, rtx src)
25231 {
25232 rtx lo = gen_lowpart (SImode, src);
25233 rtx hi = gen_highpart_mode (SImode, DImode, src);
25234
25235 bool size_p = optimize_function_for_size_p (cfun);
25236
25237 if (!rtx_equal_p (lo, hi))
25238 return false;
25239
25240 unsigned int orig_cost
25241 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
25242 unsigned int lo_cost
25243 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
25244
25245 /* We want to transform:
25246 MOV x1, 49370
25247 MOVK x1, 0x140, lsl 16
25248 MOVK x1, 0xc0da, lsl 32
25249 MOVK x1, 0x140, lsl 48
25250 STR x1, [x0]
25251 into:
25252 MOV w1, 49370
25253 MOVK w1, 0x140, lsl 16
25254 STP w1, w1, [x0]
25255 So we want to perform this only when we save two instructions
25256 or more. When optimizing for size, however, accept any code size
25257 savings we can. */
25258 if (size_p && orig_cost <= lo_cost)
25259 return false;
25260
25261 if (!size_p
25262 && (orig_cost <= lo_cost + 1))
25263 return false;
25264
25265 rtx mem_lo = adjust_address (dst, SImode, 0);
25266 if (!aarch64_mem_pair_operand (mem_lo, SImode))
25267 return false;
25268
25269 rtx tmp_reg = gen_reg_rtx (SImode);
25270 aarch64_expand_mov_immediate (tmp_reg, lo);
25271 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
25272 /* Don't emit an explicit store pair as this may not always be profitable.
25273 Let the sched-fusion logic decide whether to merge them. */
25274 emit_move_insn (mem_lo, tmp_reg);
25275 emit_move_insn (mem_hi, tmp_reg);
25276
25277 return true;
25278 }
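/* Illustrative sketch only (not part of GCC): the precondition checked by
   aarch64_split_dimode_const_store above.  The 64-bit constant must have
   identical 32-bit halves, as in the 0x0140c0da0140c0da example from the
   comment, so that a single 32-bit immediate can simply be stored twice.  */
#if 0
#include <cassert>
#include <cstdint>

static bool
halves_identical_p (uint64_t val)
{
  uint32_t lo = (uint32_t) val;
  uint32_t hi = (uint32_t) (val >> 32);
  return lo == hi;
}

int
main ()
{
  assert (halves_identical_p (0x0140c0da0140c0daULL));
  assert (!halves_identical_p (0x00000140c0da0000ULL));
  return 0;
}
#endif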
25279
25280 /* Generate RTL for a conditional branch with rtx comparison CODE in
25281 mode CC_MODE. The destination of the unlikely conditional branch
25282 is LABEL_REF. */
25283
25284 void
25285 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
25286 rtx label_ref)
25287 {
25288 rtx x;
25289 x = gen_rtx_fmt_ee (code, VOIDmode,
25290 gen_rtx_REG (cc_mode, CC_REGNUM),
25291 const0_rtx);
25292
25293 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
25294 gen_rtx_LABEL_REF (VOIDmode, label_ref),
25295 pc_rtx);
25296 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
25297 }
25298
25299 /* Generate DImode scratch registers for 128-bit (TImode) addition.
25300
25301 OP1 represents the TImode destination operand 1
25302 OP2 represents the TImode destination operand 2
25303 LOW_DEST represents the low half (DImode) of TImode operand 0
25304 LOW_IN1 represents the low half (DImode) of TImode operand 1
25305 LOW_IN2 represents the low half (DImode) of TImode operand 2
25306 HIGH_DEST represents the high half (DImode) of TImode operand 0
25307 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25308 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
25309
25310 void
25311 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25312 rtx *low_in1, rtx *low_in2,
25313 rtx *high_dest, rtx *high_in1,
25314 rtx *high_in2)
25315 {
25316 *low_dest = gen_reg_rtx (DImode);
25317 *low_in1 = gen_lowpart (DImode, op1);
25318 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25319 subreg_lowpart_offset (DImode, TImode));
25320 *high_dest = gen_reg_rtx (DImode);
25321 *high_in1 = gen_highpart (DImode, op1);
25322 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25323 subreg_highpart_offset (DImode, TImode));
25324 }
25325
25326 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
25327
25328 This function differs from 'aarch64_addti_scratch_regs' in that
25329 OP1 can be an immediate constant (zero). We must call
25330 subreg_highpart_offset with DImode and TImode arguments, otherwise
25331 VOIDmode will be used for the const_int which generates an internal
25332 error from subreg_size_highpart_offset which does not expect a size of zero.
25333
25334 OP1 represents the TImode destination operand 1
25335 OP2 represents the TImode destination operand 2
25336 LOW_DEST represents the low half (DImode) of TImode operand 0
25337 LOW_IN1 represents the low half (DImode) of TImode operand 1
25338 LOW_IN2 represents the low half (DImode) of TImode operand 2
25339 HIGH_DEST represents the high half (DImode) of TImode operand 0
25340 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25341 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
25342
25343
25344 void
25345 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25346 rtx *low_in1, rtx *low_in2,
25347 rtx *high_dest, rtx *high_in1,
25348 rtx *high_in2)
25349 {
25350 *low_dest = gen_reg_rtx (DImode);
25351 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
25352 subreg_lowpart_offset (DImode, TImode));
25353
25354 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25355 subreg_lowpart_offset (DImode, TImode));
25356 *high_dest = gen_reg_rtx (DImode);
25357
25358 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
25359 subreg_highpart_offset (DImode, TImode));
25360 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25361 subreg_highpart_offset (DImode, TImode));
25362 }
25363
25364 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
25365
25366 OP0 represents the TImode destination operand 0
25367 LOW_DEST represents the low half (DImode) of TImode operand 0
25368 LOW_IN1 represents the low half (DImode) of TImode operand 1
25369 LOW_IN2 represents the low half (DImode) of TImode operand 2
25370 HIGH_DEST represents the high half (DImode) of TImode operand 0
25371 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25372 HIGH_IN2 represents the high half (DImode) of TImode operand 2
25373 UNSIGNED_P is true if the operation is being performed on unsigned
25374 values. */
25375 void
25376 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
25377 rtx low_in2, rtx high_dest, rtx high_in1,
25378 rtx high_in2, bool unsigned_p)
25379 {
25380 if (low_in2 == const0_rtx)
25381 {
25382 low_dest = low_in1;
25383 high_in2 = force_reg (DImode, high_in2);
25384 if (unsigned_p)
25385 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
25386 else
25387 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
25388 }
25389 else
25390 {
25391 if (aarch64_plus_immediate (low_in2, DImode))
25392 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
25393 GEN_INT (-UINTVAL (low_in2))));
25394 else
25395 {
25396 low_in2 = force_reg (DImode, low_in2);
25397 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
25398 }
25399 high_in2 = force_reg (DImode, high_in2);
25400
25401 if (unsigned_p)
25402 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
25403 else
25404 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
25405 }
25406
25407 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
25408 emit_move_insn (gen_highpart (DImode, op0), high_dest);
25409
25410 }
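/* Illustrative sketch only (not part of GCC): a scalar model of the basic
   wide subtraction expanded above, ignoring the overflow-checking variants.
   The low halves are subtracted first, producing a borrow, which is then
   consumed by the high-half subtraction, as SUBS followed by SBCS do.  */
#if 0
#include <cassert>
#include <cstdint>

static void
sub128 (uint64_t a_lo, uint64_t a_hi, uint64_t b_lo, uint64_t b_hi,
        uint64_t *r_lo, uint64_t *r_hi)
{
  *r_lo = a_lo - b_lo;
  uint64_t borrow = a_lo < b_lo;	/* The AArch64 C flag is !borrow.  */
  *r_hi = a_hi - b_hi - borrow;
}

int
main ()
{
  uint64_t lo, hi;
  /* (2^64 - 1) - 2: no borrow into the high half.  */
  sub128 (~0ULL, 0, 2, 0, &lo, &hi);
  assert (lo == ~0ULL - 2 && hi == 0);
  /* 2^64 - 1: the low half wraps and borrows from the high half.  */
  sub128 (0, 1, 1, 0, &lo, &hi);
  assert (lo == ~0ULL && hi == 0);
  return 0;
}
#endif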
25411
25412 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
25413
25414 static unsigned HOST_WIDE_INT
25415 aarch64_asan_shadow_offset (void)
25416 {
25417 if (TARGET_ILP32)
25418 return (HOST_WIDE_INT_1 << 29);
25419 else
25420 return (HOST_WIDE_INT_1 << 36);
25421 }
25422
25423 static rtx
25424 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
25425 int code, tree treeop0, tree treeop1)
25426 {
25427 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25428 rtx op0, op1;
25429 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25430 insn_code icode;
25431 struct expand_operand ops[4];
25432
25433 start_sequence ();
25434 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25435
25436 op_mode = GET_MODE (op0);
25437 if (op_mode == VOIDmode)
25438 op_mode = GET_MODE (op1);
25439
25440 switch (op_mode)
25441 {
25442 case E_QImode:
25443 case E_HImode:
25444 case E_SImode:
25445 cmp_mode = SImode;
25446 icode = CODE_FOR_cmpsi;
25447 break;
25448
25449 case E_DImode:
25450 cmp_mode = DImode;
25451 icode = CODE_FOR_cmpdi;
25452 break;
25453
25454 case E_SFmode:
25455 cmp_mode = SFmode;
25456 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25457 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
25458 break;
25459
25460 case E_DFmode:
25461 cmp_mode = DFmode;
25462 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25463 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
25464 break;
25465
25466 default:
25467 end_sequence ();
25468 return NULL_RTX;
25469 }
25470
25471 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
25472 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
25473 if (!op0 || !op1)
25474 {
25475 end_sequence ();
25476 return NULL_RTX;
25477 }
25478 *prep_seq = get_insns ();
25479 end_sequence ();
25480
25481 create_fixed_operand (&ops[0], op0);
25482 create_fixed_operand (&ops[1], op1);
25483
25484 start_sequence ();
25485 if (!maybe_expand_insn (icode, 2, ops))
25486 {
25487 end_sequence ();
25488 return NULL_RTX;
25489 }
25490 *gen_seq = get_insns ();
25491 end_sequence ();
25492
25493 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
25494 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
25495 }
25496
25497 static rtx
25498 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
25499 int cmp_code, tree treeop0, tree treeop1, int bit_code)
25500 {
25501 rtx op0, op1, target;
25502 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25503 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25504 insn_code icode;
25505 struct expand_operand ops[6];
25506 int aarch64_cond;
25507
25508 push_to_sequence (*prep_seq);
25509 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25510
25511 op_mode = GET_MODE (op0);
25512 if (op_mode == VOIDmode)
25513 op_mode = GET_MODE (op1);
25514
25515 switch (op_mode)
25516 {
25517 case E_QImode:
25518 case E_HImode:
25519 case E_SImode:
25520 cmp_mode = SImode;
25521 break;
25522
25523 case E_DImode:
25524 cmp_mode = DImode;
25525 break;
25526
25527 case E_SFmode:
25528 cmp_mode = SFmode;
25529 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25530 break;
25531
25532 case E_DFmode:
25533 cmp_mode = DFmode;
25534 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25535 break;
25536
25537 default:
25538 end_sequence ();
25539 return NULL_RTX;
25540 }
25541
25542 icode = code_for_ccmp (cc_mode, cmp_mode);
25543
25544 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
25545 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
25546 if (!op0 || !op1)
25547 {
25548 end_sequence ();
25549 return NULL_RTX;
25550 }
25551 *prep_seq = get_insns ();
25552 end_sequence ();
25553
25554 target = gen_rtx_REG (cc_mode, CC_REGNUM);
25555 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
25556
25557 if (bit_code != AND)
25558 {
25559 /* Treat the ccmp patterns as canonical and use them where possible,
25560 but fall back to ccmp_rev patterns if there's no other option. */
25561 rtx_code prev_code = GET_CODE (prev);
25562 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
25563 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
25564 && !(prev_code == EQ
25565 || prev_code == NE
25566 || prev_code == ORDERED
25567 || prev_code == UNORDERED))
25568 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
25569 else
25570 {
25571 rtx_code code = reverse_condition (prev_code);
25572 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
25573 }
25574 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
25575 }
25576
25577 create_fixed_operand (&ops[0], XEXP (prev, 0));
25578 create_fixed_operand (&ops[1], target);
25579 create_fixed_operand (&ops[2], op0);
25580 create_fixed_operand (&ops[3], op1);
25581 create_fixed_operand (&ops[4], prev);
25582 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
25583
25584 push_to_sequence (*gen_seq);
25585 if (!maybe_expand_insn (icode, 6, ops))
25586 {
25587 end_sequence ();
25588 return NULL_RTX;
25589 }
25590
25591 *gen_seq = get_insns ();
25592 end_sequence ();
25593
25594 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
25595 }
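/* As an illustrative (hypothetical) example of the sequences built above:
   for a condition such as "x == 0 && y == 4", aarch64_gen_ccmp_first emits
   the initial "cmp w0, #0" and aarch64_gen_ccmp_next chains a conditional
   compare like "ccmp w1, #4, #0, eq", whose immediate NZCV value makes the
   final branch fail whenever the first comparison already failed.  The
   exact condition and NZCV bits come from aarch64_get_condition_code_1 and
   the ccmp/ccmp_rev patterns.  */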
25596
25597 #undef TARGET_GEN_CCMP_FIRST
25598 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
25599
25600 #undef TARGET_GEN_CCMP_NEXT
25601 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
25602
25603 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
25604 instruction fusion of some sort. */
25605
25606 static bool
25607 aarch64_macro_fusion_p (void)
25608 {
25609 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
25610 }
25611
25612
25613 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
25614 should be kept together during scheduling. */
25615
25616 static bool
25617 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
25618 {
25619 rtx set_dest;
25620 rtx prev_set = single_set (prev);
25621 rtx curr_set = single_set (curr);
25622 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
25623 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
25624
25625 if (!aarch64_macro_fusion_p ())
25626 return false;
25627
25628 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
25629 {
25630 /* We are trying to match:
25631 prev (mov) == (set (reg r0) (const_int imm16))
25632 curr (movk) == (set (zero_extract (reg r0)
25633 (const_int 16)
25634 (const_int 16))
25635 (const_int imm16_1)) */
25636
25637 set_dest = SET_DEST (curr_set);
25638
25639 if (GET_CODE (set_dest) == ZERO_EXTRACT
25640 && CONST_INT_P (SET_SRC (curr_set))
25641 && CONST_INT_P (SET_SRC (prev_set))
25642 && CONST_INT_P (XEXP (set_dest, 2))
25643 && INTVAL (XEXP (set_dest, 2)) == 16
25644 && REG_P (XEXP (set_dest, 0))
25645 && REG_P (SET_DEST (prev_set))
25646 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
25647 {
25648 return true;
25649 }
25650 }
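/* In assembly terms the pair matched above is, for example:
     mov  x0, #0x1234
     movk x0, #0x5678, lsl #16
   i.e. a wide constant being built up piecewise in the same register.  */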
25651
25652 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
25653 {
25654
25655 /* We're trying to match:
25656 prev (adrp) == (set (reg r1)
25657 (high (symbol_ref ("SYM"))))
25658 curr (add) == (set (reg r0)
25659 (lo_sum (reg r1)
25660 (symbol_ref ("SYM"))))
25661 Note that r0 need not necessarily be the same as r1, especially
25662 during pre-regalloc scheduling. */
25663
25664 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25665 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25666 {
25667 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
25668 && REG_P (XEXP (SET_SRC (curr_set), 0))
25669 && REGNO (XEXP (SET_SRC (curr_set), 0))
25670 == REGNO (SET_DEST (prev_set))
25671 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
25672 XEXP (SET_SRC (curr_set), 1)))
25673 return true;
25674 }
25675 }
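/* In assembly terms this corresponds to, for example:
     adrp x1, sym
     add  x0, x1, :lo12:sym
   i.e. the address of SYM being formed from its page address plus the
   low 12 bits.  */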
25676
25677 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
25678 {
25679
25680 /* We're trying to match:
25681 prev (movk) == (set (zero_extract (reg r0)
25682 (const_int 16)
25683 (const_int 32))
25684 (const_int imm16_1))
25685 curr (movk) == (set (zero_extract (reg r0)
25686 (const_int 16)
25687 (const_int 48))
25688 (const_int imm16_2)) */
25689
25690 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
25691 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
25692 && REG_P (XEXP (SET_DEST (prev_set), 0))
25693 && REG_P (XEXP (SET_DEST (curr_set), 0))
25694 && REGNO (XEXP (SET_DEST (prev_set), 0))
25695 == REGNO (XEXP (SET_DEST (curr_set), 0))
25696 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
25697 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
25698 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
25699 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
25700 && CONST_INT_P (SET_SRC (prev_set))
25701 && CONST_INT_P (SET_SRC (curr_set)))
25702 return true;
25703
25704 }
25705 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
25706 {
25707 /* We're trying to match:
25708 prev (adrp) == (set (reg r0)
25709 (high (symbol_ref ("SYM"))))
25710 curr (ldr) == (set (reg r1)
25711 (mem (lo_sum (reg r0)
25712 (symbol_ref ("SYM")))))
25713 or
25714 curr (ldr) == (set (reg r1)
25715 (zero_extend (mem
25716 (lo_sum (reg r0)
25717 (symbol_ref ("SYM")))))) */
25718 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25719 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25720 {
25721 rtx curr_src = SET_SRC (curr_set);
25722
25723 if (GET_CODE (curr_src) == ZERO_EXTEND)
25724 curr_src = XEXP (curr_src, 0);
25725
25726 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
25727 && REG_P (XEXP (XEXP (curr_src, 0), 0))
25728 && REGNO (XEXP (XEXP (curr_src, 0), 0))
25729 == REGNO (SET_DEST (prev_set))
25730 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
25731 XEXP (SET_SRC (prev_set), 0)))
25732 return true;
25733 }
25734 }
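/* In assembly terms this corresponds to, for example:
     adrp x0, sym
     ldr  w1, [x0, :lo12:sym]
   i.e. a PC-relative page address followed by a load from that page.  */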
25735
25736 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
25737 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
25738 && prev_set && curr_set && any_condjump_p (curr)
25739 && GET_CODE (SET_SRC (prev_set)) == COMPARE
25740 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
25741 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
25742 return true;
25743
25744 /* Fuse flag-setting ALU instructions and conditional branch. */
25745 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
25746 && any_condjump_p (curr))
25747 {
25748 unsigned int condreg1, condreg2;
25749 rtx cc_reg_1;
25750 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
25751 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
25752
25753 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
25754 && prev
25755 && modified_in_p (cc_reg_1, prev))
25756 {
25757 enum attr_type prev_type = get_attr_type (prev);
25758
25759 /* FIXME: this misses some instructions which are considered simple
25760 arithmetic for ThunderX. Simple shifts are missed here. */
25761 if (prev_type == TYPE_ALUS_SREG
25762 || prev_type == TYPE_ALUS_IMM
25763 || prev_type == TYPE_LOGICS_REG
25764 || prev_type == TYPE_LOGICS_IMM)
25765 return true;
25766 }
25767 }
25768
25769 /* Fuse ALU instructions and CBZ/CBNZ. */
25770 if (prev_set
25771 && curr_set
25772 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
25773 && any_condjump_p (curr))
25774 {
25775 /* We're trying to match:
25776 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
25777 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
25778 (const_int 0))
25779 (label_ref ("SYM"))
25780 (pc)) */
25781 if (SET_DEST (curr_set) == (pc_rtx)
25782 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
25783 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
25784 && REG_P (SET_DEST (prev_set))
25785 && REGNO (SET_DEST (prev_set))
25786 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
25787 {
25788 /* Fuse ALU operations followed by conditional branch instruction. */
25789 switch (get_attr_type (prev))
25790 {
25791 case TYPE_ALU_IMM:
25792 case TYPE_ALU_SREG:
25793 case TYPE_ADC_REG:
25794 case TYPE_ADC_IMM:
25795 case TYPE_ADCS_REG:
25796 case TYPE_ADCS_IMM:
25797 case TYPE_LOGIC_REG:
25798 case TYPE_LOGIC_IMM:
25799 case TYPE_CSEL:
25800 case TYPE_ADR:
25801 case TYPE_MOV_IMM:
25802 case TYPE_SHIFT_REG:
25803 case TYPE_SHIFT_IMM:
25804 case TYPE_BFM:
25805 case TYPE_RBIT:
25806 case TYPE_REV:
25807 case TYPE_EXTEND:
25808 return true;
25809
25810 default:;
25811 }
25812 }
25813 }
25814
25815 /* Fuse A+B+1 and A-B-1. */
25816 if (simple_sets_p
25817 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
25818 {
25819 /* We're trying to match:
25820 prev == (set (r0) (plus (r0) (r1)))
25821 curr == (set (r0) (plus (r0) (const_int 1)))
25822 or:
25823 prev == (set (r0) (minus (r0) (r1)))
25824 curr == (set (r0) (plus (r0) (const_int -1))) */
25825
25826 rtx prev_src = SET_SRC (prev_set);
25827 rtx curr_src = SET_SRC (curr_set);
25828
25829 int polarity = 1;
25830 if (GET_CODE (prev_src) == MINUS)
25831 polarity = -1;
25832
25833 if (GET_CODE (curr_src) == PLUS
25834 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
25835 && CONST_INT_P (XEXP (curr_src, 1))
25836 && INTVAL (XEXP (curr_src, 1)) == polarity
25837 && REG_P (XEXP (curr_src, 0))
25838 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
25839 return true;
25840 }
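/* In assembly terms this corresponds to pairs such as:
     add x0, x0, x1          sub x0, x0, x1
     add x0, x0, #1    or    sub x0, x0, #1
   which the tunings that enable AARCH64_FUSE_ADDSUB_2REG_CONST1 can treat
   as a single fused operation.  */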
25841
25842 return false;
25843 }
25844
25845 /* Return true iff the instruction fusion described by OP is enabled. */
25846
25847 bool
25848 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
25849 {
25850 return (aarch64_tune_params.fusible_ops & op) != 0;
25851 }
25852
25853 /* If MEM is in the form of [base+offset], extract the two parts of the
25854 address and store them in BASE and OFFSET; otherwise return false
25855 after clearing BASE and OFFSET. */
25856
25857 bool
25858 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
25859 {
25860 rtx addr;
25861
25862 gcc_assert (MEM_P (mem));
25863
25864 addr = XEXP (mem, 0);
25865
25866 if (REG_P (addr))
25867 {
25868 *base = addr;
25869 *offset = const0_rtx;
25870 return true;
25871 }
25872
25873 if (GET_CODE (addr) == PLUS
25874 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
25875 {
25876 *base = XEXP (addr, 0);
25877 *offset = XEXP (addr, 1);
25878 return true;
25879 }
25880
25881 *base = NULL_RTX;
25882 *offset = NULL_RTX;
25883
25884 return false;
25885 }
25886
25887 /* Types for scheduling fusion. */
25888 enum sched_fusion_type
25889 {
25890 SCHED_FUSION_NONE = 0,
25891 SCHED_FUSION_LD_SIGN_EXTEND,
25892 SCHED_FUSION_LD_ZERO_EXTEND,
25893 SCHED_FUSION_LD,
25894 SCHED_FUSION_ST,
25895 SCHED_FUSION_NUM
25896 };
25897
25898 /* If INSN is a load or store whose address is in the form [base+offset],
25899 extract the two parts into BASE and OFFSET. Return the scheduling
25900 fusion type of this INSN. */
25901
25902 static enum sched_fusion_type
25903 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
25904 {
25905 rtx x, dest, src;
25906 enum sched_fusion_type fusion = SCHED_FUSION_LD;
25907
25908 gcc_assert (INSN_P (insn));
25909 x = PATTERN (insn);
25910 if (GET_CODE (x) != SET)
25911 return SCHED_FUSION_NONE;
25912
25913 src = SET_SRC (x);
25914 dest = SET_DEST (x);
25915
25916 machine_mode dest_mode = GET_MODE (dest);
25917
25918 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
25919 return SCHED_FUSION_NONE;
25920
25921 if (GET_CODE (src) == SIGN_EXTEND)
25922 {
25923 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
25924 src = XEXP (src, 0);
25925 if (!MEM_P (src) || GET_MODE (src) != SImode)
25926 return SCHED_FUSION_NONE;
25927 }
25928 else if (GET_CODE (src) == ZERO_EXTEND)
25929 {
25930 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
25931 src = XEXP (src, 0);
25932 if (!MEM_P (src) || GET_MODE (src) != SImode)
25933 return SCHED_FUSION_NONE;
25934 }
25935
25936 if (MEM_P (src) && REG_P (dest))
25937 extract_base_offset_in_addr (src, base, offset);
25938 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
25939 {
25940 fusion = SCHED_FUSION_ST;
25941 extract_base_offset_in_addr (dest, base, offset);
25942 }
25943 else
25944 return SCHED_FUSION_NONE;
25945
25946 if (*base == NULL_RTX || *offset == NULL_RTX)
25947 fusion = SCHED_FUSION_NONE;
25948
25949 return fusion;
25950 }
25951
25952 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
25953
25954 Currently we only support fusing ldr or str instructions, so FUSION_PRI
25955 and PRI are only calculated for these instructions. For other instructions,
25956 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
25957 types of instruction fusion can be added by returning different priorities.
25958
25959 It's important that irrelevant instructions get the largest FUSION_PRI. */
25960
25961 static void
25962 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
25963 int *fusion_pri, int *pri)
25964 {
25965 int tmp, off_val;
25966 rtx base, offset;
25967 enum sched_fusion_type fusion;
25968
25969 gcc_assert (INSN_P (insn));
25970
25971 tmp = max_pri - 1;
25972 fusion = fusion_load_store (insn, &base, &offset);
25973 if (fusion == SCHED_FUSION_NONE)
25974 {
25975 *pri = tmp;
25976 *fusion_pri = tmp;
25977 return;
25978 }
25979
25980 /* Set FUSION_PRI according to fusion type and base register. */
25981 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
25982
25983 /* Calculate PRI. */
25984 tmp /= 2;
25985
25986 /* INSN with smaller offset goes first. */
25987 off_val = (int)(INTVAL (offset));
25988 if (off_val >= 0)
25989 tmp -= (off_val & 0xfffff);
25990 else
25991 tmp += ((- off_val) & 0xfffff);
25992
25993 *pri = tmp;
25994 return;
25995 }
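/* For example, two SImode loads from [x1, 8] and [x1, 12] receive the same
   FUSION_PRI (same fusion type and base register), but the load with the
   smaller offset receives the larger PRI, so it tends to be scheduled first
   and the ldp peepholes then have a chance to merge the pair.  */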
25996
25997 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
25998 Adjust priority of sha1h instructions so they are scheduled before
25999 other SHA1 instructions. */
26000
26001 static int
26002 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
26003 {
26004 rtx x = PATTERN (insn);
26005
26006 if (GET_CODE (x) == SET)
26007 {
26008 x = SET_SRC (x);
26009
26010 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
26011 return priority + 10;
26012 }
26013
26014 return priority;
26015 }
26016
26017 /* If REVERSED is null, return true if memory reference *MEM2 comes
26018 immediately after memory reference *MEM1. Do not change the references
26019 in this case.
26020
26021 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
26022 if they are, try to make them use constant offsets from the same base
26023 register. Return true on success. When returning true, set *REVERSED
26024 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
26025 static bool
26026 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
26027 {
26028 if (reversed)
26029 *reversed = false;
26030
26031 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
26032 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
26033 return false;
26034
26035 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
26036 return false;
26037
26038 auto size1 = MEM_SIZE (*mem1);
26039 auto size2 = MEM_SIZE (*mem2);
26040
26041 rtx base1, base2, offset1, offset2;
26042 extract_base_offset_in_addr (*mem1, &base1, &offset1);
26043 extract_base_offset_in_addr (*mem2, &base2, &offset2);
26044
26045 /* Make sure at least one memory is in base+offset form. */
26046 if (!(base1 && offset1) && !(base2 && offset2))
26047 return false;
26048
26049 /* If both mems already use the same base register, just check the
26050 offsets. */
26051 if (base1 && base2 && rtx_equal_p (base1, base2))
26052 {
26053 if (!offset1 || !offset2)
26054 return false;
26055
26056 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
26057 return true;
26058
26059 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
26060 {
26061 *reversed = true;
26062 return true;
26063 }
26064
26065 return false;
26066 }
26067
26068 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
26069 guarantee that the values are consecutive. */
26070 if (MEM_EXPR (*mem1)
26071 && MEM_EXPR (*mem2)
26072 && MEM_OFFSET_KNOWN_P (*mem1)
26073 && MEM_OFFSET_KNOWN_P (*mem2))
26074 {
26075 poly_int64 expr_offset1;
26076 poly_int64 expr_offset2;
26077 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
26078 &expr_offset1);
26079 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
26080 &expr_offset2);
26081 if (!expr_base1
26082 || !expr_base2
26083 || !DECL_P (expr_base1)
26084 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
26085 return false;
26086
26087 expr_offset1 += MEM_OFFSET (*mem1);
26088 expr_offset2 += MEM_OFFSET (*mem2);
26089
26090 if (known_eq (expr_offset1 + size1, expr_offset2))
26091 ;
26092 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
26093 *reversed = true;
26094 else
26095 return false;
26096
26097 if (reversed)
26098 {
26099 if (base2)
26100 {
26101 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
26102 expr_offset1 - expr_offset2);
26103 *mem1 = replace_equiv_address_nv (*mem1, addr1);
26104 }
26105 else
26106 {
26107 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
26108 expr_offset2 - expr_offset1);
26109 *mem2 = replace_equiv_address_nv (*mem2, addr2);
26110 }
26111 }
26112 return true;
26113 }
26114
26115 return false;
26116 }
26117
26118 /* Return true if MEM1 and MEM2 can be combined into a single access
26119 of mode MODE, with the combined access having the same address as MEM1. */
26120
26121 bool
26122 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
26123 {
26124 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
26125 return false;
26126 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
26127 }
26128
26129 /* Given OPERANDS of consecutive load/store, check if we can merge
26130 them into ldp/stp. LOAD is true if they are load instructions.
26131 MODE is the mode of memory operands. */
26132
26133 bool
26134 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
26135 machine_mode mode)
26136 {
26137 enum reg_class rclass_1, rclass_2;
26138 rtx mem_1, mem_2, reg_1, reg_2;
26139
26140 if (load)
26141 {
26142 mem_1 = operands[1];
26143 mem_2 = operands[3];
26144 reg_1 = operands[0];
26145 reg_2 = operands[2];
26146 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
26147 if (REGNO (reg_1) == REGNO (reg_2))
26148 return false;
26149 if (reg_overlap_mentioned_p (reg_1, mem_2))
26150 return false;
26151 }
26152 else
26153 {
26154 mem_1 = operands[0];
26155 mem_2 = operands[2];
26156 reg_1 = operands[1];
26157 reg_2 = operands[3];
26158 }
26159
26160 /* The mems cannot be volatile. */
26161 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
26162 return false;
26163
26164 /* If we have SImode and slow unaligned ldp,
26165 check that the alignment is at least 8 bytes. */
26166 if (mode == SImode
26167 && (aarch64_tune_params.extra_tuning_flags
26168 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26169 && !optimize_size
26170 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
26171 return false;
26172
26173 /* Check if the addresses are in the form of [base+offset]. */
26174 bool reversed = false;
26175 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
26176 return false;
26177
26178 /* The operands must be of the same size. */
26179 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
26180 GET_MODE_SIZE (GET_MODE (mem_2))));
26181
26182 /* One of the memory accesses must be a mempair operand.
26183 If it is not the first one, they need to be swapped by the
26184 peephole. */
26185 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
26186 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
26187 return false;
26188
26189 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
26190 rclass_1 = FP_REGS;
26191 else
26192 rclass_1 = GENERAL_REGS;
26193
26194 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
26195 rclass_2 = FP_REGS;
26196 else
26197 rclass_2 = GENERAL_REGS;
26198
26199 /* Check if the registers are of same class. */
26200 if (rclass_1 != rclass_2)
26201 return false;
26202
26203 return true;
26204 }
26205
26206 /* Given OPERANDS of consecutive load/store that can be merged,
26207 swap them if they are not in ascending order. */
26208 void
26209 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
26210 {
26211 int mem_op = load ? 1 : 0;
26212 bool reversed = false;
26213 if (!aarch64_check_consecutive_mems (operands + mem_op,
26214 operands + mem_op + 2, &reversed))
26215 gcc_unreachable ();
26216
26217 if (reversed)
26218 {
26219 /* Irrespective of whether this is a load or a store,
26220 we do the same swap. */
26221 std::swap (operands[0], operands[2]);
26222 std::swap (operands[1], operands[3]);
26223 }
26224 }
26225
26226 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
26227 comparison between the two. */
26228 int
26229 aarch64_host_wide_int_compare (const void *x, const void *y)
26230 {
26231 return wi::cmps (* ((const HOST_WIDE_INT *) x),
26232 * ((const HOST_WIDE_INT *) y));
26233 }
26234
26235 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
26236 other pointing to a REG rtx containing an offset, compare the offsets
26237 of the two pairs.
26238
26239 Return:
26240
26241 1 iff offset (X) > offset (Y)
26242 0 iff offset (X) == offset (Y)
26243 -1 iff offset (X) < offset (Y) */
26244 int
26245 aarch64_ldrstr_offset_compare (const void *x, const void *y)
26246 {
26247 const rtx * operands_1 = (const rtx *) x;
26248 const rtx * operands_2 = (const rtx *) y;
26249 rtx mem_1, mem_2, base, offset_1, offset_2;
26250
26251 if (MEM_P (operands_1[0]))
26252 mem_1 = operands_1[0];
26253 else
26254 mem_1 = operands_1[1];
26255
26256 if (MEM_P (operands_2[0]))
26257 mem_2 = operands_2[0];
26258 else
26259 mem_2 = operands_2[1];
26260
26261 /* Extract the offsets. */
26262 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26263 extract_base_offset_in_addr (mem_2, &base, &offset_2);
26264
26265 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
26266
26267 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
26268 }
26269
26270 /* Given OPERANDS of consecutive load/store, check if we can merge
26271 them into ldp/stp by adjusting the offset. LOAD is true if they
26272 are load instructions. MODE is the mode of memory operands.
26273
26274 Given the following consecutive stores:
26275
26276 str w1, [xb, 0x100]
26277 str w1, [xb, 0x104]
26278 str w1, [xb, 0x108]
26279 str w1, [xb, 0x10c]
26280
26281 Though the offsets are out of the range supported by stp, we can
26282 still pair them after adjusting the offset, like:
26283
26284 add scratch, xb, 0x100
26285 stp w1, w1, [scratch]
26286 stp w1, w1, [scratch, 0x8]
26287
26288 The peephole patterns detecting this opportunity should guarantee
26289 the scratch register is available. */
26290
26291 bool
26292 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
26293 machine_mode mode)
26294 {
26295 const int num_insns = 4;
26296 enum reg_class rclass;
26297 HOST_WIDE_INT offvals[num_insns], msize;
26298 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
26299
26300 if (load)
26301 {
26302 for (int i = 0; i < num_insns; i++)
26303 {
26304 reg[i] = operands[2 * i];
26305 mem[i] = operands[2 * i + 1];
26306
26307 gcc_assert (REG_P (reg[i]));
26308 }
26309
26310 /* Do not attempt to merge the loads if the loads clobber each other. */
26311 for (int i = 0; i < 8; i += 2)
26312 for (int j = i + 2; j < 8; j += 2)
26313 if (reg_overlap_mentioned_p (operands[i], operands[j]))
26314 return false;
26315 }
26316 else
26317 for (int i = 0; i < num_insns; i++)
26318 {
26319 mem[i] = operands[2 * i];
26320 reg[i] = operands[2 * i + 1];
26321 }
26322
26323 /* Skip if memory operand is by itself valid for ldp/stp. */
26324 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
26325 return false;
26326
26327 for (int i = 0; i < num_insns; i++)
26328 {
26329 /* The mems cannot be volatile. */
26330 if (MEM_VOLATILE_P (mem[i]))
26331 return false;
26332
26333 /* Check if the addresses are in the form of [base+offset]. */
26334 extract_base_offset_in_addr (mem[i], base + i, offset + i);
26335 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
26336 return false;
26337 }
26338
26339 /* Check if the registers are of same class. */
26340 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
26341 ? FP_REGS : GENERAL_REGS;
26342
26343 for (int i = 1; i < num_insns; i++)
26344 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
26345 {
26346 if (rclass != FP_REGS)
26347 return false;
26348 }
26349 else
26350 {
26351 if (rclass != GENERAL_REGS)
26352 return false;
26353 }
26354
26355 /* Only the last register in the order in which they occur
26356 may be clobbered by the load. */
26357 if (rclass == GENERAL_REGS && load)
26358 for (int i = 0; i < num_insns - 1; i++)
26359 if (reg_mentioned_p (reg[i], mem[i]))
26360 return false;
26361
26362 /* Check if the bases are same. */
26363 for (int i = 0; i < num_insns - 1; i++)
26364 if (!rtx_equal_p (base[i], base[i + 1]))
26365 return false;
26366
26367 for (int i = 0; i < num_insns; i++)
26368 offvals[i] = INTVAL (offset[i]);
26369
26370 msize = GET_MODE_SIZE (mode).to_constant ();
26371
26372 /* Check if the offsets can be put in the right order to do a ldp/stp. */
26373 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
26374 aarch64_host_wide_int_compare);
26375
26376 if (!(offvals[1] == offvals[0] + msize
26377 && offvals[3] == offvals[2] + msize))
26378 return false;
26379
26380 /* Check that offsets are within range of each other. The ldp/stp
26381 instructions have 7-bit immediate offsets, so use 0x80. */
26382 if (offvals[2] - offvals[0] >= msize * 0x80)
26383 return false;
26384
26385 /* The offsets must be aligned with respect to each other. */
26386 if (offvals[0] % msize != offvals[2] % msize)
26387 return false;
26388
26389 /* If we have SImode and slow unaligned ldp,
26390 check that the alignment is at least 8 bytes. */
26391 if (mode == SImode
26392 && (aarch64_tune_params.extra_tuning_flags
26393 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26394 && !optimize_size
26395 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
26396 return false;
26397
26398 return true;
26399 }
26400
26401 /* Given OPERANDS of consecutive load/store, this function pairs them
26402 into LDP/STP after adjusting the offset. It depends on the fact
26403 that the operands can be sorted so the offsets are correct for STP.
26404 MODE is the mode of memory operands. CODE is the rtl operator
26405 which should be applied to all memory operands; it is SIGN_EXTEND,
26406 ZERO_EXTEND or UNKNOWN. */
26407
26408 bool
26409 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
26410 machine_mode mode, RTX_CODE code)
26411 {
26412 rtx base, offset_1, offset_3, t1, t2;
26413 rtx mem_1, mem_2, mem_3, mem_4;
26414 rtx temp_operands[8];
26415 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
26416 stp_off_upper_limit, stp_off_lower_limit, msize;
26417
26418 /* We make changes on a copy as we may still bail out. */
26419 for (int i = 0; i < 8; i ++)
26420 temp_operands[i] = operands[i];
26421
26422 /* Sort the operands. Note that for cases such as:
26423 [base + 0x310] = A
26424 [base + 0x320] = B
26425 [base + 0x330] = C
26426 [base + 0x320] = D
26427 we need a stable sort, otherwise wrong data may be stored to offset 0x320.
26428 Also note that the dead store in the above case should be optimized away,
26429 but there is no guarantee of that here. */
26430 gcc_stablesort (temp_operands, 4, 2 * sizeof (rtx *),
26431 aarch64_ldrstr_offset_compare);
26432
26433 /* Copy the memory operands so that if we have to bail for some
26434 reason the original addresses are unchanged. */
26435 if (load)
26436 {
26437 mem_1 = copy_rtx (temp_operands[1]);
26438 mem_2 = copy_rtx (temp_operands[3]);
26439 mem_3 = copy_rtx (temp_operands[5]);
26440 mem_4 = copy_rtx (temp_operands[7]);
26441 }
26442 else
26443 {
26444 mem_1 = copy_rtx (temp_operands[0]);
26445 mem_2 = copy_rtx (temp_operands[2]);
26446 mem_3 = copy_rtx (temp_operands[4]);
26447 mem_4 = copy_rtx (temp_operands[6]);
26448 gcc_assert (code == UNKNOWN);
26449 }
26450
26451 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26452 extract_base_offset_in_addr (mem_3, &base, &offset_3);
26453 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
26454 && offset_3 != NULL_RTX);
26455
26456 /* Adjust offset so it can fit in LDP/STP instruction. */
26457 msize = GET_MODE_SIZE (mode).to_constant ();
26458 stp_off_upper_limit = msize * (0x40 - 1);
26459 stp_off_lower_limit = - msize * 0x40;
26460
26461 off_val_1 = INTVAL (offset_1);
26462 off_val_3 = INTVAL (offset_3);
26463
26464 /* The base offset is optimally half way between the two STP/LDP offsets. */
26465 if (msize <= 4)
26466 base_off = (off_val_1 + off_val_3) / 2;
26467 else
26468 /* However, due to issues with negative LDP/STP offset generation for
26469 larger modes (DF, DD, DI and vector modes), we must not use offsets more
26470 negative than 9 signed unadjusted bits can store. This
26471 provides the most range in this case. */
26472 base_off = off_val_1;
26473
26474 /* Adjust the base so that it is aligned with the addresses but still
26475 optimal. */
26476 if (base_off % msize != off_val_1 % msize)
26477 /* Fix the offset, bearing in mind we want to make it bigger not
26478 smaller. */
26479 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26480 else if (msize <= 4)
26481 /* The negative range of LDP/STP is one larger than the positive range. */
26482 base_off += msize;
26483
26484 /* Check if base offset is too big or too small. We can attempt to resolve
26485 this issue by setting it to the maximum value and seeing if the offsets
26486 still fit. */
26487 if (base_off >= 0x1000)
26488 {
26489 base_off = 0x1000 - 1;
26490 /* We must still make sure that the base offset is aligned with respect
26491 to the address. But it may not be made any bigger. */
26492 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26493 }
26494
26495 /* Likewise for the case where the base is too small. */
26496 if (base_off <= -0x1000)
26497 {
26498 base_off = -0x1000 + 1;
26499 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26500 }
26501
26502 /* Offset of the first STP/LDP. */
26503 new_off_1 = off_val_1 - base_off;
26504
26505 /* Offset of the second STP/LDP. */
26506 new_off_3 = off_val_3 - base_off;
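/* Worked example for SImode (msize == 4) accesses at offsets 0x100..0x10c:
   base_off starts as (0x100 + 0x108) / 2 == 0x104, is already suitably
   aligned and so is bumped by msize to 0x108, giving new_off_1 == -8 and
   new_off_3 == 0, both comfortably within [-0x100, 0xfc].  */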
26507
26508 /* The offsets must be within the range of the LDP/STP instructions. */
26509 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
26510 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
26511 return false;
26512
26513 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
26514 new_off_1), true);
26515 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
26516 new_off_1 + msize), true);
26517 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
26518 new_off_3), true);
26519 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
26520 new_off_3 + msize), true);
26521
26522 if (!aarch64_mem_pair_operand (mem_1, mode)
26523 || !aarch64_mem_pair_operand (mem_3, mode))
26524 return false;
26525
26526 if (code == ZERO_EXTEND)
26527 {
26528 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
26529 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
26530 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
26531 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
26532 }
26533 else if (code == SIGN_EXTEND)
26534 {
26535 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
26536 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
26537 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
26538 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
26539 }
26540
26541 if (load)
26542 {
26543 operands[0] = temp_operands[0];
26544 operands[1] = mem_1;
26545 operands[2] = temp_operands[2];
26546 operands[3] = mem_2;
26547 operands[4] = temp_operands[4];
26548 operands[5] = mem_3;
26549 operands[6] = temp_operands[6];
26550 operands[7] = mem_4;
26551 }
26552 else
26553 {
26554 operands[0] = mem_1;
26555 operands[1] = temp_operands[1];
26556 operands[2] = mem_2;
26557 operands[3] = temp_operands[3];
26558 operands[4] = mem_3;
26559 operands[5] = temp_operands[5];
26560 operands[6] = mem_4;
26561 operands[7] = temp_operands[7];
26562 }
26563
26564 /* Emit adjusting instruction. */
26565 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
26566 /* Emit ldp/stp instructions. */
26567 t1 = gen_rtx_SET (operands[0], operands[1]);
26568 t2 = gen_rtx_SET (operands[2], operands[3]);
26569 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26570 t1 = gen_rtx_SET (operands[4], operands[5]);
26571 t2 = gen_rtx_SET (operands[6], operands[7]);
26572 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26573 return true;
26574 }
26575
26576 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
26577 it isn't worth branching around empty masked ops (including masked
26578 stores). */
26579
26580 static bool
26581 aarch64_empty_mask_is_expensive (unsigned)
26582 {
26583 return false;
26584 }
26585
26586 /* Return true if a pseudo register should be created and used to hold
26587 the GOT address for PIC code. */
26588
26589 bool
26590 aarch64_use_pseudo_pic_reg (void)
26591 {
26592 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
26593 }
26594
26595 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
26596
26597 static int
26598 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
26599 {
26600 switch (XINT (x, 1))
26601 {
26602 case UNSPEC_GOTSMALLPIC:
26603 case UNSPEC_GOTSMALLPIC28K:
26604 case UNSPEC_GOTTINYPIC:
26605 return 0;
26606 default:
26607 break;
26608 }
26609
26610 return default_unspec_may_trap_p (x, flags);
26611 }
26612
26613
26614 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
26615 return the log2 of that value. Otherwise return -1. */
26616
26617 int
26618 aarch64_fpconst_pow_of_2 (rtx x)
26619 {
26620 const REAL_VALUE_TYPE *r;
26621
26622 if (!CONST_DOUBLE_P (x))
26623 return -1;
26624
26625 r = CONST_DOUBLE_REAL_VALUE (x);
26626
26627 if (REAL_VALUE_NEGATIVE (*r)
26628 || REAL_VALUE_ISNAN (*r)
26629 || REAL_VALUE_ISINF (*r)
26630 || !real_isinteger (r, DFmode))
26631 return -1;
26632
26633 return exact_log2 (real_to_integer (r));
26634 }
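/* For example, 8.0 yields 3 and 1.0 yields 0, while 0.75, -4.0, NaN and
   infinities all yield -1.  */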
26635
26636 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
26637 power of 2 (i.e. 1/2^n), return the exponent n; e.g. for X == 1/2^3
26638 return 3. Otherwise return -1. */
26639
26640 int
26641 aarch64_fpconst_pow2_recip (rtx x)
26642 {
26643 REAL_VALUE_TYPE r0;
26644
26645 if (!CONST_DOUBLE_P (x))
26646 return -1;
26647
26648 r0 = *CONST_DOUBLE_REAL_VALUE (x);
26649 if (exact_real_inverse (DFmode, &r0)
26650 && !REAL_VALUE_NEGATIVE (r0))
26651 {
26652 int ret = exact_log2 (real_to_integer (&r0));
26653 if (ret >= 1 && ret <= 32)
26654 return ret;
26655 }
26656 return -1;
26657 }
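/* For example, 0.125 (== 1/2^3) yields 3, whereas 8.0 and 1.0 yield -1
   since they are not of the form 1/2^n with 1 <= n <= 32.  */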
26658
26659 /* If X is a vector of equal CONST_DOUBLE values and that value is
26660 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
26661
26662 int
26663 aarch64_vec_fpconst_pow_of_2 (rtx x)
26664 {
26665 int nelts;
26666 if (!CONST_VECTOR_P (x)
26667 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
26668 return -1;
26669
26670 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
26671 return -1;
26672
26673 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
26674 if (firstval <= 0)
26675 return -1;
26676
26677 for (int i = 1; i < nelts; i++)
26678 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
26679 return -1;
26680
26681 return firstval;
26682 }
26683
26684 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
26685 to float.
26686
26687 __fp16 always promotes through this hook.
26688 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
26689 through the generic excess precision logic rather than here. */
26690
26691 static tree
26692 aarch64_promoted_type (const_tree t)
26693 {
26694 if (SCALAR_FLOAT_TYPE_P (t)
26695 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
26696 return float_type_node;
26697
26698 return NULL_TREE;
26699 }
26700
26701 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
26702
26703 static bool
26704 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
26705 optimization_type opt_type)
26706 {
26707 switch (op)
26708 {
26709 case rsqrt_optab:
26710 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
26711
26712 default:
26713 return true;
26714 }
26715 }
26716
26717 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
26718
26719 static unsigned int
26720 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
26721 int *offset)
26722 {
26723 /* Polynomial invariant 1 == (VG / 2) - 1. */
26724 gcc_assert (i == 1);
26725 *factor = 2;
26726 *offset = 1;
26727 return AARCH64_DWARF_VG;
26728 }
26729
26730 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
26731 if MODE is HFmode, and punt to the generic implementation otherwise. */
26732
26733 static bool
26734 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
26735 {
26736 return (mode == HFmode
26737 ? true
26738 : default_libgcc_floating_mode_supported_p (mode));
26739 }
26740
26741 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
26742 if MODE is HFmode, and punt to the generic implementation otherwise. */
26743
26744 static bool
26745 aarch64_scalar_mode_supported_p (scalar_mode mode)
26746 {
26747 if (DECIMAL_FLOAT_MODE_P (mode))
26748 return default_decimal_float_supported_p ();
26749
26750 return (mode == HFmode
26751 ? true
26752 : default_scalar_mode_supported_p (mode));
26753 }
26754
26755 /* Set the value of FLT_EVAL_METHOD.
26756 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
26757
26758 0: evaluate all operations and constants, whose semantic type has at
26759 most the range and precision of type float, to the range and
26760 precision of float; evaluate all other operations and constants to
26761 the range and precision of the semantic type;
26762
26763 N, where _FloatN is a supported interchange floating type
26764 evaluate all operations and constants, whose semantic type has at
26765 most the range and precision of _FloatN type, to the range and
26766 precision of the _FloatN type; evaluate all other operations and
26767 constants to the range and precision of the semantic type;
26768
26769 If we have the ARMv8.2-A extensions then we support _Float16 in native
26770 precision, so we should set this to 16. Otherwise, we support the type,
26771 but want to evaluate expressions in float precision, so set this to
26772 0. */
26773
26774 static enum flt_eval_method
26775 aarch64_excess_precision (enum excess_precision_type type)
26776 {
26777 switch (type)
26778 {
26779 case EXCESS_PRECISION_TYPE_FAST:
26780 case EXCESS_PRECISION_TYPE_STANDARD:
26781 /* We can calculate either in 16-bit range and precision or
26782 32-bit range and precision. Make that decision based on whether
26783 we have native support for the ARMv8.2-A 16-bit floating-point
26784 instructions or not. */
26785 return (TARGET_FP_F16INST
26786 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
26787 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
26788 case EXCESS_PRECISION_TYPE_IMPLICIT:
26789 case EXCESS_PRECISION_TYPE_FLOAT16:
26790 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
26791 default:
26792 gcc_unreachable ();
26793 }
26794 return FLT_EVAL_METHOD_UNPREDICTABLE;
26795 }
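/* As an illustration of the effect: given "_Float16 a, b, c; a = b + c;",
   a value of 16 here lets the addition be performed directly with the
   Armv8.2-A FP16 instructions, whereas a value of 0 means b and c are
   promoted to float, added, and the result converted back to _Float16 on
   assignment.  */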
26796
26797 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
26798 scheduled for speculative execution. Reject the long-running division
26799 and square-root instructions. */
26800
26801 static bool
26802 aarch64_sched_can_speculate_insn (rtx_insn *insn)
26803 {
26804 switch (get_attr_type (insn))
26805 {
26806 case TYPE_SDIV:
26807 case TYPE_UDIV:
26808 case TYPE_FDIVS:
26809 case TYPE_FDIVD:
26810 case TYPE_FSQRTS:
26811 case TYPE_FSQRTD:
26812 case TYPE_NEON_FP_SQRT_S:
26813 case TYPE_NEON_FP_SQRT_D:
26814 case TYPE_NEON_FP_SQRT_S_Q:
26815 case TYPE_NEON_FP_SQRT_D_Q:
26816 case TYPE_NEON_FP_DIV_S:
26817 case TYPE_NEON_FP_DIV_D:
26818 case TYPE_NEON_FP_DIV_S_Q:
26819 case TYPE_NEON_FP_DIV_D_Q:
26820 return false;
26821 default:
26822 return true;
26823 }
26824 }
26825
26826 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
26827
26828 static int
26829 aarch64_compute_pressure_classes (reg_class *classes)
26830 {
26831 int i = 0;
26832 classes[i++] = GENERAL_REGS;
26833 classes[i++] = FP_REGS;
26834 /* PR_REGS isn't a useful pressure class because many predicate pseudo
26835 registers need to go in PR_LO_REGS at some point during their
26836 lifetime. Splitting it into two halves has the effect of making
26837 all predicates count against PR_LO_REGS, so that we try whenever
26838 possible to restrict the number of live predicates to 8. This
26839 greatly reduces the amount of spilling in certain loops. */
26840 classes[i++] = PR_LO_REGS;
26841 classes[i++] = PR_HI_REGS;
26842 return i;
26843 }
26844
26845 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
26846
26847 static bool
26848 aarch64_can_change_mode_class (machine_mode from,
26849 machine_mode to, reg_class_t)
26850 {
26851 unsigned int from_flags = aarch64_classify_vector_mode (from);
26852 unsigned int to_flags = aarch64_classify_vector_mode (to);
26853
26854 bool from_sve_p = (from_flags & VEC_ANY_SVE);
26855 bool to_sve_p = (to_flags & VEC_ANY_SVE);
26856
26857 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
26858 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
26859
26860 bool from_pred_p = (from_flags & VEC_SVE_PRED);
26861 bool to_pred_p = (to_flags & VEC_SVE_PRED);
26862
26863 bool to_partial_advsimd_struct_p = (to_flags == (VEC_ADVSIMD | VEC_STRUCT
26864 | VEC_PARTIAL));
26865 bool from_partial_advsimd_struct_p = (from_flags == (VEC_ADVSIMD | VEC_STRUCT
26866 | VEC_PARTIAL));
26867
26868 /* Don't allow changes between predicate modes and other modes.
26869 Only predicate registers can hold predicate modes and only
26870 non-predicate registers can hold non-predicate modes, so any
26871 attempt to mix them would require a round trip through memory. */
26872 if (from_pred_p != to_pred_p)
26873 return false;
26874
26875 /* Don't allow changes between partial SVE modes and other modes.
26876 The contents of partial SVE modes are distributed evenly across
26877 the register, whereas GCC expects them to be clustered together. */
26878 if (from_partial_sve_p != to_partial_sve_p)
26879 return false;
26880
26881 /* Similarly reject changes between partial SVE modes that have
26882 different patterns of significant and insignificant bits. */
26883 if (from_partial_sve_p
26884 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
26885 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
26886 return false;
26887
26888 /* Don't allow changes between partial Advanced SIMD structure modes and
26889 other modes unless both modes are 64 bits or smaller. */
26890 if ((to_partial_advsimd_struct_p ^ from_partial_advsimd_struct_p)
26891 && (known_gt (GET_MODE_SIZE (to), 8) || known_gt (GET_MODE_SIZE (from), 8)))
26892 return false;
26893
26894 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26895 {
26896 /* Don't allow changes between SVE modes and other modes that might
26897 be bigger than 128 bits. In particular, OImode, CImode and XImode
26898 divide into 128-bit quantities while SVE modes divide into
26899 BITS_PER_SVE_VECTOR quantities. */
26900 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
26901 return false;
26902 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
26903 return false;
26904 }
26905
26906 if (BYTES_BIG_ENDIAN)
26907 {
26908 /* Don't allow changes between SVE data modes and non-SVE modes.
26909 See the comment at the head of aarch64-sve.md for details. */
26910 if (from_sve_p != to_sve_p)
26911 return false;
26912
26913 /* Don't allow changes in element size: lane 0 of the new vector
26914 would not then be lane 0 of the old vector. See the comment
26915 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26916 description.
26917
26918 In the worst case, this forces a register to be spilled in
26919 one mode and reloaded in the other, which handles the
26920 endianness correctly. */
26921 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
26922 return false;
26923 }
26924 return true;
26925 }
26926
26927 /* Implement TARGET_EARLY_REMAT_MODES. */
26928
26929 static void
26930 aarch64_select_early_remat_modes (sbitmap modes)
26931 {
26932 /* SVE values are not normally live across a call, so it should be
26933 worth doing early rematerialization even in VL-specific mode. */
26934 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
26935 if (aarch64_sve_mode_p ((machine_mode) i))
26936 bitmap_set_bit (modes, i);
26937 }
26938
26939 /* Override the default target speculation_safe_value. */
26940 static rtx
26941 aarch64_speculation_safe_value (machine_mode mode,
26942 rtx result, rtx val, rtx failval)
26943 {
26944 /* Maybe we should warn if falling back to hard barriers. They are
26945 likely to be noticeably more expensive than the alternative below. */
26946 if (!aarch64_track_speculation)
26947 return default_speculation_safe_value (mode, result, val, failval);
26948
26949 if (!REG_P (val))
26950 val = copy_to_mode_reg (mode, val);
26951
26952 if (!aarch64_reg_or_zero (failval, mode))
26953 failval = copy_to_mode_reg (mode, failval);
26954
26955 emit_insn (gen_despeculate_copy (mode, result, val, failval));
26956 return result;
26957 }
26958
26959 /* Implement TARGET_ESTIMATED_POLY_VALUE.
26960 Look into the tuning structure for an estimate.
26961 KIND specifies the type of requested estimate: min, max or likely.
26962 For cores with a known SVE width all three estimates are the same.
26963 For generic SVE tuning we want to distinguish the maximum estimate from
26964 the minimum and likely ones.
26965 The likely estimate is the same as the minimum in that case to give a
26966 conservative behavior of auto-vectorizing with SVE when it is a win
26967 even for 128-bit SVE.
26968 When SVE width information is available VAL.coeffs[1] is multiplied by
26969 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
26970
26971 static HOST_WIDE_INT
26972 aarch64_estimated_poly_value (poly_int64 val,
26973 poly_value_estimate_kind kind
26974 = POLY_VALUE_LIKELY)
26975 {
26976 unsigned int width_source = aarch64_tune_params.sve_width;
26977
26978 /* If there is no core-specific information then the minimum and likely
26979 values are based on 128-bit vectors and the maximum is based on
26980 the architectural maximum of 2048 bits. */
26981 if (width_source == SVE_SCALABLE)
26982 switch (kind)
26983 {
26984 case POLY_VALUE_MIN:
26985 case POLY_VALUE_LIKELY:
26986 return val.coeffs[0];
26987 case POLY_VALUE_MAX:
26988 return val.coeffs[0] + val.coeffs[1] * 15;
26989 }
26990
26991 /* Allow sve_width to be a bitmask of different VL, treating the lowest
26992 as likely. This could be made more general if future -mtune options
26993 need it to be. */
26994 if (kind == POLY_VALUE_MAX)
26995 width_source = 1 << floor_log2 (width_source);
26996 else
26997 width_source = least_bit_hwi (width_source);
26998
26999 /* If the core provides width information, use that. */
27000 HOST_WIDE_INT over_128 = width_source - 128;
27001 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
27002 }
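/* For example, the number of bytes in an SVE vector is the poly_int
   {16, 16}.  With no tuning information this is estimated as 16 for the
   minimum/likely cases and 16 + 16 * 15 == 256 (the 2048-bit architectural
   maximum) for the maximum, while a core known to implement 256-bit SVE
   gives 16 + 16 * 128 / 128 == 32 for all three estimates.  */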
27003
27004
27005 /* Return true for types that could be supported as SIMD return or
27006 argument types. */
27007
27008 static bool
27009 supported_simd_type (tree t)
27010 {
27011 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
27012 {
27013 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
27014 return s == 1 || s == 2 || s == 4 || s == 8;
27015 }
27016 return false;
27017 }
27018
27019 /* Return true for types that currently are supported as SIMD return
27020 or argument types. */
27021
27022 static bool
27023 currently_supported_simd_type (tree t, tree b)
27024 {
27025 if (COMPLEX_FLOAT_TYPE_P (t))
27026 return false;
27027
27028 if (TYPE_SIZE (t) != TYPE_SIZE (b))
27029 return false;
27030
27031 return supported_simd_type (t);
27032 }
27033
27034 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
27035
27036 static int
27037 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
27038 struct cgraph_simd_clone *clonei,
27039 tree base_type, int num,
27040 bool explicit_p)
27041 {
27042 tree t, ret_type;
27043 unsigned int elt_bits, count;
27044 unsigned HOST_WIDE_INT const_simdlen;
27045 poly_uint64 vec_bits;
27046
27047 if (!TARGET_SIMD)
27048 return 0;
27049
27050 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
27051 constant simdlens here. */
27052 if (maybe_ne (clonei->simdlen, 0U)
27053 && clonei->simdlen.is_constant (&const_simdlen)
27054 && (const_simdlen < 2
27055 || const_simdlen > 1024
27056 || (const_simdlen & (const_simdlen - 1)) != 0))
27057 {
27058 if (explicit_p)
27059 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27060 "unsupported simdlen %wd", const_simdlen);
27061 return 0;
27062 }
27063
27064 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
27065 if (TREE_CODE (ret_type) != VOID_TYPE
27066 && !currently_supported_simd_type (ret_type, base_type))
27067 {
27068 if (!explicit_p)
27069 ;
27070 else if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
27071 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27072 "GCC does not currently support mixed size types "
27073 "for %<simd%> functions");
27074 else if (supported_simd_type (ret_type))
27075 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27076 "GCC does not currently support return type %qT "
27077 "for %<simd%> functions", ret_type);
27078 else
27079 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27080 "unsupported return type %qT for %<simd%> functions",
27081 ret_type);
27082 return 0;
27083 }
27084
27085 int i;
27086 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
27087 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
27088
27089 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
27090 t && t != void_list_node; t = TREE_CHAIN (t), i++)
27091 {
27092 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
27093
27094 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
27095 && !currently_supported_simd_type (arg_type, base_type))
27096 {
27097 if (!explicit_p)
27098 ;
27099 else if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
27100 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27101 "GCC does not currently support mixed size types "
27102 "for %<simd%> functions");
27103 else
27104 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27105 "GCC does not currently support argument type %qT "
27106 "for %<simd%> functions", arg_type);
27107 return 0;
27108 }
27109 }
27110
27111 clonei->vecsize_mangle = 'n';
27112 clonei->mask_mode = VOIDmode;
27113 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
27114 if (known_eq (clonei->simdlen, 0U))
27115 {
27116 count = 2;
27117 vec_bits = (num == 0 ? 64 : 128);
27118 clonei->simdlen = exact_div (vec_bits, elt_bits);
27119 }
27120 else
27121 {
27122 count = 1;
27123 vec_bits = clonei->simdlen * elt_bits;
27124 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
27125 constant simdlens here. */
27126 if (clonei->simdlen.is_constant (&const_simdlen)
27127 && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
27128 {
27129 if (explicit_p)
27130 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27131 "GCC does not currently support simdlen %wd for "
27132 "type %qT",
27133 const_simdlen, base_type);
27134 return 0;
27135 }
27136 }
27137 clonei->vecsize_int = vec_bits;
27138 clonei->vecsize_float = vec_bits;
27139 return count;
27140 }
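/* For example, a "#pragma omp declare simd" function taking and returning
   float, with no explicit simdlen, gets two Advanced SIMD clones from the
   code above: one with simdlen 2 (64-bit vectors) and one with simdlen 4
   (128-bit vectors), both using the 'n' mangling letter.  */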
27141
27142 /* Implement TARGET_SIMD_CLONE_ADJUST. */
27143
27144 static void
27145 aarch64_simd_clone_adjust (struct cgraph_node *node)
27146 {
27147 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
27148 use the correct ABI. */
27149
27150 tree t = TREE_TYPE (node->decl);
27151 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
27152 TYPE_ATTRIBUTES (t));
27153 }
27154
27155 /* Implement TARGET_SIMD_CLONE_USABLE. */
27156
27157 static int
27158 aarch64_simd_clone_usable (struct cgraph_node *node)
27159 {
27160 switch (node->simdclone->vecsize_mangle)
27161 {
27162 case 'n':
27163 if (!TARGET_SIMD)
27164 return -1;
27165 return 0;
27166 default:
27167 gcc_unreachable ();
27168 }
27169 }
27170
27171 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
27172
27173 static int
27174 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
27175 {
27176 auto check_attr = [&](const char *name) {
27177 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
27178 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
27179 if (!attr1 && !attr2)
27180 return true;
27181
27182 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
27183 };
27184
27185 if (!check_attr ("aarch64_vector_pcs"))
27186 return 0;
27187 if (!check_attr ("Advanced SIMD type"))
27188 return 0;
27189 if (!check_attr ("SVE type"))
27190 return 0;
27191 if (!check_attr ("SVE sizeless type"))
27192 return 0;
27193 return 1;
27194 }
27195
27196 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
27197
27198 static const char *
27199 aarch64_get_multilib_abi_name (void)
27200 {
27201 if (TARGET_BIG_END)
27202 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
27203 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
27204 }
27205
27206 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
27207 global-variable-based guard, use the default; otherwise
27208 return a null tree. */
27209 static tree
27210 aarch64_stack_protect_guard (void)
27211 {
27212 if (aarch64_stack_protector_guard == SSP_GLOBAL)
27213 return default_stack_protect_guard ();
27214
27215 return NULL_TREE;
27216 }
27217
27218 /* Return the diagnostic message string if conversion from FROMTYPE to
27219 TOTYPE is not allowed, NULL otherwise. */
27220
27221 static const char *
27222 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
27223 {
27224 if (element_mode (fromtype) != element_mode (totype))
27225 {
27226 /* Do not allow conversions to/from BFmode scalar types. */
27227 if (TYPE_MODE (fromtype) == BFmode)
27228 return N_("invalid conversion from type %<bfloat16_t%>");
27229 if (TYPE_MODE (totype) == BFmode)
27230 return N_("invalid conversion to type %<bfloat16_t%>");
27231 }
27232
27233 /* Conversion allowed. */
27234 return NULL;
27235 }
27236
27237 /* Return the diagnostic message string if the unary operation OP is
27238 not permitted on TYPE, NULL otherwise. */
27239
27240 static const char *
27241 aarch64_invalid_unary_op (int op, const_tree type)
27242 {
27243 /* Reject all single-operand operations on BFmode except for &. */
27244 if (element_mode (type) == BFmode && op != ADDR_EXPR)
27245 return N_("operation not permitted on type %<bfloat16_t%>");
27246
27247 /* Operation allowed. */
27248 return NULL;
27249 }
27250
27251 /* Return the diagnostic message string if the binary operation OP is
27252 not permitted on TYPE1 and TYPE2, NULL otherwise. */
27253
27254 static const char *
27255 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
27256 const_tree type2)
27257 {
27258 /* Reject all 2-operand operations on BFmode. */
27259 if (element_mode (type1) == BFmode
27260 || element_mode (type2) == BFmode)
27261 return N_("operation not permitted on type %<bfloat16_t%>");
27262
27263 if (VECTOR_TYPE_P (type1)
27264 && VECTOR_TYPE_P (type2)
27265 && !TYPE_INDIVISIBLE_P (type1)
27266 && !TYPE_INDIVISIBLE_P (type2)
27267 && (aarch64_sve::builtin_type_p (type1)
27268 != aarch64_sve::builtin_type_p (type2)))
27269 return N_("cannot combine GNU and SVE vectors in a binary operation");
27270
27271 /* Operation allowed. */
27272 return NULL;
27273 }
27274
27275 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
27276 compiler that we automatically ignore the top byte of our pointers, which
27277 allows using -fsanitize=hwaddress. */
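/* ILP32 pointers are only 32 bits wide, so there is no spare top byte to
   carry a tag; tagging is therefore limited to LP64.  */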
27278 bool
27279 aarch64_can_tag_addresses ()
27280 {
27281 return !TARGET_ILP32;
27282 }
27283
27284 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
27285 section at the end if needed. */
27286 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
27287 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
27288 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
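/* As an illustrative sketch, for an LP64 target with both BTI and PAC
   enabled the note emitted below is roughly:

       .section        .note.gnu.property
       .p2align        3
       .word   4               (namesz: "GNU\0")
       .word   16              (descsz: 12 rounded up to pointer alignment)
       .word   5               (NT_GNU_PROPERTY_TYPE_0)
       .asciz  "GNU"
       .word   0xc0000000      (GNU_PROPERTY_AARCH64_FEATURE_1_AND)
       .word   4               (datasz)
       .word   3               (BTI | PAC)
       .p2align        3       (pad the descriptor)  */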
27289 void
27290 aarch64_file_end_indicate_exec_stack ()
27291 {
27292 file_end_indicate_exec_stack ();
27293
27294 unsigned feature_1_and = 0;
27295 if (aarch64_bti_enabled ())
27296 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
27297
27298 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
27299 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
27300
27301 if (feature_1_and)
27302 {
27303 /* Generate .note.gnu.property section. */
27304 switch_to_section (get_section (".note.gnu.property",
27305 SECTION_NOTYPE, NULL));
27306
27307 /* PT_NOTE header: namesz, descsz, type.
27308 namesz = 4 ("GNU\0")
27309 descsz = 16 (Size of the program property array)
27310 [(12 + padding) * Number of array elements]
27311 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
27312 assemble_align (POINTER_SIZE);
27313 assemble_integer (GEN_INT (4), 4, 32, 1);
27314 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
27315 assemble_integer (GEN_INT (5), 4, 32, 1);
27316
27317 /* PT_NOTE name. */
27318 assemble_string ("GNU", 4);
27319
27320 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
27321 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
27322 datasz = 4
27323 data = feature_1_and. */
27324 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
27325 assemble_integer (GEN_INT (4), 4, 32, 1);
27326 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
27327
27328 /* Pad the size of the note to the required alignment. */
27329 assemble_align (POINTER_SIZE);
27330 }
27331 }
27332 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
27333 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
27334 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
27335
27336 /* Helper function for straight line speculation.
27337 Return what barrier should be emitted for straight line speculation
27338 mitigation.
27339 When not mitigating against straight line speculation this function returns
27340 an empty string.
27341 When mitigating against straight line speculation, use:
27342 * SB when the v8.5-A SB extension is enabled.
27343 * DSB+ISB otherwise. */
27344 const char *
27345 aarch64_sls_barrier (int mitigation_required)
27346 {
27347 return mitigation_required
27348 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
27349 : "";
27350 }
27351
27352 static GTY (()) tree aarch64_sls_shared_thunks[30];
27353 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
27354 const char *indirect_symbol_names[30] = {
27355 "__call_indirect_x0",
27356 "__call_indirect_x1",
27357 "__call_indirect_x2",
27358 "__call_indirect_x3",
27359 "__call_indirect_x4",
27360 "__call_indirect_x5",
27361 "__call_indirect_x6",
27362 "__call_indirect_x7",
27363 "__call_indirect_x8",
27364 "__call_indirect_x9",
27365 "__call_indirect_x10",
27366 "__call_indirect_x11",
27367 "__call_indirect_x12",
27368 "__call_indirect_x13",
27369 "__call_indirect_x14",
27370 "__call_indirect_x15",
27371 "", /* "__call_indirect_x16", */
27372 "", /* "__call_indirect_x17", */
27373 "__call_indirect_x18",
27374 "__call_indirect_x19",
27375 "__call_indirect_x20",
27376 "__call_indirect_x21",
27377 "__call_indirect_x22",
27378 "__call_indirect_x23",
27379 "__call_indirect_x24",
27380 "__call_indirect_x25",
27381 "__call_indirect_x26",
27382 "__call_indirect_x27",
27383 "__call_indirect_x28",
27384 "__call_indirect_x29",
27385 };
27386
27387 /* Function to create a BLR thunk. This thunk is used to mitigate straight
27388 line speculation. Instead of a simple BLR that can be speculated past,
27389 we emit a BL to this thunk, and this thunk contains a BR to the relevant
27390 register. These thunks have the relevant speculation barriers put after
27391 their indirect branch so that speculation is blocked.
27392
27393 We use such a thunk so the speculation barriers are kept off the
27394 architecturally executed path in order to reduce the performance overhead.
27395
27396 When optimizing for size we use stubs shared by the linked object.
27397 When optimizing for performance we emit stubs for each function in the hope
27398 that the branch predictor can better train on jumps specific for a given
27399 function. */
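/* As an illustrative sketch (not the exact assembly GCC prints), a call
   through x1 changes from:

       blr     x1

   to:

       bl      __call_indirect_x1

   (or a per-function local label when optimizing for speed), where the
   stub is:

       __call_indirect_x1:
           mov     x16, x1
           br      x16
           <speculation barrier: SB, or DSB SY followed by ISB>  */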
27400 rtx
27401 aarch64_sls_create_blr_label (int regnum)
27402 {
27403 gcc_assert (STUB_REGNUM_P (regnum));
27404 if (optimize_function_for_size_p (cfun))
27405 {
27406 /* For the thunks shared between different functions in this compilation
27407 unit we use a named symbol -- this is just for users to more easily
27408 understand the generated assembly. */
27409 aarch64_sls_shared_thunks_needed = true;
27410 const char *thunk_name = indirect_symbol_names[regnum];
27411 if (aarch64_sls_shared_thunks[regnum] == NULL)
27412 {
27413 /* Build a decl representing this function stub and record it for
27414 later. We build a decl here so we can use the GCC machinery for
27415 handling sections automatically (through `get_named_section` and
27416 `make_decl_one_only`). That saves us a lot of trouble handling
27417 the specifics of different output file formats. */
27418 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
27419 get_identifier (thunk_name),
27420 build_function_type_list (void_type_node,
27421 NULL_TREE));
27422 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
27423 NULL_TREE, void_type_node);
27424 TREE_PUBLIC (decl) = 1;
27425 TREE_STATIC (decl) = 1;
27426 DECL_IGNORED_P (decl) = 1;
27427 DECL_ARTIFICIAL (decl) = 1;
27428 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
27429 resolve_unique_section (decl, 0, false);
27430 aarch64_sls_shared_thunks[regnum] = decl;
27431 }
27432
27433 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
27434 }
27435
27436 if (cfun->machine->call_via[regnum] == NULL)
27437 cfun->machine->call_via[regnum]
27438 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
27439 return cfun->machine->call_via[regnum];
27440 }
27441
27442 /* Helper function for aarch64_sls_emit_blr_function_thunks and
27443 aarch64_sls_emit_shared_blr_thunks below. */
27444 static void
27445 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
27446 {
27450 27447 /* Move the target into x16 and branch via x16; `BTI c` landing pads
27451 27448 accept `BR` only from x16 or x17, so the indirect jump stays valid. */
27449 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
27450 asm_fprintf (out_file, "\tbr\tx16\n");
27451 }
27452
27453 /* Emit all BLR stubs for this particular function.
27454 Here we emit all the BLR stubs needed for the current function. Since we
27455 emit these stubs in a consecutive block we know there will be no speculation
27456 gadgets between each stub, and hence we only emit a speculation barrier at
27457 the end of the stub sequences.
27458
27459 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
27460 void
27461 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
27462 {
27463 if (! aarch64_harden_sls_blr_p ())
27464 return;
27465
27466 bool any_functions_emitted = false;
27467 /* We must save and restore the current function section since this assembly
27468 is emitted at the end of the function. This means it can be emitted *just
27469 after* the cold section of a function. That cold part would be emitted in
27470 a different section. That switch would trigger a `.cfi_endproc` directive
27471 to be emitted in the original section and a `.cfi_startproc` directive to
27472 be emitted in the new section. Switching to the original section without
27473 restoring would mean that the `.cfi_endproc` emitted when the function ends
27474 would happen in a different section -- leaving an unmatched
27475 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
27476 in the standard text section. */
27477 section *save_text_section = in_section;
27478 switch_to_section (function_section (current_function_decl));
27479 for (int regnum = 0; regnum < 30; ++regnum)
27480 {
27481 rtx specu_label = cfun->machine->call_via[regnum];
27482 if (specu_label == NULL)
27483 continue;
27484
27485 targetm.asm_out.print_operand (out_file, specu_label, 0);
27486 asm_fprintf (out_file, ":\n");
27487 aarch64_sls_emit_function_stub (out_file, regnum);
27488 any_functions_emitted = true;
27489 }
27490 if (any_functions_emitted)
27491 /* Can use the SB here if need be, since this stub will only be used
27492 by the current function, and hence for the current target. */
27493 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
27494 switch_to_section (save_text_section);
27495 }
27496
27497 /* Emit shared BLR stubs for the current compilation unit.
27498 Over the course of compiling this unit we may have converted some BLR
27499 instructions to a BL to a shared stub function. This is where we emit those
27500 stub functions.
27501 This function is for the stubs shared between different functions in this
27502 compilation unit. We share when optimizing for size instead of speed.
27503
27504 This function is called through the TARGET_ASM_FILE_END hook. */
27505 void
27506 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
27507 {
27508 if (! aarch64_sls_shared_thunks_needed)
27509 return;
27510
27511 for (int regnum = 0; regnum < 30; ++regnum)
27512 {
27513 tree decl = aarch64_sls_shared_thunks[regnum];
27514 if (!decl)
27515 continue;
27516
27517 const char *name = indirect_symbol_names[regnum];
27518 switch_to_section (get_named_section (decl, NULL, 0));
27519 ASM_OUTPUT_ALIGN (out_file, 2);
27520 targetm.asm_out.globalize_label (out_file, name);
27524 27521 /* This only emits a directive if the compiler is configured for an
27525 27522 assembler that can handle visibility directives. */
27523 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
27524 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
27525 ASM_OUTPUT_LABEL (out_file, name);
27526 aarch64_sls_emit_function_stub (out_file, regnum);
27530 27527 /* Use the most conservative barrier (DSB SY followed by ISB) so the stub
27531 27528 can be used by any function in the translation unit, whatever its target. */
27529 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
27530 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
27531 }
27532 }
27533
27534 /* Implement TARGET_ASM_FILE_END. */
27535 void
27536 aarch64_asm_file_end ()
27537 {
27538 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
27539 /* Since this function will be called for the ASM_FILE_END hook, we ensure
27540 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
27541 for FreeBSD) still gets called. */
27542 #ifdef TARGET_ASM_FILE_END
27543 TARGET_ASM_FILE_END ();
27544 #endif
27545 }
27546
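/* Output the assembly for an indirect call through register ADDR.  When SLS
   hardening of BLR instructions is enabled, emit a BL to the corresponding
   stub instead of a direct BLR.  */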
27547 const char *
27548 aarch64_indirect_call_asm (rtx addr)
27549 {
27550 gcc_assert (REG_P (addr));
27551 if (aarch64_harden_sls_blr_p ())
27552 {
27553 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
27554 output_asm_insn ("bl\t%0", &stub_label);
27555 }
27556 else
27557 output_asm_insn ("blr\t%0", &addr);
27558 return "";
27559 }
27560
27561 /* Target-specific selftests. */
27562
27563 #if CHECKING_P
27564
27565 namespace selftest {
27566
27567 /* Selftest for the RTL loader.
27568 Verify that the RTL loader copes with a dump from
27569 print_rtx_function. This is essentially just a test that class
27570 function_reader can handle a real dump, but it also verifies
27571 that lookup_reg_by_dump_name correctly handles hard regs.
27572 The presence of hard reg names in the dump means that the test is
27573 target-specific, hence it is in this file. */
27574
27575 static void
27576 aarch64_test_loading_full_dump ()
27577 {
27578 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
27579
27580 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
27581
27582 rtx_insn *insn_1 = get_insn_by_uid (1);
27583 ASSERT_EQ (NOTE, GET_CODE (insn_1));
27584
27585 rtx_insn *insn_15 = get_insn_by_uid (15);
27586 ASSERT_EQ (INSN, GET_CODE (insn_15));
27587 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
27588
27589 /* Verify crtl->return_rtx. */
27590 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
27591 ASSERT_EQ (0, REGNO (crtl->return_rtx));
27592 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
27593 }
27594
27595 /* Test the fractional_cost class. */
27596
27597 static void
27598 aarch64_test_fractional_cost ()
27599 {
27600 using cf = fractional_cost;
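/* cf (a, b) represents the rational cost a/b; cf (a) is the integral cost a. */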
27601
27602 ASSERT_EQ (cf (0, 20), 0);
27603
27604 ASSERT_EQ (cf (4, 2), 2);
27605 ASSERT_EQ (3, cf (9, 3));
27606
27607 ASSERT_NE (cf (5, 2), 2);
27608 ASSERT_NE (3, cf (8, 3));
27609
27610 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
27611 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
27612 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
27613
27614 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
27615 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
27616 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
27617 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
27618 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
27619 ASSERT_EQ (3 - cf (10, 3), 0);
27620
27621 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
27622 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
27623
27624 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
27625 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
27626 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
27627 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
27628 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
27629 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
27630 ASSERT_TRUE (cf (239, 240) < 1);
27631 ASSERT_FALSE (cf (240, 240) < 1);
27632 ASSERT_FALSE (cf (241, 240) < 1);
27633 ASSERT_FALSE (2 < cf (207, 104));
27634 ASSERT_FALSE (2 < cf (208, 104));
27635 ASSERT_TRUE (2 < cf (209, 104));
27636
27640 27637 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
27641 27638 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
27642 27639 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
27643 27640 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
27644 27641 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
27645 27642 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
27646 27643 ASSERT_TRUE (cf (239, 240) <= 1);
27647 27644 ASSERT_TRUE (cf (240, 240) <= 1);
27648 27645 ASSERT_FALSE (cf (241, 240) <= 1);
27649 27646 ASSERT_FALSE (2 <= cf (207, 104));
27650 27647 ASSERT_TRUE (2 <= cf (208, 104));
27651 27648 ASSERT_TRUE (2 <= cf (209, 104));
27649
27650 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
27651 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
27652 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
27653 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
27654 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
27655 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
27656 ASSERT_FALSE (cf (239, 240) >= 1);
27657 ASSERT_TRUE (cf (240, 240) >= 1);
27658 ASSERT_TRUE (cf (241, 240) >= 1);
27659 ASSERT_TRUE (2 >= cf (207, 104));
27660 ASSERT_TRUE (2 >= cf (208, 104));
27661 ASSERT_FALSE (2 >= cf (209, 104));
27662
27663 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
27664 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
27665 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
27666 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
27667 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
27668 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
27669 ASSERT_FALSE (cf (239, 240) > 1);
27670 ASSERT_FALSE (cf (240, 240) > 1);
27671 ASSERT_TRUE (cf (241, 240) > 1);
27672 ASSERT_TRUE (2 > cf (207, 104));
27673 ASSERT_FALSE (2 > cf (208, 104));
27674 ASSERT_FALSE (2 > cf (209, 104));
27675
27676 ASSERT_EQ (cf (1, 2).ceil (), 1);
27677 ASSERT_EQ (cf (11, 7).ceil (), 2);
27678 ASSERT_EQ (cf (20, 1).ceil (), 20);
27679 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
27680 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
27681 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
27682 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
27683 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
27684
27685 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
27686 }
27687
27688 /* Run all target-specific selftests. */
27689
27690 static void
27691 aarch64_run_selftests (void)
27692 {
27693 aarch64_test_loading_full_dump ();
27694 aarch64_test_fractional_cost ();
27695 }
27696
27697 } // namespace selftest
27698
27699 #endif /* #if CHECKING_P */
27700
27701 #undef TARGET_STACK_PROTECT_GUARD
27702 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
27703
27704 #undef TARGET_ADDRESS_COST
27705 #define TARGET_ADDRESS_COST aarch64_address_cost
27706
27707 /* This hook determines whether unnamed bitfields affect the alignment
27708 of the containing structure. The hook returns true if the structure
27709 should inherit the alignment requirements of an unnamed bitfield's
27710 type. */
27711 #undef TARGET_ALIGN_ANON_BITFIELD
27712 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
27713
27714 #undef TARGET_ASM_ALIGNED_DI_OP
27715 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
27716
27717 #undef TARGET_ASM_ALIGNED_HI_OP
27718 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
27719
27720 #undef TARGET_ASM_ALIGNED_SI_OP
27721 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
27722
27723 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
27724 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
27725 hook_bool_const_tree_hwi_hwi_const_tree_true
27726
27727 #undef TARGET_ASM_FILE_START
27728 #define TARGET_ASM_FILE_START aarch64_start_file
27729
27730 #undef TARGET_ASM_OUTPUT_MI_THUNK
27731 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
27732
27733 #undef TARGET_ASM_SELECT_RTX_SECTION
27734 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
27735
27736 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
27737 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
27738
27739 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
27740 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
27741
27742 #undef TARGET_BUILD_BUILTIN_VA_LIST
27743 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
27744
27745 #undef TARGET_CALLEE_COPIES
27746 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
27747
27748 #undef TARGET_CAN_ELIMINATE
27749 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
27750
27751 #undef TARGET_CAN_INLINE_P
27752 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
27753
27754 #undef TARGET_CANNOT_FORCE_CONST_MEM
27755 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
27756
27757 #undef TARGET_CASE_VALUES_THRESHOLD
27758 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
27759
27760 #undef TARGET_CONDITIONAL_REGISTER_USAGE
27761 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
27762
27763 #undef TARGET_MEMBER_TYPE_FORCES_BLK
27764 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
27765
27766 /* Only the least significant bit is used for initialization guard
27767 variables. */
27768 #undef TARGET_CXX_GUARD_MASK_BIT
27769 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
27770
27771 #undef TARGET_C_MODE_FOR_SUFFIX
27772 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
27773
27774 #ifdef TARGET_BIG_ENDIAN_DEFAULT
27775 #undef TARGET_DEFAULT_TARGET_FLAGS
27776 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
27777 #endif
27778
27779 #undef TARGET_CLASS_MAX_NREGS
27780 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
27781
27782 #undef TARGET_BUILTIN_DECL
27783 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
27784
27785 #undef TARGET_BUILTIN_RECIPROCAL
27786 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
27787
27788 #undef TARGET_C_EXCESS_PRECISION
27789 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
27790
27791 #undef TARGET_EXPAND_BUILTIN
27792 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
27793
27794 #undef TARGET_EXPAND_BUILTIN_VA_START
27795 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
27796
27797 #undef TARGET_FOLD_BUILTIN
27798 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
27799
27800 #undef TARGET_FUNCTION_ARG
27801 #define TARGET_FUNCTION_ARG aarch64_function_arg
27802
27803 #undef TARGET_FUNCTION_ARG_ADVANCE
27804 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
27805
27806 #undef TARGET_FUNCTION_ARG_BOUNDARY
27807 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
27808
27809 #undef TARGET_FUNCTION_ARG_PADDING
27810 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
27811
27812 #undef TARGET_GET_RAW_RESULT_MODE
27813 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
27814 #undef TARGET_GET_RAW_ARG_MODE
27815 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
27816
27817 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
27818 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
27819
27820 #undef TARGET_FUNCTION_VALUE
27821 #define TARGET_FUNCTION_VALUE aarch64_function_value
27822
27823 #undef TARGET_FUNCTION_VALUE_REGNO_P
27824 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
27825
27826 #undef TARGET_GIMPLE_FOLD_BUILTIN
27827 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
27828
27829 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
27830 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
27831
27832 #undef TARGET_INIT_BUILTINS
27833 #define TARGET_INIT_BUILTINS aarch64_init_builtins
27834
27835 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
27836 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
27837 aarch64_ira_change_pseudo_allocno_class
27838
27839 #undef TARGET_LEGITIMATE_ADDRESS_P
27840 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
27841
27842 #undef TARGET_LEGITIMATE_CONSTANT_P
27843 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
27844
27845 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
27846 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
27847 aarch64_legitimize_address_displacement
27848
27849 #undef TARGET_LIBGCC_CMP_RETURN_MODE
27850 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
27851
27852 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
27853 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
27854 aarch64_libgcc_floating_mode_supported_p
27855
27856 #undef TARGET_MANGLE_TYPE
27857 #define TARGET_MANGLE_TYPE aarch64_mangle_type
27858
27859 #undef TARGET_INVALID_CONVERSION
27860 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
27861
27862 #undef TARGET_INVALID_UNARY_OP
27863 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
27864
27865 #undef TARGET_INVALID_BINARY_OP
27866 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
27867
27868 #undef TARGET_VERIFY_TYPE_CONTEXT
27869 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
27870
27871 #undef TARGET_MEMORY_MOVE_COST
27872 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
27873
27874 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
27875 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
27876
27877 #undef TARGET_MUST_PASS_IN_STACK
27878 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
27879
27880 /* This target hook should return true if accesses to volatile bitfields
27881 should use the narrowest mode possible. It should return false if these
27882 accesses should use the bitfield container type. */
27883 #undef TARGET_NARROW_VOLATILE_BITFIELD
27884 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
27885
27886 #undef TARGET_OPTION_OVERRIDE
27887 #define TARGET_OPTION_OVERRIDE aarch64_override_options
27888
27889 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
27890 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
27891 aarch64_override_options_after_change
27892
27893 #undef TARGET_OFFLOAD_OPTIONS
27894 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
27895
27896 #undef TARGET_OPTION_RESTORE
27897 #define TARGET_OPTION_RESTORE aarch64_option_restore
27898
27899 #undef TARGET_OPTION_PRINT
27900 #define TARGET_OPTION_PRINT aarch64_option_print
27901
27902 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
27903 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
27904
27905 #undef TARGET_SET_CURRENT_FUNCTION
27906 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
27907
27908 #undef TARGET_PASS_BY_REFERENCE
27909 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
27910
27911 #undef TARGET_PREFERRED_RELOAD_CLASS
27912 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
27913
27914 #undef TARGET_SCHED_REASSOCIATION_WIDTH
27915 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
27916
27917 #undef TARGET_DWARF_FRAME_REG_MODE
27918 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
27919
27920 #undef TARGET_PROMOTED_TYPE
27921 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
27922
27923 #undef TARGET_SECONDARY_RELOAD
27924 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
27925
27926 #undef TARGET_SECONDARY_MEMORY_NEEDED
27927 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
27928
27929 #undef TARGET_SHIFT_TRUNCATION_MASK
27930 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
27931
27932 #undef TARGET_SETUP_INCOMING_VARARGS
27933 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
27934
27935 #undef TARGET_STRUCT_VALUE_RTX
27936 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
27937
27938 #undef TARGET_REGISTER_MOVE_COST
27939 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
27940
27941 #undef TARGET_RETURN_IN_MEMORY
27942 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
27943
27944 #undef TARGET_RETURN_IN_MSB
27945 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
27946
27947 #undef TARGET_RTX_COSTS
27948 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
27949
27950 #undef TARGET_SCALAR_MODE_SUPPORTED_P
27951 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
27952
27953 #undef TARGET_SCHED_ISSUE_RATE
27954 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
27955
27956 #undef TARGET_SCHED_VARIABLE_ISSUE
27957 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
27958
27959 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
27960 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
27961 aarch64_sched_first_cycle_multipass_dfa_lookahead
27962
27963 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
27964 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
27965 aarch64_first_cycle_multipass_dfa_lookahead_guard
27966
27967 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
27968 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
27969 aarch64_get_separate_components
27970
27971 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
27972 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
27973 aarch64_components_for_bb
27974
27975 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
27976 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
27977 aarch64_disqualify_components
27978
27979 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
27980 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
27981 aarch64_emit_prologue_components
27982
27983 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
27984 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
27985 aarch64_emit_epilogue_components
27986
27987 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
27988 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
27989 aarch64_set_handled_components
27990
27991 #undef TARGET_TRAMPOLINE_INIT
27992 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
27993
27994 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
27995 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
27996
27997 #undef TARGET_VECTOR_MODE_SUPPORTED_P
27998 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
27999
28000 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
28001 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
28002
28003 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
28004 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
28005 aarch64_builtin_support_vector_misalignment
28006
28007 #undef TARGET_ARRAY_MODE
28008 #define TARGET_ARRAY_MODE aarch64_array_mode
28009
28010 #undef TARGET_ARRAY_MODE_SUPPORTED_P
28011 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
28012
28013 #undef TARGET_VECTORIZE_CREATE_COSTS
28014 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
28015
28016 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
28017 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
28018 aarch64_builtin_vectorization_cost
28019
28020 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
28021 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
28022
28023 #undef TARGET_VECTORIZE_BUILTINS
28024 #define TARGET_VECTORIZE_BUILTINS
28025
28026 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
28027 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
28028 aarch64_autovectorize_vector_modes
28029
28030 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
28031 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
28032 aarch64_atomic_assign_expand_fenv
28033
28034 /* Section anchor support. */
28035
28036 #undef TARGET_MIN_ANCHOR_OFFSET
28037 #define TARGET_MIN_ANCHOR_OFFSET -256
28038
28039 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
28040 byte offset; we can do much more for larger data types, but have no way
28041 to determine the size of the access. We assume accesses are aligned. */
28042 #undef TARGET_MAX_ANCHOR_OFFSET
28043 #define TARGET_MAX_ANCHOR_OFFSET 4095
28044
28045 #undef TARGET_VECTOR_ALIGNMENT
28046 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
28047
28048 #undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
28049 #define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
28050 aarch64_vectorize_can_special_div_by_constant
28051
28052 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
28053 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
28054 aarch64_vectorize_preferred_vector_alignment
28055 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
28056 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
28057 aarch64_simd_vector_alignment_reachable
28058
28059 /* vec_perm support. */
28060
28061 #undef TARGET_VECTORIZE_VEC_PERM_CONST
28062 #define TARGET_VECTORIZE_VEC_PERM_CONST \
28063 aarch64_vectorize_vec_perm_const
28064
28065 #undef TARGET_VECTORIZE_RELATED_MODE
28066 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
28067 #undef TARGET_VECTORIZE_GET_MASK_MODE
28068 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
28069 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
28070 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
28071 aarch64_empty_mask_is_expensive
28072 #undef TARGET_PREFERRED_ELSE_VALUE
28073 #define TARGET_PREFERRED_ELSE_VALUE \
28074 aarch64_preferred_else_value
28075
28076 #undef TARGET_INIT_LIBFUNCS
28077 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
28078
28079 #undef TARGET_FIXED_CONDITION_CODE_REGS
28080 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
28081
28082 #undef TARGET_FLAGS_REGNUM
28083 #define TARGET_FLAGS_REGNUM CC_REGNUM
28084
28085 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
28086 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
28087
28088 #undef TARGET_ASAN_SHADOW_OFFSET
28089 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
28090
28091 #undef TARGET_LEGITIMIZE_ADDRESS
28092 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
28093
28094 #undef TARGET_SCHED_CAN_SPECULATE_INSN
28095 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
28096
28097 #undef TARGET_CAN_USE_DOLOOP_P
28098 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
28099
28100 #undef TARGET_SCHED_ADJUST_PRIORITY
28101 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
28102
28103 #undef TARGET_SCHED_MACRO_FUSION_P
28104 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
28105
28106 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
28107 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
28108
28109 #undef TARGET_SCHED_FUSION_PRIORITY
28110 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
28111
28112 #undef TARGET_UNSPEC_MAY_TRAP_P
28113 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
28114
28115 #undef TARGET_USE_PSEUDO_PIC_REG
28116 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
28117
28118 #undef TARGET_PRINT_OPERAND
28119 #define TARGET_PRINT_OPERAND aarch64_print_operand
28120
28121 #undef TARGET_PRINT_OPERAND_ADDRESS
28122 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
28123
28124 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
28125 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
28126
28127 #undef TARGET_OPTAB_SUPPORTED_P
28128 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
28129
28130 #undef TARGET_OMIT_STRUCT_RETURN_REG
28131 #define TARGET_OMIT_STRUCT_RETURN_REG true
28132
28133 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
28134 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
28135 aarch64_dwarf_poly_indeterminate_value
28136
28137 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
28138 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
28139 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
28140
28141 #undef TARGET_HARD_REGNO_NREGS
28142 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
28143 #undef TARGET_HARD_REGNO_MODE_OK
28144 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
28145
28146 #undef TARGET_MODES_TIEABLE_P
28147 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
28148
28149 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
28150 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
28151 aarch64_hard_regno_call_part_clobbered
28152
28153 #undef TARGET_INSN_CALLEE_ABI
28154 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
28155
28156 #undef TARGET_CONSTANT_ALIGNMENT
28157 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
28158
28159 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
28160 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
28161 aarch64_stack_clash_protection_alloca_probe_range
28162
28163 #undef TARGET_COMPUTE_PRESSURE_CLASSES
28164 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
28165
28166 #undef TARGET_CAN_CHANGE_MODE_CLASS
28167 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
28168
28169 #undef TARGET_SELECT_EARLY_REMAT_MODES
28170 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
28171
28172 #undef TARGET_SPECULATION_SAFE_VALUE
28173 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
28174
28175 #undef TARGET_ESTIMATED_POLY_VALUE
28176 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
28177
28178 #undef TARGET_ATTRIBUTE_TABLE
28179 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
28180
28181 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
28182 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
28183 aarch64_simd_clone_compute_vecsize_and_simdlen
28184
28185 #undef TARGET_SIMD_CLONE_ADJUST
28186 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
28187
28188 #undef TARGET_SIMD_CLONE_USABLE
28189 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
28190
28191 #undef TARGET_COMP_TYPE_ATTRIBUTES
28192 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
28193
28194 #undef TARGET_GET_MULTILIB_ABI_NAME
28195 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
28196
28197 #undef TARGET_FNTYPE_ABI
28198 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
28199
28200 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
28201 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
28202
28203 #if CHECKING_P
28204 #undef TARGET_RUN_TARGET_SELFTESTS
28205 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
28206 #endif /* #if CHECKING_P */
28207
28208 #undef TARGET_ASM_POST_CFI_STARTPROC
28209 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
28210
28211 #undef TARGET_STRICT_ARGUMENT_NAMING
28212 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
28213
28214 #undef TARGET_MD_ASM_ADJUST
28215 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
28216
28217 #undef TARGET_ASM_FILE_END
28218 #define TARGET_ASM_FILE_END aarch64_asm_file_end
28219
28220 #undef TARGET_ASM_FUNCTION_EPILOGUE
28221 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
28222
28223 #undef TARGET_HAVE_SHADOW_CALL_STACK
28224 #define TARGET_HAVE_SHADOW_CALL_STACK true
28225
28226 #undef TARGET_CONST_ANCHOR
28227 #define TARGET_CONST_ANCHOR 0x1000000
28228
28229 struct gcc_target targetm = TARGET_INITIALIZER;
28230
28231 #include "gt-aarch64.h"