1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2020 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 #include "intl.h"
75 #include "expmed.h"
76 #include "function-abi.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
86 {
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
89
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
100
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
103
104 union
105 {
106 /* For MOV and MVN. */
107 struct
108 {
109 /* The value of each element. */
110 rtx value;
111
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
117
118 /* For INDEX. */
119 struct
120 {
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
125
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
129 };
130
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
136 {
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
140 }
141
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
151 {
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
155 }
156
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
162 {
163 u.index.base = base_in;
164 u.index.step = step_in;
165 }
166
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174 u.pattern = pattern_in;
175 }
176
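/* Example (illustrative only): a constant vector whose 16-bit elements
   are 0, 2, 4, ... can be described for the SVE INDEX instruction as:

     simd_immediate_info info (HImode, const0_rtx, gen_int_mode (2, HImode));

   i.e. base 0 and step 2, corresponding to "index z0.h, #0, #2".  */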
177 namespace {
178
179 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
180 class pure_scalable_type_info
181 {
182 public:
183 /* Represents the result of analyzing a type. All values are nonzero,
184 in the possibly forlorn hope that accidental conversions to bool
185 trigger a warning. */
186 enum analysis_result
187 {
188 /* The type does not have an ABI identity; i.e. it doesn't contain
189 at least one object whose type is a Fundamental Data Type. */
190 NO_ABI_IDENTITY = 1,
191
192 /* The type is definitely a Pure Scalable Type. */
193 IS_PST,
194
195 /* The type is definitely not a Pure Scalable Type. */
196 ISNT_PST,
197
198 /* It doesn't matter for PCS purposes whether the type is a Pure
199 Scalable Type or not, since the type will be handled the same
200 way regardless.
201
202 Specifically, this means that if the type is a Pure Scalable Type,
203 there aren't enough argument registers to hold it, and so it will
204 need to be passed or returned in memory. If the type isn't a
205 Pure Scalable Type, it's too big to be passed or returned in core
206 or SIMD&FP registers, and so again will need to go in memory. */
207 DOESNT_MATTER
208 };
209
210 /* Aggregates of 17 bytes or more are normally passed and returned
211 in memory, so aggregates of that size can safely be analyzed as
212 DOESNT_MATTER. We need to be able to collect enough pieces to
213 represent a PST that is smaller than that. Since predicates are
214 2 bytes in size for -msve-vector-bits=128, that means we need to be
215 able to store at least 8 pieces.
216
217 We also need to be able to store enough pieces to represent
218 a single vector in each vector argument register and a single
219 predicate in each predicate argument register. This means that
220 we need at least 12 pieces. */
221 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
222 #if __cplusplus >= 201103L
223 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
224 #endif
225
226 /* Describes one piece of a PST. Each piece is one of:
227
228 - a single Scalable Vector Type (SVT)
229 - a single Scalable Predicate Type (SPT)
230 - a PST containing 2, 3 or 4 SVTs, with no padding
231
232 It either represents a single built-in type or a PST formed from
233 multiple homogeneous built-in types. */
234 struct piece
235 {
236 rtx get_rtx (unsigned int, unsigned int) const;
237
238 /* The number of vector and predicate registers that the piece
239 occupies. One of the two is always zero. */
240 unsigned int num_zr;
241 unsigned int num_pr;
242
243 /* The mode of the registers described above. */
244 machine_mode mode;
245
246 /* If this piece is formed from multiple homogeneous built-in types,
247 this is the mode of the built-in types, otherwise it is MODE. */
248 machine_mode orig_mode;
249
250 /* The offset in bytes of the piece from the start of the type. */
251 poly_uint64_pod offset;
252 };
253
254 /* Divides types analyzed as IS_PST into individual pieces. The pieces
255 are in memory order. */
256 auto_vec<piece, MAX_PIECES> pieces;
257
258 unsigned int num_zr () const;
259 unsigned int num_pr () const;
260
261 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
262
263 analysis_result analyze (const_tree);
264 bool analyze_registers (const_tree);
265
266 private:
267 analysis_result analyze_array (const_tree);
268 analysis_result analyze_record (const_tree);
269 void add_piece (const piece &);
270 };
271 }
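/* Example (illustrative only): svfloat32_t is analyzed as a single piece
   occupying one Z register, svbool_t as a single piece occupying one
   P register, and a built-in tuple type such as svfloat32x3_t as a PST
   whose single piece occupies three Z registers.  An ordinary scalar type
   such as int, or any aggregate containing one, is analyzed as ISNT_PST.  */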
272
273 /* The current code model. */
274 enum aarch64_code_model aarch64_cmodel;
275
276 /* The number of 64-bit elements in an SVE vector. */
277 poly_uint16 aarch64_sve_vg;
278
279 #ifdef HAVE_AS_TLS
280 #undef TARGET_HAVE_TLS
281 #define TARGET_HAVE_TLS 1
282 #endif
283
284 static bool aarch64_composite_type_p (const_tree, machine_mode);
285 static bool aarch64_return_in_memory_1 (const_tree);
286 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
287 const_tree,
288 machine_mode *, int *,
289 bool *);
290 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
291 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
292 static void aarch64_override_options_after_change (void);
293 static bool aarch64_vector_mode_supported_p (machine_mode);
294 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
295 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
296 const_tree type,
297 int misalignment,
298 bool is_packed);
299 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
300 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
301 aarch64_addr_query_type);
302 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
303
304 /* Major revision number of the ARM Architecture implemented by the target. */
305 unsigned aarch64_architecture_version;
306
307 /* The processor for which instructions should be scheduled. */
308 enum aarch64_processor aarch64_tune = cortexa53;
309
310 /* Mask to specify which instruction scheduling options should be used. */
311 uint64_t aarch64_tune_flags = 0;
312
313 /* Global flag for PC relative loads. */
314 bool aarch64_pcrelative_literal_loads;
315
316 /* Global flag for whether frame pointer is enabled. */
317 bool aarch64_use_frame_pointer;
318
319 #define BRANCH_PROTECT_STR_MAX 255
320 char *accepted_branch_protection_string = NULL;
321
322 static enum aarch64_parse_opt_result
323 aarch64_parse_branch_protection (const char*, char**);
324
325 /* Support for command line parsing of boolean flags in the tuning
326 structures. */
327 struct aarch64_flag_desc
328 {
329 const char* name;
330 unsigned int flag;
331 };
332
333 #define AARCH64_FUSION_PAIR(name, internal_name) \
334 { name, AARCH64_FUSE_##internal_name },
335 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
336 {
337 { "none", AARCH64_FUSE_NOTHING },
338 #include "aarch64-fusion-pairs.def"
339 { "all", AARCH64_FUSE_ALL },
340 { NULL, AARCH64_FUSE_NOTHING }
341 };
342
343 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
344 { name, AARCH64_EXTRA_TUNE_##internal_name },
345 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
346 {
347 { "none", AARCH64_EXTRA_TUNE_NONE },
348 #include "aarch64-tuning-flags.def"
349 { "all", AARCH64_EXTRA_TUNE_ALL },
350 { NULL, AARCH64_EXTRA_TUNE_NONE }
351 };
352
353 /* Tuning parameters. */
354
355 static const struct cpu_addrcost_table generic_addrcost_table =
356 {
357 {
358 1, /* hi */
359 0, /* si */
360 0, /* di */
361 1, /* ti */
362 },
363 0, /* pre_modify */
364 0, /* post_modify */
365 0, /* register_offset */
366 0, /* register_sextend */
367 0, /* register_zextend */
368 0 /* imm_offset */
369 };
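/* Rough guide to the fields above (illustrative, based on the field names):
   the hi/si/di/ti block is the extra cost of scaled addressing for 16-,
   32-, 64- and 128-bit accesses, "register_offset" covers addresses such
   as [x0, x1], "register_sextend"/"register_zextend" cover [x0, w1, sxtw]
   and [x0, w1, uxtw], "pre_modify"/"post_modify" cover writeback forms
   like [x0, #16]! and [x0], #16, and "imm_offset" covers a plain
   [x0, #16].  */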
370
371 static const struct cpu_addrcost_table exynosm1_addrcost_table =
372 {
373 {
374 0, /* hi */
375 0, /* si */
376 0, /* di */
377 2, /* ti */
378 },
379 0, /* pre_modify */
380 0, /* post_modify */
381 1, /* register_offset */
382 1, /* register_sextend */
383 2, /* register_zextend */
384 0, /* imm_offset */
385 };
386
387 static const struct cpu_addrcost_table xgene1_addrcost_table =
388 {
389 {
390 1, /* hi */
391 0, /* si */
392 0, /* di */
393 1, /* ti */
394 },
395 1, /* pre_modify */
396 1, /* post_modify */
397 0, /* register_offset */
398 1, /* register_sextend */
399 1, /* register_zextend */
400 0, /* imm_offset */
401 };
402
403 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
404 {
405 {
406 1, /* hi */
407 1, /* si */
408 1, /* di */
409 2, /* ti */
410 },
411 0, /* pre_modify */
412 0, /* post_modify */
413 2, /* register_offset */
414 3, /* register_sextend */
415 3, /* register_zextend */
416 0, /* imm_offset */
417 };
418
419 static const struct cpu_addrcost_table tsv110_addrcost_table =
420 {
421 {
422 1, /* hi */
423 0, /* si */
424 0, /* di */
425 1, /* ti */
426 },
427 0, /* pre_modify */
428 0, /* post_modify */
429 0, /* register_offset */
430 1, /* register_sextend */
431 1, /* register_zextend */
432 0, /* imm_offset */
433 };
434
435 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
436 {
437 {
438 1, /* hi */
439 1, /* si */
440 1, /* di */
441 2, /* ti */
442 },
443 1, /* pre_modify */
444 1, /* post_modify */
445 3, /* register_offset */
446 3, /* register_sextend */
447 3, /* register_zextend */
448 2, /* imm_offset */
449 };
450
451 static const struct cpu_regmove_cost generic_regmove_cost =
452 {
453 1, /* GP2GP */
454 /* Avoid the use of slow int<->fp moves for spilling by setting
455 their cost higher than memmov_cost. */
456 5, /* GP2FP */
457 5, /* FP2GP */
458 2 /* FP2FP */
459 };
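/* For example, with the generic table above a GP<->FP transfer such as
   "fmov d0, x0" or "fmov x0, d0" is costed at 5, deliberately above the
   memmov_cost of 4 in generic_tunings below, so the register allocator
   prefers an ordinary spill and reload to bouncing a value through the
   other register file.  */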
460
461 static const struct cpu_regmove_cost cortexa57_regmove_cost =
462 {
463 1, /* GP2GP */
464 /* Avoid the use of slow int<->fp moves for spilling by setting
465 their cost higher than memmov_cost. */
466 5, /* GP2FP */
467 5, /* FP2GP */
468 2 /* FP2FP */
469 };
470
471 static const struct cpu_regmove_cost cortexa53_regmove_cost =
472 {
473 1, /* GP2GP */
474 /* Avoid the use of slow int<->fp moves for spilling by setting
475 their cost higher than memmov_cost. */
476 5, /* GP2FP */
477 5, /* FP2GP */
478 2 /* FP2FP */
479 };
480
481 static const struct cpu_regmove_cost exynosm1_regmove_cost =
482 {
483 1, /* GP2GP */
484 /* Avoid the use of slow int<->fp moves for spilling by setting
485 their cost higher than memmov_cost (the actual costs are 4 and 9). */
486 9, /* GP2FP */
487 9, /* FP2GP */
488 1 /* FP2FP */
489 };
490
491 static const struct cpu_regmove_cost thunderx_regmove_cost =
492 {
493 2, /* GP2GP */
494 2, /* GP2FP */
495 6, /* FP2GP */
496 4 /* FP2FP */
497 };
498
499 static const struct cpu_regmove_cost xgene1_regmove_cost =
500 {
501 1, /* GP2GP */
502 /* Avoid the use of slow int<->fp moves for spilling by setting
503 their cost higher than memmov_cost. */
504 8, /* GP2FP */
505 8, /* FP2GP */
506 2 /* FP2FP */
507 };
508
509 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
510 {
511 2, /* GP2GP */
512 /* Avoid the use of int<->fp moves for spilling. */
513 6, /* GP2FP */
514 6, /* FP2GP */
515 4 /* FP2FP */
516 };
517
518 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
519 {
520 1, /* GP2GP */
521 /* Avoid the use of int<->fp moves for spilling. */
522 8, /* GP2FP */
523 8, /* FP2GP */
524 4 /* FP2FP */
525 };
526
527 static const struct cpu_regmove_cost tsv110_regmove_cost =
528 {
529 1, /* GP2GP */
530 /* Avoid the use of slow int<->fp moves for spilling by setting
531 their cost higher than memmov_cost. */
532 2, /* GP2FP */
533 3, /* FP2GP */
534 2 /* FP2FP */
535 };
536
537 /* Generic costs for vector insn classes. */
538 static const struct cpu_vector_cost generic_vector_cost =
539 {
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 1, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 1, /* vec_int_stmt_cost */
545 1, /* vec_fp_stmt_cost */
546 2, /* vec_permute_cost */
547 2, /* vec_to_scalar_cost */
548 1, /* scalar_to_vec_cost */
549 1, /* vec_align_load_cost */
550 1, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 3, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
555 };
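/* Worked example using the generic numbers above: a vectorized loop body
   containing one aligned load, one integer operation and one store is
   costed at vec_align_load_cost + vec_int_stmt_cost + vec_store_cost
   = 1 + 1 + 1 = 3, and a final reduction to a scalar adds
   vec_to_scalar_cost = 2 on top of that.  */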
556
557 /* QDF24XX costs for vector insn classes. */
558 static const struct cpu_vector_cost qdf24xx_vector_cost =
559 {
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 1, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 1, /* vec_int_stmt_cost */
565 3, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 1, /* vec_to_scalar_cost */
568 1, /* scalar_to_vec_cost */
569 1, /* vec_align_load_cost */
570 1, /* vec_unalign_load_cost */
571 1, /* vec_unalign_store_cost */
572 1, /* vec_store_cost */
573 3, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
575 };
576
577 /* ThunderX costs for vector insn classes. */
578 static const struct cpu_vector_cost thunderx_vector_cost =
579 {
580 1, /* scalar_int_stmt_cost */
581 1, /* scalar_fp_stmt_cost */
582 3, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 4, /* vec_int_stmt_cost */
585 1, /* vec_fp_stmt_cost */
586 4, /* vec_permute_cost */
587 2, /* vec_to_scalar_cost */
588 2, /* scalar_to_vec_cost */
589 3, /* vec_align_load_cost */
590 5, /* vec_unalign_load_cost */
591 5, /* vec_unalign_store_cost */
592 1, /* vec_store_cost */
593 3, /* cond_taken_branch_cost */
594 3 /* cond_not_taken_branch_cost */
595 };
596
597 static const struct cpu_vector_cost tsv110_vector_cost =
598 {
599 1, /* scalar_int_stmt_cost */
600 1, /* scalar_fp_stmt_cost */
601 5, /* scalar_load_cost */
602 1, /* scalar_store_cost */
603 2, /* vec_int_stmt_cost */
604 2, /* vec_fp_stmt_cost */
605 2, /* vec_permute_cost */
606 3, /* vec_to_scalar_cost */
607 2, /* scalar_to_vec_cost */
608 5, /* vec_align_load_cost */
609 5, /* vec_unalign_load_cost */
610 1, /* vec_unalign_store_cost */
611 1, /* vec_store_cost */
612 1, /* cond_taken_branch_cost */
613 1 /* cond_not_taken_branch_cost */
614 };
615
616 /* Cortex-A57 costs for vector insn classes. */
617 static const struct cpu_vector_cost cortexa57_vector_cost =
618 {
619 1, /* scalar_int_stmt_cost */
620 1, /* scalar_fp_stmt_cost */
621 4, /* scalar_load_cost */
622 1, /* scalar_store_cost */
623 2, /* vec_int_stmt_cost */
624 2, /* vec_fp_stmt_cost */
625 3, /* vec_permute_cost */
626 8, /* vec_to_scalar_cost */
627 8, /* scalar_to_vec_cost */
628 4, /* vec_align_load_cost */
629 4, /* vec_unalign_load_cost */
630 1, /* vec_unalign_store_cost */
631 1, /* vec_store_cost */
632 1, /* cond_taken_branch_cost */
633 1 /* cond_not_taken_branch_cost */
634 };
635
636 static const struct cpu_vector_cost exynosm1_vector_cost =
637 {
638 1, /* scalar_int_stmt_cost */
639 1, /* scalar_fp_stmt_cost */
640 5, /* scalar_load_cost */
641 1, /* scalar_store_cost */
642 3, /* vec_int_stmt_cost */
643 3, /* vec_fp_stmt_cost */
644 3, /* vec_permute_cost */
645 3, /* vec_to_scalar_cost */
646 3, /* scalar_to_vec_cost */
647 5, /* vec_align_load_cost */
648 5, /* vec_unalign_load_cost */
649 1, /* vec_unalign_store_cost */
650 1, /* vec_store_cost */
651 1, /* cond_taken_branch_cost */
652 1 /* cond_not_taken_branch_cost */
653 };
654
655 /* X-Gene 1 costs for vector insn classes. */
656 static const struct cpu_vector_cost xgene1_vector_cost =
657 {
658 1, /* scalar_int_stmt_cost */
659 1, /* scalar_fp_stmt_cost */
660 5, /* scalar_load_cost */
661 1, /* scalar_store_cost */
662 2, /* vec_int_stmt_cost */
663 2, /* vec_fp_stmt_cost */
664 2, /* vec_permute_cost */
665 4, /* vec_to_scalar_cost */
666 4, /* scalar_to_vec_cost */
667 10, /* vec_align_load_cost */
668 10, /* vec_unalign_load_cost */
669 2, /* vec_unalign_store_cost */
670 2, /* vec_store_cost */
671 2, /* cond_taken_branch_cost */
672 1 /* cond_not_taken_branch_cost */
673 };
674
675 /* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan). */
676 static const struct cpu_vector_cost thunderx2t99_vector_cost =
677 {
678 1, /* scalar_int_stmt_cost */
679 6, /* scalar_fp_stmt_cost */
680 4, /* scalar_load_cost */
681 1, /* scalar_store_cost */
682 5, /* vec_int_stmt_cost */
683 6, /* vec_fp_stmt_cost */
684 10, /* vec_permute_cost */
685 6, /* vec_to_scalar_cost */
686 5, /* scalar_to_vec_cost */
687 8, /* vec_align_load_cost */
688 8, /* vec_unalign_load_cost */
689 4, /* vec_unalign_store_cost */
690 4, /* vec_store_cost */
691 2, /* cond_taken_branch_cost */
692 1 /* cond_not_taken_branch_cost */
693 };
694
695 /* Generic costs for branch instructions. */
696 static const struct cpu_branch_cost generic_branch_cost =
697 {
698 1, /* Predictable. */
699 3 /* Unpredictable. */
700 };
701
702 /* Generic approximation modes. */
703 static const cpu_approx_modes generic_approx_modes =
704 {
705 AARCH64_APPROX_NONE, /* division */
706 AARCH64_APPROX_NONE, /* sqrt */
707 AARCH64_APPROX_NONE /* recip_sqrt */
708 };
709
710 /* Approximation modes for Exynos M1. */
711 static const cpu_approx_modes exynosm1_approx_modes =
712 {
713 AARCH64_APPROX_NONE, /* division */
714 AARCH64_APPROX_ALL, /* sqrt */
715 AARCH64_APPROX_ALL /* recip_sqrt */
716 };
717
718 /* Approximation modes for X-Gene 1. */
719 static const cpu_approx_modes xgene1_approx_modes =
720 {
721 AARCH64_APPROX_NONE, /* division */
722 AARCH64_APPROX_NONE, /* sqrt */
723 AARCH64_APPROX_ALL /* recip_sqrt */
724 };
725
726 /* Generic prefetch settings (which disable prefetch). */
727 static const cpu_prefetch_tune generic_prefetch_tune =
728 {
729 0, /* num_slots */
730 -1, /* l1_cache_size */
731 -1, /* l1_cache_line_size */
732 -1, /* l2_cache_size */
733 true, /* prefetch_dynamic_strides */
734 -1, /* minimum_stride */
735 -1 /* default_opt_level */
736 };
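/* These records are, roughly speaking, the per-CPU defaults behind the
   cache-related --params: l1_cache_size and l2_cache_size are in KiB,
   l1_cache_line_size is in bytes, and -1 means "keep the generic
   default".  A non-negative default_opt_level (see e.g. qdf24xx below)
   turns on software prefetching by default from that optimization level
   onwards.  */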
737
738 static const cpu_prefetch_tune exynosm1_prefetch_tune =
739 {
740 0, /* num_slots */
741 -1, /* l1_cache_size */
742 64, /* l1_cache_line_size */
743 -1, /* l2_cache_size */
744 true, /* prefetch_dynamic_strides */
745 -1, /* minimum_stride */
746 -1 /* default_opt_level */
747 };
748
749 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
750 {
751 4, /* num_slots */
752 32, /* l1_cache_size */
753 64, /* l1_cache_line_size */
754 512, /* l2_cache_size */
755 false, /* prefetch_dynamic_strides */
756 2048, /* minimum_stride */
757 3 /* default_opt_level */
758 };
759
760 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
761 {
762 8, /* num_slots */
763 32, /* l1_cache_size */
764 128, /* l1_cache_line_size */
765 16*1024, /* l2_cache_size */
766 true, /* prefetch_dynamic_strides */
767 -1, /* minimum_stride */
768 3 /* default_opt_level */
769 };
770
771 static const cpu_prefetch_tune thunderx_prefetch_tune =
772 {
773 8, /* num_slots */
774 32, /* l1_cache_size */
775 128, /* l1_cache_line_size */
776 -1, /* l2_cache_size */
777 true, /* prefetch_dynamic_strides */
778 -1, /* minimum_stride */
779 -1 /* default_opt_level */
780 };
781
782 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
783 {
784 8, /* num_slots */
785 32, /* l1_cache_size */
786 64, /* l1_cache_line_size */
787 256, /* l2_cache_size */
788 true, /* prefetch_dynamic_strides */
789 -1, /* minimum_stride */
790 -1 /* default_opt_level */
791 };
792
793 static const cpu_prefetch_tune tsv110_prefetch_tune =
794 {
795 0, /* num_slots */
796 64, /* l1_cache_size */
797 64, /* l1_cache_line_size */
798 512, /* l2_cache_size */
799 true, /* prefetch_dynamic_strides */
800 -1, /* minimum_stride */
801 -1 /* default_opt_level */
802 };
803
804 static const cpu_prefetch_tune xgene1_prefetch_tune =
805 {
806 8, /* num_slots */
807 32, /* l1_cache_size */
808 64, /* l1_cache_line_size */
809 256, /* l2_cache_size */
810 true, /* prefetch_dynamic_strides */
811 -1, /* minimum_stride */
812 -1 /* default_opt_level */
813 };
814
815 static const struct tune_params generic_tunings =
816 {
817 &cortexa57_extra_costs,
818 &generic_addrcost_table,
819 &generic_regmove_cost,
820 &generic_vector_cost,
821 &generic_branch_cost,
822 &generic_approx_modes,
823 SVE_NOT_IMPLEMENTED, /* sve_width */
824 4, /* memmov_cost */
825 2, /* issue_rate */
826 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
827 "16:12", /* function_align. */
828 "4", /* jump_align. */
829 "8", /* loop_align. */
830 2, /* int_reassoc_width. */
831 4, /* fp_reassoc_width. */
832 1, /* vec_reassoc_width. */
833 2, /* min_div_recip_mul_sf. */
834 2, /* min_div_recip_mul_df. */
835 0, /* max_case_values. */
836 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
837 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
838 &generic_prefetch_tune
839 };
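/* Note on the alignment strings above: they use the same syntax as
   -falign-functions and friends, so "16:12" asks for 16-byte alignment
   but bounds the amount of padding that may be inserted to reach it (the
   value after the colon), keeping aggressive function alignment from
   bloating code with long runs of no-ops.  */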
840
841 static const struct tune_params cortexa35_tunings =
842 {
843 &cortexa53_extra_costs,
844 &generic_addrcost_table,
845 &cortexa53_regmove_cost,
846 &generic_vector_cost,
847 &generic_branch_cost,
848 &generic_approx_modes,
849 SVE_NOT_IMPLEMENTED, /* sve_width */
850 4, /* memmov_cost */
851 1, /* issue_rate */
852 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
853 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
854 "16", /* function_align. */
855 "4", /* jump_align. */
856 "8", /* loop_align. */
857 2, /* int_reassoc_width. */
858 4, /* fp_reassoc_width. */
859 1, /* vec_reassoc_width. */
860 2, /* min_div_recip_mul_sf. */
861 2, /* min_div_recip_mul_df. */
862 0, /* max_case_values. */
863 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
864 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
865 &generic_prefetch_tune
866 };
867
868 static const struct tune_params cortexa53_tunings =
869 {
870 &cortexa53_extra_costs,
871 &generic_addrcost_table,
872 &cortexa53_regmove_cost,
873 &generic_vector_cost,
874 &generic_branch_cost,
875 &generic_approx_modes,
876 SVE_NOT_IMPLEMENTED, /* sve_width */
877 4, /* memmov_cost */
878 2, /* issue_rate */
879 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
880 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
881 "16", /* function_align. */
882 "4", /* jump_align. */
883 "8", /* loop_align. */
884 2, /* int_reassoc_width. */
885 4, /* fp_reassoc_width. */
886 1, /* vec_reassoc_width. */
887 2, /* min_div_recip_mul_sf. */
888 2, /* min_div_recip_mul_df. */
889 0, /* max_case_values. */
890 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
891 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
892 &generic_prefetch_tune
893 };
894
895 static const struct tune_params cortexa57_tunings =
896 {
897 &cortexa57_extra_costs,
898 &generic_addrcost_table,
899 &cortexa57_regmove_cost,
900 &cortexa57_vector_cost,
901 &generic_branch_cost,
902 &generic_approx_modes,
903 SVE_NOT_IMPLEMENTED, /* sve_width */
904 4, /* memmov_cost */
905 3, /* issue_rate */
906 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
907 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
908 "16", /* function_align. */
909 "4", /* jump_align. */
910 "8", /* loop_align. */
911 2, /* int_reassoc_width. */
912 4, /* fp_reassoc_width. */
913 1, /* vec_reassoc_width. */
914 2, /* min_div_recip_mul_sf. */
915 2, /* min_div_recip_mul_df. */
916 0, /* max_case_values. */
917 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
918 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
919 &generic_prefetch_tune
920 };
921
922 static const struct tune_params cortexa72_tunings =
923 {
924 &cortexa57_extra_costs,
925 &generic_addrcost_table,
926 &cortexa57_regmove_cost,
927 &cortexa57_vector_cost,
928 &generic_branch_cost,
929 &generic_approx_modes,
930 SVE_NOT_IMPLEMENTED, /* sve_width */
931 4, /* memmov_cost */
932 3, /* issue_rate */
933 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
934 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
935 "16", /* function_align. */
936 "4", /* jump_align. */
937 "8", /* loop_align. */
938 2, /* int_reassoc_width. */
939 4, /* fp_reassoc_width. */
940 1, /* vec_reassoc_width. */
941 2, /* min_div_recip_mul_sf. */
942 2, /* min_div_recip_mul_df. */
943 0, /* max_case_values. */
944 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
945 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
946 &generic_prefetch_tune
947 };
948
949 static const struct tune_params cortexa73_tunings =
950 {
951 &cortexa57_extra_costs,
952 &generic_addrcost_table,
953 &cortexa57_regmove_cost,
954 &cortexa57_vector_cost,
955 &generic_branch_cost,
956 &generic_approx_modes,
957 SVE_NOT_IMPLEMENTED, /* sve_width */
958 4, /* memmov_cost. */
959 2, /* issue_rate. */
960 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
961 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
962 "16", /* function_align. */
963 "4", /* jump_align. */
964 "8", /* loop_align. */
965 2, /* int_reassoc_width. */
966 4, /* fp_reassoc_width. */
967 1, /* vec_reassoc_width. */
968 2, /* min_div_recip_mul_sf. */
969 2, /* min_div_recip_mul_df. */
970 0, /* max_case_values. */
971 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
972 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
973 &generic_prefetch_tune
974 };
975
976
977
978 static const struct tune_params exynosm1_tunings =
979 {
980 &exynosm1_extra_costs,
981 &exynosm1_addrcost_table,
982 &exynosm1_regmove_cost,
983 &exynosm1_vector_cost,
984 &generic_branch_cost,
985 &exynosm1_approx_modes,
986 SVE_NOT_IMPLEMENTED, /* sve_width */
987 4, /* memmov_cost */
988 3, /* issue_rate */
989 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
990 "4", /* function_align. */
991 "4", /* jump_align. */
992 "4", /* loop_align. */
993 2, /* int_reassoc_width. */
994 4, /* fp_reassoc_width. */
995 1, /* vec_reassoc_width. */
996 2, /* min_div_recip_mul_sf. */
997 2, /* min_div_recip_mul_df. */
998 48, /* max_case_values. */
999 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1000 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1001 &exynosm1_prefetch_tune
1002 };
1003
1004 static const struct tune_params thunderxt88_tunings =
1005 {
1006 &thunderx_extra_costs,
1007 &generic_addrcost_table,
1008 &thunderx_regmove_cost,
1009 &thunderx_vector_cost,
1010 &generic_branch_cost,
1011 &generic_approx_modes,
1012 SVE_NOT_IMPLEMENTED, /* sve_width */
1013 6, /* memmov_cost */
1014 2, /* issue_rate */
1015 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1016 "8", /* function_align. */
1017 "8", /* jump_align. */
1018 "8", /* loop_align. */
1019 2, /* int_reassoc_width. */
1020 4, /* fp_reassoc_width. */
1021 1, /* vec_reassoc_width. */
1022 2, /* min_div_recip_mul_sf. */
1023 2, /* min_div_recip_mul_df. */
1024 0, /* max_case_values. */
1025 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1026 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1027 &thunderxt88_prefetch_tune
1028 };
1029
1030 static const struct tune_params thunderx_tunings =
1031 {
1032 &thunderx_extra_costs,
1033 &generic_addrcost_table,
1034 &thunderx_regmove_cost,
1035 &thunderx_vector_cost,
1036 &generic_branch_cost,
1037 &generic_approx_modes,
1038 SVE_NOT_IMPLEMENTED, /* sve_width */
1039 6, /* memmov_cost */
1040 2, /* issue_rate */
1041 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1042 "8", /* function_align. */
1043 "8", /* jump_align. */
1044 "8", /* loop_align. */
1045 2, /* int_reassoc_width. */
1046 4, /* fp_reassoc_width. */
1047 1, /* vec_reassoc_width. */
1048 2, /* min_div_recip_mul_sf. */
1049 2, /* min_div_recip_mul_df. */
1050 0, /* max_case_values. */
1051 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1052 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1053 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1054 &thunderx_prefetch_tune
1055 };
1056
1057 static const struct tune_params tsv110_tunings =
1058 {
1059 &tsv110_extra_costs,
1060 &tsv110_addrcost_table,
1061 &tsv110_regmove_cost,
1062 &tsv110_vector_cost,
1063 &generic_branch_cost,
1064 &generic_approx_modes,
1065 SVE_NOT_IMPLEMENTED, /* sve_width */
1066 4, /* memmov_cost */
1067 4, /* issue_rate */
1068 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1069 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1070 "16", /* function_align. */
1071 "4", /* jump_align. */
1072 "8", /* loop_align. */
1073 2, /* int_reassoc_width. */
1074 4, /* fp_reassoc_width. */
1075 1, /* vec_reassoc_width. */
1076 2, /* min_div_recip_mul_sf. */
1077 2, /* min_div_recip_mul_df. */
1078 0, /* max_case_values. */
1079 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1080 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1081 &tsv110_prefetch_tune
1082 };
1083
1084 static const struct tune_params xgene1_tunings =
1085 {
1086 &xgene1_extra_costs,
1087 &xgene1_addrcost_table,
1088 &xgene1_regmove_cost,
1089 &xgene1_vector_cost,
1090 &generic_branch_cost,
1091 &xgene1_approx_modes,
1092 SVE_NOT_IMPLEMENTED, /* sve_width */
1093 6, /* memmov_cost */
1094 4, /* issue_rate */
1095 AARCH64_FUSE_NOTHING, /* fusible_ops */
1096 "16", /* function_align. */
1097 "16", /* jump_align. */
1098 "16", /* loop_align. */
1099 2, /* int_reassoc_width. */
1100 4, /* fp_reassoc_width. */
1101 1, /* vec_reassoc_width. */
1102 2, /* min_div_recip_mul_sf. */
1103 2, /* min_div_recip_mul_df. */
1104 17, /* max_case_values. */
1105 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1106 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1107 &xgene1_prefetch_tune
1108 };
1109
1110 static const struct tune_params emag_tunings =
1111 {
1112 &xgene1_extra_costs,
1113 &xgene1_addrcost_table,
1114 &xgene1_regmove_cost,
1115 &xgene1_vector_cost,
1116 &generic_branch_cost,
1117 &xgene1_approx_modes,
1118 SVE_NOT_IMPLEMENTED, /* sve_width */
1119 6, /* memmov_cost */
1120 4, /* issue_rate */
1121 AARCH64_FUSE_NOTHING, /* fusible_ops */
1122 "16", /* function_align. */
1123 "16", /* jump_align. */
1124 "16", /* loop_align. */
1125 2, /* int_reassoc_width. */
1126 4, /* fp_reassoc_width. */
1127 1, /* vec_reassoc_width. */
1128 2, /* min_div_recip_mul_sf. */
1129 2, /* min_div_recip_mul_df. */
1130 17, /* max_case_values. */
1131 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1132 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1133 &xgene1_prefetch_tune
1134 };
1135
1136 static const struct tune_params qdf24xx_tunings =
1137 {
1138 &qdf24xx_extra_costs,
1139 &qdf24xx_addrcost_table,
1140 &qdf24xx_regmove_cost,
1141 &qdf24xx_vector_cost,
1142 &generic_branch_cost,
1143 &generic_approx_modes,
1144 SVE_NOT_IMPLEMENTED, /* sve_width */
1145 4, /* memmov_cost */
1146 4, /* issue_rate */
1147 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1148 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1149 "16", /* function_align. */
1150 "8", /* jump_align. */
1151 "16", /* loop_align. */
1152 2, /* int_reassoc_width. */
1153 4, /* fp_reassoc_width. */
1154 1, /* vec_reassoc_width. */
1155 2, /* min_div_recip_mul_sf. */
1156 2, /* min_div_recip_mul_df. */
1157 0, /* max_case_values. */
1158 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1159 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1160 &qdf24xx_prefetch_tune
1161 };
1162
1163 /* Tuning structure for the Qualcomm Saphira core. Default to generic values
1164 for now. */
1165 static const struct tune_params saphira_tunings =
1166 {
1167 &generic_extra_costs,
1168 &generic_addrcost_table,
1169 &generic_regmove_cost,
1170 &generic_vector_cost,
1171 &generic_branch_cost,
1172 &generic_approx_modes,
1173 SVE_NOT_IMPLEMENTED, /* sve_width */
1174 4, /* memmov_cost */
1175 4, /* issue_rate */
1176 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1177 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1178 "16", /* function_align. */
1179 "8", /* jump_align. */
1180 "16", /* loop_align. */
1181 2, /* int_reassoc_width. */
1182 4, /* fp_reassoc_width. */
1183 1, /* vec_reassoc_width. */
1184 2, /* min_div_recip_mul_sf. */
1185 2, /* min_div_recip_mul_df. */
1186 0, /* max_case_values. */
1187 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1188 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1189 &generic_prefetch_tune
1190 };
1191
1192 static const struct tune_params thunderx2t99_tunings =
1193 {
1194 &thunderx2t99_extra_costs,
1195 &thunderx2t99_addrcost_table,
1196 &thunderx2t99_regmove_cost,
1197 &thunderx2t99_vector_cost,
1198 &generic_branch_cost,
1199 &generic_approx_modes,
1200 SVE_NOT_IMPLEMENTED, /* sve_width */
1201 4, /* memmov_cost. */
1202 4, /* issue_rate. */
1203 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1204 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1205 "16", /* function_align. */
1206 "8", /* jump_align. */
1207 "16", /* loop_align. */
1208 3, /* int_reassoc_width. */
1209 2, /* fp_reassoc_width. */
1210 2, /* vec_reassoc_width. */
1211 2, /* min_div_recip_mul_sf. */
1212 2, /* min_div_recip_mul_df. */
1213 0, /* max_case_values. */
1214 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1215 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1216 &thunderx2t99_prefetch_tune
1217 };
1218
1219 static const struct tune_params neoversen1_tunings =
1220 {
1221 &cortexa57_extra_costs,
1222 &generic_addrcost_table,
1223 &generic_regmove_cost,
1224 &cortexa57_vector_cost,
1225 &generic_branch_cost,
1226 &generic_approx_modes,
1227 SVE_NOT_IMPLEMENTED, /* sve_width */
1228 4, /* memmov_cost */
1229 3, /* issue_rate */
1230 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1231 "32:16", /* function_align. */
1232 "4", /* jump_align. */
1233 "32:16", /* loop_align. */
1234 2, /* int_reassoc_width. */
1235 4, /* fp_reassoc_width. */
1236 2, /* vec_reassoc_width. */
1237 2, /* min_div_recip_mul_sf. */
1238 2, /* min_div_recip_mul_df. */
1239 0, /* max_case_values. */
1240 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1241 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1242 &generic_prefetch_tune
1243 };
1244
1245 /* Support for fine-grained override of the tuning structures. */
1246 struct aarch64_tuning_override_function
1247 {
1248 const char* name;
1249 void (*parse_override)(const char*, struct tune_params*);
1250 };
1251
1252 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1253 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1254 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1255
1256 static const struct aarch64_tuning_override_function
1257 aarch64_tuning_override_functions[] =
1258 {
1259 { "fuse", aarch64_parse_fuse_string },
1260 { "tune", aarch64_parse_tune_string },
1261 { "sve_width", aarch64_parse_sve_width_string },
1262 { NULL, NULL }
1263 };
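/* For example, this table is what dispatches the clauses of
   -moverride=tune=...,fuse=...,sve_width=256: each comma-separated
   "name=value" clause is matched against the names above and handed to
   the corresponding parse_override callback, so sve_width=256 ends up
   overriding the sve_width field of the active tune_params.  */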
1264
1265 /* A processor implementing AArch64. */
1266 struct processor
1267 {
1268 const char *const name;
1269 enum aarch64_processor ident;
1270 enum aarch64_processor sched_core;
1271 enum aarch64_arch arch;
1272 unsigned architecture_version;
1273 const uint64_t flags;
1274 const struct tune_params *const tune;
1275 };
1276
1277 /* Architectures implementing AArch64. */
1278 static const struct processor all_architectures[] =
1279 {
1280 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1281 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1282 #include "aarch64-arches.def"
1283 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1284 };
1285
1286 /* Processor cores implementing AArch64. */
1287 static const struct processor all_cores[] =
1288 {
1289 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1290 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1291 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1292 FLAGS, &COSTS##_tunings},
1293 #include "aarch64-cores.def"
1294 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1295 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1296 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1297 };
1298
1299
1300 /* Target specification. These are populated by the -march, -mtune, -mcpu
1301 handling code or by target attributes. */
1302 static const struct processor *selected_arch;
1303 static const struct processor *selected_cpu;
1304 static const struct processor *selected_tune;
1305
1306 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1307
1308 /* The current tuning set. */
1309 struct tune_params aarch64_tune_params = generic_tunings;
1310
1311 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1312
1313 static tree
1314 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1315 int, bool *no_add_attrs)
1316 {
1317 /* Since we set fn_type_req to true, the caller should have checked
1318 this for us. */
1319 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1320 switch ((arm_pcs) fntype_abi (*node).id ())
1321 {
1322 case ARM_PCS_AAPCS64:
1323 case ARM_PCS_SIMD:
1324 return NULL_TREE;
1325
1326 case ARM_PCS_SVE:
1327 error ("the %qE attribute cannot be applied to an SVE function type",
1328 name);
1329 *no_add_attrs = true;
1330 return NULL_TREE;
1331
1332 case ARM_PCS_TLSDESC:
1333 case ARM_PCS_UNKNOWN:
1334 break;
1335 }
1336 gcc_unreachable ();
1337 }
1338
1339 /* Table of machine attributes. */
1340 static const struct attribute_spec aarch64_attribute_table[] =
1341 {
1342 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1343 affects_type_identity, handler, exclude } */
1344 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1345 handle_aarch64_vector_pcs_attribute, NULL },
1346 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
1347 aarch64_sve::handle_arm_sve_vector_bits_attribute,
1348 NULL },
1349 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
1350 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
1351 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1352 };
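/* Example usage of the user-visible attributes above (illustrative only;
   see the Arm C Language Extensions for the authoritative rules):

     // Requires -msve-vector-bits=256 so that the argument matches
     // __ARM_FEATURE_SVE_BITS.
     typedef svint32_t fixed_int32_t
       __attribute__ ((arm_sve_vector_bits (256)));

     void f (float *, float *) __attribute__ ((aarch64_vector_pcs));

   "SVE type" and "SVE sizeless type" contain spaces so that users cannot
   write them; they are internal markers applied to the SVE built-in
   types.  */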
1353
1354 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1355
1356 /* An ISA extension in the co-processor and main instruction set space. */
1357 struct aarch64_option_extension
1358 {
1359 const char *const name;
1360 const unsigned long flags_on;
1361 const unsigned long flags_off;
1362 };
1363
1364 typedef enum aarch64_cond_code
1365 {
1366 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1367 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1368 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1369 }
1370 aarch64_cc;
1371
1372 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1373
1374 struct aarch64_branch_protect_type
1375 {
1376 /* The type's name that the user passes to the branch-protection option
1377 string. */
1378 const char* name;
1379 /* Function to handle the protection type and set global variables.
1380 First argument is the string token corresponding to this type and the
1381 second argument is the next token in the option string.
1382 Return values:
1383 * AARCH64_PARSE_OK: Handling was successful.
1384 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1385 caller should print an error.
1386 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints
1387 its own error. */
1388 enum aarch64_parse_opt_result (*handler)(char*, char*);
1389 /* A list of types that can follow this type in the option string. */
1390 const aarch64_branch_protect_type* subtypes;
1391 unsigned int num_subtypes;
1392 };
1393
1394 static enum aarch64_parse_opt_result
1395 aarch64_handle_no_branch_protection (char* str, char* rest)
1396 {
1397 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1398 aarch64_enable_bti = 0;
1399 if (rest)
1400 {
1401 error ("unexpected %<%s%> after %<%s%>", rest, str);
1402 return AARCH64_PARSE_INVALID_FEATURE;
1403 }
1404 return AARCH64_PARSE_OK;
1405 }
1406
1407 static enum aarch64_parse_opt_result
1408 aarch64_handle_standard_branch_protection (char* str, char* rest)
1409 {
1410 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1411 aarch64_ra_sign_key = AARCH64_KEY_A;
1412 aarch64_enable_bti = 1;
1413 if (rest)
1414 {
1415 error ("unexpected %<%s%> after %<%s%>", rest, str);
1416 return AARCH64_PARSE_INVALID_FEATURE;
1417 }
1418 return AARCH64_PARSE_OK;
1419 }
1420
1421 static enum aarch64_parse_opt_result
1422 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1423 char* rest ATTRIBUTE_UNUSED)
1424 {
1425 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1426 aarch64_ra_sign_key = AARCH64_KEY_A;
1427 return AARCH64_PARSE_OK;
1428 }
1429
1430 static enum aarch64_parse_opt_result
1431 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1432 char* rest ATTRIBUTE_UNUSED)
1433 {
1434 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1435 return AARCH64_PARSE_OK;
1436 }
1437
1438 static enum aarch64_parse_opt_result
1439 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1440 char* rest ATTRIBUTE_UNUSED)
1441 {
1442 aarch64_ra_sign_key = AARCH64_KEY_B;
1443 return AARCH64_PARSE_OK;
1444 }
1445
1446 static enum aarch64_parse_opt_result
1447 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1448 char* rest ATTRIBUTE_UNUSED)
1449 {
1450 aarch64_enable_bti = 1;
1451 return AARCH64_PARSE_OK;
1452 }
1453
1454 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1455 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1456 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1457 { NULL, NULL, NULL, 0 }
1458 };
1459
1460 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1461 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1462 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1463 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1464 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1465 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1466 { NULL, NULL, NULL, 0 }
1467 };
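/* Putting the tables above together: -mbranch-protection=standard enables
   BTI and A-key return-address signing of non-leaf functions, while a
   string such as -mbranch-protection=pac-ret+leaf+b-key starts from
   "pac-ret" and then applies its "leaf" and "b-key" subtypes to sign
   leaf functions as well and to use the B key.  */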
1468
1469 /* The condition codes of the processor, and the inverse function. */
1470 static const char * const aarch64_condition_codes[] =
1471 {
1472 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1473 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1474 };
1475
1476 /* The preferred condition codes for SVE conditions. */
1477 static const char *const aarch64_sve_condition_codes[] =
1478 {
1479 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1480 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1481 };
1482
1483 /* Return the assembly token for svpattern value PATTERN. */
1484
1485 static const char *
1486 svpattern_token (enum aarch64_svpattern pattern)
1487 {
1488 switch (pattern)
1489 {
1490 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1491 AARCH64_FOR_SVPATTERN (CASE)
1492 #undef CASE
1493 case AARCH64_NUM_SVPATTERNS:
1494 break;
1495 }
1496 gcc_unreachable ();
1497 }
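/* For example, svpattern_token (AARCH64_SV_ALL) returns "all" and
   svpattern_token (AARCH64_SV_VL4) returns "vl4", as used when printing
   predicate constants such as "ptrue p0.b, vl4".  */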
1498
1499 /* Return the location of a piece that is known to be passed or returned
1500 in registers. FIRST_ZR is the first unused vector argument register
1501 and FIRST_PR is the first unused predicate argument register. */
1502
1503 rtx
1504 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
1505 unsigned int first_pr) const
1506 {
1507 gcc_assert (VECTOR_MODE_P (mode)
1508 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
1509 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
1510
1511 if (num_zr > 0 && num_pr == 0)
1512 return gen_rtx_REG (mode, first_zr);
1513
1514 if (num_zr == 0 && num_pr == 1)
1515 return gen_rtx_REG (mode, first_pr);
1516
1517 gcc_unreachable ();
1518 }
1519
1520 /* Return the total number of vector registers required by the PST. */
1521
1522 unsigned int
1523 pure_scalable_type_info::num_zr () const
1524 {
1525 unsigned int res = 0;
1526 for (unsigned int i = 0; i < pieces.length (); ++i)
1527 res += pieces[i].num_zr;
1528 return res;
1529 }
1530
1531 /* Return the total number of predicate registers required by the PST. */
1532
1533 unsigned int
1534 pure_scalable_type_info::num_pr () const
1535 {
1536 unsigned int res = 0;
1537 for (unsigned int i = 0; i < pieces.length (); ++i)
1538 res += pieces[i].num_pr;
1539 return res;
1540 }
1541
1542 /* Return the location of a PST that is known to be passed or returned
1543 in registers. FIRST_ZR is the first unused vector argument register
1544 and FIRST_PR is the first unused predicate argument register. */
1545
1546 rtx
1547 pure_scalable_type_info::get_rtx (machine_mode mode,
1548 unsigned int first_zr,
1549 unsigned int first_pr) const
1550 {
1551 /* Try to return a single REG if possible. This leads to better
1552 code generation; it isn't required for correctness. */
1553 if (mode == pieces[0].mode)
1554 {
1555 gcc_assert (pieces.length () == 1);
1556 return pieces[0].get_rtx (first_zr, first_pr);
1557 }
1558
1559 /* Build up a PARALLEL that contains the individual pieces. */
1560 rtvec rtxes = rtvec_alloc (pieces.length ());
1561 for (unsigned int i = 0; i < pieces.length (); ++i)
1562 {
1563 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1564 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1565 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1566 first_zr += pieces[i].num_zr;
1567 first_pr += pieces[i].num_pr;
1568 }
1569 return gen_rtx_PARALLEL (mode, rtxes);
1570 }
1571
1572 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
1573 in the AAPCS64. */
1574
1575 pure_scalable_type_info::analysis_result
1576 pure_scalable_type_info::analyze (const_tree type)
1577 {
1578 /* Prevent accidental reuse. */
1579 gcc_assert (pieces.is_empty ());
1580
1581 /* No code will be generated for erroneous types, so we won't establish
1582 an ABI mapping. */
1583 if (type == error_mark_node)
1584 return NO_ABI_IDENTITY;
1585
1586 /* Zero-sized types disappear in the language->ABI mapping. */
1587 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1588 return NO_ABI_IDENTITY;
1589
1590 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
1591 piece p = {};
1592 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1593 {
1594 machine_mode mode = TYPE_MODE_RAW (type);
1595 gcc_assert (VECTOR_MODE_P (mode)
1596 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1597
1598 p.mode = p.orig_mode = mode;
1599 add_piece (p);
1600 return IS_PST;
1601 }
1602
1603 /* Check for user-defined PSTs. */
1604 if (TREE_CODE (type) == ARRAY_TYPE)
1605 return analyze_array (type);
1606 if (TREE_CODE (type) == RECORD_TYPE)
1607 return analyze_record (type);
1608
1609 return ISNT_PST;
1610 }
1611
1612 /* Analyze a type that is known not to be passed or returned in memory.
1613 Return true if it has an ABI identity and is a Pure Scalable Type. */
1614
1615 bool
1616 pure_scalable_type_info::analyze_registers (const_tree type)
1617 {
1618 analysis_result result = analyze (type);
1619 gcc_assert (result != DOESNT_MATTER);
1620 return result == IS_PST;
1621 }
1622
1623 /* Subroutine of analyze for handling ARRAY_TYPEs. */
1624
1625 pure_scalable_type_info::analysis_result
1626 pure_scalable_type_info::analyze_array (const_tree type)
1627 {
1628 /* Analyze the element type. */
1629 pure_scalable_type_info element_info;
1630 analysis_result result = element_info.analyze (TREE_TYPE (type));
1631 if (result != IS_PST)
1632 return result;
1633
1634 /* An array of unknown, flexible or variable length will be passed and
1635 returned by reference whatever we do. */
1636 tree nelts_minus_one = array_type_nelts (type);
1637 if (!tree_fits_uhwi_p (nelts_minus_one))
1638 return DOESNT_MATTER;
1639
1640 /* Likewise if the array is constant-sized but too big to be interesting.
1641 The double checks against MAX_PIECES are to protect against overflow. */
1642 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1643 if (count > MAX_PIECES)
1644 return DOESNT_MATTER;
1645 count += 1;
1646 if (count * element_info.pieces.length () > MAX_PIECES)
1647 return DOESNT_MATTER;
1648
1649 /* The above checks should have weeded out elements of unknown size. */
1650 poly_uint64 element_bytes;
1651 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1652 gcc_unreachable ();
1653
1654 /* Build up the list of individual vectors and predicates. */
1655 gcc_assert (!element_info.pieces.is_empty ());
1656 for (unsigned int i = 0; i < count; ++i)
1657 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1658 {
1659 piece p = element_info.pieces[j];
1660 p.offset += i * element_bytes;
1661 add_piece (p);
1662 }
1663 return IS_PST;
1664 }
1665
1666 /* Subroutine of analyze for handling RECORD_TYPEs. */
1667
1668 pure_scalable_type_info::analysis_result
1669 pure_scalable_type_info::analyze_record (const_tree type)
1670 {
1671 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1672 {
1673 if (TREE_CODE (field) != FIELD_DECL)
1674 continue;
1675
1676 /* Zero-sized fields disappear in the language->ABI mapping. */
1677 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1678 continue;
1679
1680 /* All fields with an ABI identity must be PSTs for the record as
1681 a whole to be a PST. If any individual field is too big to be
1682 interesting then the record is too. */
1683 pure_scalable_type_info field_info;
1684 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1685 if (subresult == NO_ABI_IDENTITY)
1686 continue;
1687 if (subresult != IS_PST)
1688 return subresult;
1689
1690 /* Since all previous fields are PSTs, we ought to be able to track
1691 the field offset using poly_ints. */
1692 tree bitpos = bit_position (field);
1693 gcc_assert (poly_int_tree_p (bitpos));
1694
1695 /* For the same reason, it shouldn't be possible to create a PST field
1696 whose offset isn't byte-aligned. */
1697 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1698 BITS_PER_UNIT);
1699
1700 /* Punt if the record is too big to be interesting. */
1701 poly_uint64 bytepos;
1702 if (!wide_bytepos.to_uhwi (&bytepos)
1703 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1704 return DOESNT_MATTER;
1705
1706 /* Add the individual vectors and predicates in the field to the
1707 record's list. */
1708 gcc_assert (!field_info.pieces.is_empty ());
1709 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1710 {
1711 piece p = field_info.pieces[i];
1712 p.offset += bytepos;
1713 add_piece (p);
1714 }
1715 }
1716 /* Empty structures disappear in the language->ABI mapping. */
1717 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1718 }
1719
1720 /* Add P to the list of pieces in the type. */
1721
1722 void
1723 pure_scalable_type_info::add_piece (const piece &p)
1724 {
1725 /* Try to fold the new piece into the previous one to form a
1726 single-mode PST. For example, if we see three consecutive vectors
1727 of the same mode, we can represent them using the corresponding
1728 3-tuple mode.
1729
1730 This is purely an optimization. */
1731 if (!pieces.is_empty ())
1732 {
1733 piece &prev = pieces.last ();
1734 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1735 unsigned int nelems1, nelems2;
1736 if (prev.orig_mode == p.orig_mode
1737 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1738 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1739 GET_MODE_NUNITS (p.orig_mode), &nelems1)
1740 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1741 GET_MODE_NUNITS (p.orig_mode), &nelems2)
1742 && targetm.array_mode (p.orig_mode,
1743 nelems1 + nelems2).exists (&prev.mode))
1744 {
1745 prev.num_zr += p.num_zr;
1746 prev.num_pr += p.num_pr;
1747 return;
1748 }
1749 }
1750 pieces.quick_push (p);
1751 }
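/* Concrete illustration of the folding above: two consecutive pieces of
   mode VNx4SF at adjacent offsets are merged into a single piece whose
   mode is the two-vector tuple mode VNx8SF, so later code can move the
   pair as one register group instead of piecewise.  */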
1752
1753 /* Return true if at least one possible value of type TYPE includes at
1754 least one object of Pure Scalable Type, in the sense of the AAPCS64.
1755
1756 This is a relatively expensive test for some types, so it should
1757 generally be made as late as possible. */
1758
1759 static bool
1760 aarch64_some_values_include_pst_objects_p (const_tree type)
1761 {
1762 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1763 return false;
1764
1765 if (aarch64_sve::builtin_type_p (type))
1766 return true;
1767
1768 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
1769 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
1770
1771 if (RECORD_OR_UNION_TYPE_P (type))
1772 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1773 if (TREE_CODE (field) == FIELD_DECL
1774 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
1775 return true;
1776
1777 return false;
1778 }
1779
1780 /* Return the descriptor of the SIMD ABI. */
1781
1782 static const predefined_function_abi &
1783 aarch64_simd_abi (void)
1784 {
1785 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1786 if (!simd_abi.initialized_p ())
1787 {
1788 HARD_REG_SET full_reg_clobbers
1789 = default_function_abi.full_reg_clobbers ();
1790 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1791 if (FP_SIMD_SAVED_REGNUM_P (regno))
1792 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1793 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1794 }
1795 return simd_abi;
1796 }
1797
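/* For illustration, a sketch of how the SIMD ABI above is selected at
   the source level (the function name is an example only):

     void __attribute__ ((aarch64_vector_pcs)) vec_helper (void);

   Callers of such a function may assume that q8-q23 are preserved in
   full, rather than only the low 64 bits of v8-v15 as under the base
   PCS.  */
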
1798 /* Return the descriptor of the SVE PCS. */
1799
1800 static const predefined_function_abi &
1801 aarch64_sve_abi (void)
1802 {
1803 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1804 if (!sve_abi.initialized_p ())
1805 {
1806 HARD_REG_SET full_reg_clobbers
1807 = default_function_abi.full_reg_clobbers ();
1808 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1809 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1810 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
1811 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1812 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1813 }
1814 return sve_abi;
1815 }
1816
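/* For illustration, the SVE PCS above is selected implicitly (see
   aarch64_fntype_abi below) for functions that take or return SVE
   types, e.g. (a sketch; the names are examples only):

     #include <arm_sve.h>
     svfloat32_t scale_active (svbool_t pg, svfloat32_t x, float scale);

   Callers of such a function may assume that z8-z23 and p4-p15 are
   preserved, matching the registers removed from full_reg_clobbers
   above.  */
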
1817 /* Generate code to enable conditional branches in functions over 1 MiB. */
1818 const char *
1819 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1820 const char * branch_format)
1821 {
1822 rtx_code_label * tmp_label = gen_label_rtx ();
1823 char label_buf[256];
1824 char buffer[128];
1825 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1826 CODE_LABEL_NUMBER (tmp_label));
1827 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1828 rtx dest_label = operands[pos_label];
1829 operands[pos_label] = tmp_label;
1830
1831 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1832 output_asm_insn (buffer, operands);
1833
1834 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1835 operands[pos_label] = dest_label;
1836 output_asm_insn (buffer, operands);
1837 return "";
1838 }
1839
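/* For illustration, a sketch of the output of the function above when a
   conditional branch to .Lfar lies beyond the +/-1 MiB range of B.cond
   (the label names are examples only, and the caller is assumed to pass
   the inverted condition in BRANCH_FORMAT):

     b.ge  .Ltmp
     b     .Lfar
   .Ltmp:

   The unconditional B has a +/-128 MiB range, which is what makes the
   far branch possible.  */
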
1840 void
1841 aarch64_err_no_fpadvsimd (machine_mode mode)
1842 {
1843 if (TARGET_GENERAL_REGS_ONLY)
1844 if (FLOAT_MODE_P (mode))
1845 error ("%qs is incompatible with the use of floating-point types",
1846 "-mgeneral-regs-only");
1847 else
1848 error ("%qs is incompatible with the use of vector types",
1849 "-mgeneral-regs-only");
1850 else
1851 if (FLOAT_MODE_P (mode))
1852 error ("%qs feature modifier is incompatible with the use of"
1853 " floating-point types", "+nofp");
1854 else
1855 error ("%qs feature modifier is incompatible with the use of"
1856 " vector types", "+nofp");
1857 }
1858
1859 /* Report when we try to do something that requires SVE when SVE is disabled.
1860 This is an error of last resort and isn't very high-quality. It usually
1861 involves attempts to measure the vector length in some way. */
1862 static void
1863 aarch64_report_sve_required (void)
1864 {
1865 static bool reported_p = false;
1866
1867 /* Avoid reporting a slew of messages for a single oversight. */
1868 if (reported_p)
1869 return;
1870
1871 error ("this operation requires the SVE ISA extension");
1872 inform (input_location, "you can enable SVE using the command-line"
1873 " option %<-march%>, or by using the %<target%>"
1874 " attribute or pragma");
1875 reported_p = true;
1876 }
1877
1878 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1879 registers. */
1880 inline bool
1881 pr_or_ffr_regnum_p (unsigned int regno)
1882 {
1883 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1884 }
1885
1886 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1887 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1888 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1889 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1890 and GENERAL_REGS is lower than the memory cost (in this case the best class
1891 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1892 cost results in bad allocations with many redundant int<->FP moves which
1893 are expensive on various cores.
1894 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1895 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1896 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1897 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1898 The result of this is that it is no longer inefficient to have a higher
1899 memory move cost than the register move cost.
1900 */
1901
1902 static reg_class_t
1903 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1904 reg_class_t best_class)
1905 {
1906 machine_mode mode;
1907
1908 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1909 || !reg_class_subset_p (FP_REGS, allocno_class))
1910 return allocno_class;
1911
1912 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1913 || !reg_class_subset_p (FP_REGS, best_class))
1914 return best_class;
1915
1916 mode = PSEUDO_REGNO_MODE (regno);
1917 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1918 }
1919
1920 static unsigned int
1921 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1922 {
1923 if (GET_MODE_UNIT_SIZE (mode) == 4)
1924 return aarch64_tune_params.min_div_recip_mul_sf;
1925 return aarch64_tune_params.min_div_recip_mul_df;
1926 }
1927
1928 /* Return the reassociation width of treeop OPC with mode MODE. */
1929 static int
1930 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1931 {
1932 if (VECTOR_MODE_P (mode))
1933 return aarch64_tune_params.vec_reassoc_width;
1934 if (INTEGRAL_MODE_P (mode))
1935 return aarch64_tune_params.int_reassoc_width;
1936 /* Avoid reassociating floating point addition so we emit more FMAs. */
1937 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1938 return aarch64_tune_params.fp_reassoc_width;
1939 return 1;
1940 }
1941
1942 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1943 unsigned
1944 aarch64_dbx_register_number (unsigned regno)
1945 {
1946 if (GP_REGNUM_P (regno))
1947 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1948 else if (regno == SP_REGNUM)
1949 return AARCH64_DWARF_SP;
1950 else if (FP_REGNUM_P (regno))
1951 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1952 else if (PR_REGNUM_P (regno))
1953 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1954 else if (regno == VG_REGNUM)
1955 return AARCH64_DWARF_VG;
1956
1957 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1958 equivalent DWARF register. */
1959 return DWARF_FRAME_REGISTERS;
1960 }
1961
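/* For example, following the mapping above, x5 maps to
   AARCH64_DWARF_R0 + 5, v3 to AARCH64_DWARF_V0 + 3 and p2 to
   AARCH64_DWARF_P0 + 2, while registers with no DWARF equivalent (such
   as the condition flags) map to DWARF_FRAME_REGISTERS.  */
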
1962 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1963 integer, otherwise return X unmodified. */
1964 static rtx
1965 aarch64_bit_representation (rtx x)
1966 {
1967 if (CONST_DOUBLE_P (x))
1968 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1969 return x;
1970 }
1971
1972 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1973 static bool
1974 aarch64_advsimd_struct_mode_p (machine_mode mode)
1975 {
1976 return (TARGET_SIMD
1977 && (mode == OImode || mode == CImode || mode == XImode));
1978 }
1979
1980 /* Return true if MODE is an SVE predicate mode. */
1981 static bool
1982 aarch64_sve_pred_mode_p (machine_mode mode)
1983 {
1984 return (TARGET_SVE
1985 && (mode == VNx16BImode
1986 || mode == VNx8BImode
1987 || mode == VNx4BImode
1988 || mode == VNx2BImode));
1989 }
1990
1991 /* Three mutually-exclusive flags describing a vector or predicate type. */
1992 const unsigned int VEC_ADVSIMD = 1;
1993 const unsigned int VEC_SVE_DATA = 2;
1994 const unsigned int VEC_SVE_PRED = 4;
1995 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1996 a structure of 2, 3 or 4 vectors. */
1997 const unsigned int VEC_STRUCT = 8;
1998 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1999 vector has fewer significant bytes than a full SVE vector. */
2000 const unsigned int VEC_PARTIAL = 16;
2001 /* Useful combinations of the above. */
2002 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
2003 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
2004
2005 /* Return a set of flags describing the vector properties of mode MODE.
2006 Ignore modes that are not supported by the current target. */
2007 static unsigned int
2008 aarch64_classify_vector_mode (machine_mode mode)
2009 {
2010 if (aarch64_advsimd_struct_mode_p (mode))
2011 return VEC_ADVSIMD | VEC_STRUCT;
2012
2013 if (aarch64_sve_pred_mode_p (mode))
2014 return VEC_SVE_PRED;
2015
2016 /* Make the decision based on the mode's enum value rather than its
2017 properties, so that we keep the correct classification regardless
2018 of -msve-vector-bits. */
2019 switch (mode)
2020 {
2021 /* Partial SVE QI vectors. */
2022 case E_VNx2QImode:
2023 case E_VNx4QImode:
2024 case E_VNx8QImode:
2025 /* Partial SVE HI vectors. */
2026 case E_VNx2HImode:
2027 case E_VNx4HImode:
2028 /* Partial SVE SI vector. */
2029 case E_VNx2SImode:
2030 /* Partial SVE HF vectors. */
2031 case E_VNx2HFmode:
2032 case E_VNx4HFmode:
2033 /* Partial SVE SF vector. */
2034 case E_VNx2SFmode:
2035 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
2036
2037 case E_VNx16QImode:
2038 case E_VNx8HImode:
2039 case E_VNx4SImode:
2040 case E_VNx2DImode:
2041 case E_VNx8BFmode:
2042 case E_VNx8HFmode:
2043 case E_VNx4SFmode:
2044 case E_VNx2DFmode:
2045 return TARGET_SVE ? VEC_SVE_DATA : 0;
2046
2047 /* x2 SVE vectors. */
2048 case E_VNx32QImode:
2049 case E_VNx16HImode:
2050 case E_VNx8SImode:
2051 case E_VNx4DImode:
2052 case E_VNx16BFmode:
2053 case E_VNx16HFmode:
2054 case E_VNx8SFmode:
2055 case E_VNx4DFmode:
2056 /* x3 SVE vectors. */
2057 case E_VNx48QImode:
2058 case E_VNx24HImode:
2059 case E_VNx12SImode:
2060 case E_VNx6DImode:
2061 case E_VNx24BFmode:
2062 case E_VNx24HFmode:
2063 case E_VNx12SFmode:
2064 case E_VNx6DFmode:
2065 /* x4 SVE vectors. */
2066 case E_VNx64QImode:
2067 case E_VNx32HImode:
2068 case E_VNx16SImode:
2069 case E_VNx8DImode:
2070 case E_VNx32BFmode:
2071 case E_VNx32HFmode:
2072 case E_VNx16SFmode:
2073 case E_VNx8DFmode:
2074 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
2075
2076 /* 64-bit Advanced SIMD vectors. */
2077 case E_V8QImode:
2078 case E_V4HImode:
2079 case E_V2SImode:
2080 /* ...E_V1DImode doesn't exist. */
2081 case E_V4HFmode:
2082 case E_V4BFmode:
2083 case E_V2SFmode:
2084 case E_V1DFmode:
2085 /* 128-bit Advanced SIMD vectors. */
2086 case E_V16QImode:
2087 case E_V8HImode:
2088 case E_V4SImode:
2089 case E_V2DImode:
2090 case E_V8HFmode:
2091 case E_V8BFmode:
2092 case E_V4SFmode:
2093 case E_V2DFmode:
2094 return TARGET_SIMD ? VEC_ADVSIMD : 0;
2095
2096 default:
2097 return 0;
2098 }
2099 }
2100
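/* For example, following the switch above: V4SImode is classified as
   VEC_ADVSIMD, VNx4SImode as VEC_SVE_DATA, VNx2SImode as
   VEC_SVE_DATA | VEC_PARTIAL (32-bit elements in 64-bit containers),
   VNx8SImode as VEC_SVE_DATA | VEC_STRUCT (a 2-vector tuple) and
   VNx4BImode as VEC_SVE_PRED, in each case provided that the
   corresponding target feature is enabled.  */
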
2101 /* Return true if MODE is any of the data vector modes, including
2102 structure modes. */
2103 static bool
2104 aarch64_vector_data_mode_p (machine_mode mode)
2105 {
2106 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
2107 }
2108
2109 /* Return true if MODE is any form of SVE mode, including predicates,
2110 vectors and structures. */
2111 bool
2112 aarch64_sve_mode_p (machine_mode mode)
2113 {
2114 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
2115 }
2116
2117 /* Return true if MODE is an SVE data vector mode; either a single vector
2118 or a structure of vectors. */
2119 static bool
2120 aarch64_sve_data_mode_p (machine_mode mode)
2121 {
2122 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
2123 }
2124
2125 /* Return the number of defined bytes in one constituent vector of
2126 SVE mode MODE, which has vector flags VEC_FLAGS. */
2127 static poly_int64
2128 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
2129 {
2130 if (vec_flags & VEC_PARTIAL)
2131 /* A single partial vector. */
2132 return GET_MODE_SIZE (mode);
2133
2134 if (vec_flags & VEC_SVE_DATA)
2135 /* A single vector or a tuple. */
2136 return BYTES_PER_SVE_VECTOR;
2137
2138 /* A single predicate. */
2139 gcc_assert (vec_flags & VEC_SVE_PRED);
2140 return BYTES_PER_SVE_PRED;
2141 }
2142
2143 /* Implement target hook TARGET_ARRAY_MODE. */
2144 static opt_machine_mode
2145 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
2146 {
2147 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
2148 && IN_RANGE (nelems, 2, 4))
2149 return mode_for_vector (GET_MODE_INNER (mode),
2150 GET_MODE_NUNITS (mode) * nelems);
2151
2152 return opt_machine_mode ();
2153 }
2154
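/* For example, aarch64_array_mode (VNx4SImode, 3) yields VNx12SImode,
   which is also the mode one would expect for ACLE 3-vector tuples
   such as svint32x3_t.  */
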
2155 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
2156 static bool
2157 aarch64_array_mode_supported_p (machine_mode mode,
2158 unsigned HOST_WIDE_INT nelems)
2159 {
2160 if (TARGET_SIMD
2161 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
2162 || AARCH64_VALID_SIMD_DREG_MODE (mode))
2163 && (nelems >= 2 && nelems <= 4))
2164 return true;
2165
2166 return false;
2167 }
2168
2169 /* MODE is some form of SVE vector mode. For data modes, return the number
2170 of vector register bits that each element of MODE occupies, such as 64
2171 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
2172 in a 64-bit container). For predicate modes, return the number of
2173 data bits controlled by each significant predicate bit. */
2174
2175 static unsigned int
2176 aarch64_sve_container_bits (machine_mode mode)
2177 {
2178 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2179 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
2180 ? BITS_PER_SVE_VECTOR
2181 : GET_MODE_BITSIZE (mode));
2182 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
2183 }
2184
2185 /* Return the SVE predicate mode to use for elements that have
2186 ELEM_NBYTES bytes, if such a mode exists. */
2187
2188 opt_machine_mode
2189 aarch64_sve_pred_mode (unsigned int elem_nbytes)
2190 {
2191 if (TARGET_SVE)
2192 {
2193 if (elem_nbytes == 1)
2194 return VNx16BImode;
2195 if (elem_nbytes == 2)
2196 return VNx8BImode;
2197 if (elem_nbytes == 4)
2198 return VNx4BImode;
2199 if (elem_nbytes == 8)
2200 return VNx2BImode;
2201 }
2202 return opt_machine_mode ();
2203 }
2204
2205 /* Return the SVE predicate mode that should be used to control
2206 SVE mode MODE. */
2207
2208 machine_mode
2209 aarch64_sve_pred_mode (machine_mode mode)
2210 {
2211 unsigned int bits = aarch64_sve_container_bits (mode);
2212 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
2213 }
2214
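/* For example, VNx4SImode and VNx4SFmode are controlled by VNx4BImode,
   while the partial vector mode VNx2SImode (32-bit elements stored in
   64-bit containers) is controlled by VNx2BImode, since each
   significant predicate bit controls a 64-bit container.  */
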
2215 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
2216
2217 static opt_machine_mode
2218 aarch64_get_mask_mode (machine_mode mode)
2219 {
2220 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2221 if (vec_flags & VEC_SVE_DATA)
2222 return aarch64_sve_pred_mode (mode);
2223
2224 return default_get_mask_mode (mode);
2225 }
2226
2227 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
2228
2229 opt_machine_mode
2230 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
2231 {
2232 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
2233 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
2234 machine_mode mode;
2235 FOR_EACH_MODE_IN_CLASS (mode, mclass)
2236 if (inner_mode == GET_MODE_INNER (mode)
2237 && known_eq (nunits, GET_MODE_NUNITS (mode))
2238 && aarch64_sve_data_mode_p (mode))
2239 return mode;
2240 return opt_machine_mode ();
2241 }
2242
2243 /* Return the integer element mode associated with SVE mode MODE. */
2244
2245 static scalar_int_mode
2246 aarch64_sve_element_int_mode (machine_mode mode)
2247 {
2248 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2249 ? BITS_PER_SVE_VECTOR
2250 : GET_MODE_BITSIZE (mode));
2251 unsigned int elt_bits = vector_element_size (vector_bits,
2252 GET_MODE_NUNITS (mode));
2253 return int_mode_for_size (elt_bits, 0).require ();
2254 }
2255
2256 /* Return an integer element mode that contains exactly
2257 aarch64_sve_container_bits (MODE) bits. This is wider than
2258 aarch64_sve_element_int_mode if MODE is a partial vector,
2259 otherwise it's the same. */
2260
2261 static scalar_int_mode
2262 aarch64_sve_container_int_mode (machine_mode mode)
2263 {
2264 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
2265 }
2266
2267 /* Return the integer vector mode associated with SVE mode MODE.
2268 Unlike related_int_vector_mode, this can handle the case in which
2269 MODE is a predicate (and thus has a different total size). */
2270
2271 machine_mode
2272 aarch64_sve_int_mode (machine_mode mode)
2273 {
2274 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
2275 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
2276 }
2277
2278 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
2279
2280 static opt_machine_mode
2281 aarch64_vectorize_related_mode (machine_mode vector_mode,
2282 scalar_mode element_mode,
2283 poly_uint64 nunits)
2284 {
2285 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
2286
2287 /* If we're operating on SVE vectors, try to return an SVE mode. */
2288 poly_uint64 sve_nunits;
2289 if ((vec_flags & VEC_SVE_DATA)
2290 && multiple_p (BYTES_PER_SVE_VECTOR,
2291 GET_MODE_SIZE (element_mode), &sve_nunits))
2292 {
2293 machine_mode sve_mode;
2294 if (maybe_ne (nunits, 0U))
2295 {
2296 /* Try to find a full or partial SVE mode with exactly
2297 NUNITS units. */
2298 if (multiple_p (sve_nunits, nunits)
2299 && aarch64_sve_data_mode (element_mode,
2300 nunits).exists (&sve_mode))
2301 return sve_mode;
2302 }
2303 else
2304 {
2305 /* Take the preferred number of units from the number of bytes
2306 that fit in VECTOR_MODE. We always start by "autodetecting"
2307 a full vector mode with preferred_simd_mode, so vectors
2308 chosen here will also be full vector modes. Then
2309 autovectorize_vector_modes tries smaller starting modes
2310 and thus smaller preferred numbers of units. */
2311 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
2312 if (aarch64_sve_data_mode (element_mode,
2313 sve_nunits).exists (&sve_mode))
2314 return sve_mode;
2315 }
2316 }
2317
2318 /* Prefer to use one 128-bit vector instead of two 64-bit vectors. */
2319 if ((vec_flags & VEC_ADVSIMD)
2320 && known_eq (nunits, 0U)
2321 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
2322 && maybe_ge (GET_MODE_BITSIZE (element_mode)
2323 * GET_MODE_NUNITS (vector_mode), 128U))
2324 {
2325 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
2326 if (VECTOR_MODE_P (res))
2327 return res;
2328 }
2329
2330 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2331 }
2332
2333 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
2334 prefer to use the first arithmetic operand as the else value if
2335 the else value doesn't matter, since that exactly matches the SVE
2336 destructive merging form. For ternary operations we could either
2337 pick the first operand and use FMAD-like instructions or the last
2338 operand and use FMLA-like instructions; the latter seems more
2339 natural. */
2340
2341 static tree
2342 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
2343 {
2344 return nops == 3 ? ops[2] : ops[0];
2345 }
2346
2347 /* Implement TARGET_HARD_REGNO_NREGS. */
2348
2349 static unsigned int
2350 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
2351 {
2352 /* ??? Logically we should only need to provide a value when
2353 HARD_REGNO_MODE_OK says that the combination is valid,
2354 but at the moment we need to handle all modes. Just ignore
2355 any runtime parts for registers that can't store them. */
2356 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
2357 switch (aarch64_regno_regclass (regno))
2358 {
2359 case FP_REGS:
2360 case FP_LO_REGS:
2361 case FP_LO8_REGS:
2362 {
2363 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2364 if (vec_flags & VEC_SVE_DATA)
2365 return exact_div (GET_MODE_SIZE (mode),
2366 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2367 return CEIL (lowest_size, UNITS_PER_VREG);
2368 }
2369 case PR_REGS:
2370 case PR_LO_REGS:
2371 case PR_HI_REGS:
2372 case FFR_REGS:
2373 case PR_AND_FFR_REGS:
2374 return 1;
2375 default:
2376 return CEIL (lowest_size, UNITS_PER_WORD);
2377 }
2378 gcc_unreachable ();
2379 }
2380
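/* For example, with the definition above: TImode occupies two general
   registers, a single SVE vector such as VNx4SImode occupies one FP
   register regardless of -msve-vector-bits, and the 2-vector tuple
   mode VNx8SImode occupies two FP registers.  */
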
2381 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2382
2383 static bool
2384 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2385 {
2386 if (GET_MODE_CLASS (mode) == MODE_CC)
2387 return regno == CC_REGNUM;
2388
2389 if (regno == VG_REGNUM)
2390 /* This must have the same size as _Unwind_Word. */
2391 return mode == DImode;
2392
2393 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2394 if (vec_flags & VEC_SVE_PRED)
2395 return pr_or_ffr_regnum_p (regno);
2396
2397 if (pr_or_ffr_regnum_p (regno))
2398 return false;
2399
2400 if (regno == SP_REGNUM)
2401 /* The purpose of comparing with ptr_mode is to support the
2402 global register variable associated with the stack pointer
2403 register via the syntax of asm ("wsp") in ILP32. */
2404 return mode == Pmode || mode == ptr_mode;
2405
2406 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2407 return mode == Pmode;
2408
2409 if (GP_REGNUM_P (regno))
2410 {
2411 if (vec_flags & VEC_ANY_SVE)
2412 return false;
2413 if (known_le (GET_MODE_SIZE (mode), 8))
2414 return true;
2415 if (known_le (GET_MODE_SIZE (mode), 16))
2416 return (regno & 1) == 0;
2417 }
2418 else if (FP_REGNUM_P (regno))
2419 {
2420 if (vec_flags & VEC_STRUCT)
2421 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2422 else
2423 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2424 }
2425
2426 return false;
2427 }
2428
2429 /* Return true if a function with type FNTYPE returns its value in
2430 SVE vector or predicate registers. */
2431
2432 static bool
2433 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2434 {
2435 tree return_type = TREE_TYPE (fntype);
2436
2437 pure_scalable_type_info pst_info;
2438 switch (pst_info.analyze (return_type))
2439 {
2440 case pure_scalable_type_info::IS_PST:
2441 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2442 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2443
2444 case pure_scalable_type_info::DOESNT_MATTER:
2445 gcc_assert (aarch64_return_in_memory_1 (return_type));
2446 return false;
2447
2448 case pure_scalable_type_info::NO_ABI_IDENTITY:
2449 case pure_scalable_type_info::ISNT_PST:
2450 return false;
2451 }
2452 gcc_unreachable ();
2453 }
2454
2455 /* Return true if a function with type FNTYPE takes arguments in
2456 SVE vector or predicate registers. */
2457
2458 static bool
2459 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2460 {
2461 CUMULATIVE_ARGS args_so_far_v;
2462 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2463 NULL_TREE, 0, true);
2464 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2465
2466 for (tree chain = TYPE_ARG_TYPES (fntype);
2467 chain && chain != void_list_node;
2468 chain = TREE_CHAIN (chain))
2469 {
2470 tree arg_type = TREE_VALUE (chain);
2471 if (arg_type == error_mark_node)
2472 return false;
2473
2474 function_arg_info arg (arg_type, /*named=*/true);
2475 apply_pass_by_reference_rules (&args_so_far_v, arg);
2476 pure_scalable_type_info pst_info;
2477 if (pst_info.analyze_registers (arg.type))
2478 {
2479 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2480 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2481 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2482 return true;
2483 }
2484
2485 targetm.calls.function_arg_advance (args_so_far, arg);
2486 }
2487 return false;
2488 }
2489
2490 /* Implement TARGET_FNTYPE_ABI. */
2491
2492 static const predefined_function_abi &
2493 aarch64_fntype_abi (const_tree fntype)
2494 {
2495 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2496 return aarch64_simd_abi ();
2497
2498 if (aarch64_returns_value_in_sve_regs_p (fntype)
2499 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2500 return aarch64_sve_abi ();
2501
2502 return default_function_abi;
2503 }
2504
2505 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2506
2507 static bool
2508 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2509 {
2510 return (aarch64_sve::builtin_type_p (type1)
2511 == aarch64_sve::builtin_type_p (type2));
2512 }
2513
2514 /* Return true if we should emit CFI for register REGNO. */
2515
2516 static bool
2517 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2518 {
2519 return (GP_REGNUM_P (regno)
2520 || !default_function_abi.clobbers_full_reg_p (regno));
2521 }
2522
2523 /* Return the mode we should use to save and restore register REGNO. */
2524
2525 static machine_mode
2526 aarch64_reg_save_mode (unsigned int regno)
2527 {
2528 if (GP_REGNUM_P (regno))
2529 return DImode;
2530
2531 if (FP_REGNUM_P (regno))
2532 switch (crtl->abi->id ())
2533 {
2534 case ARM_PCS_AAPCS64:
2535 /* Only the low 64 bits are saved by the base PCS. */
2536 return DFmode;
2537
2538 case ARM_PCS_SIMD:
2539 /* The vector PCS saves the low 128 bits (which is the full
2540 register on non-SVE targets). */
2541 return TFmode;
2542
2543 case ARM_PCS_SVE:
2544 /* Use vectors of DImode for registers that need frame
2545 information, so that the first 64 bits of the save slot
2546 are always the equivalent of what storing D<n> would give. */
2547 if (aarch64_emit_cfi_for_reg_p (regno))
2548 return VNx2DImode;
2549
2550 /* Use vectors of bytes otherwise, so that the layout is
2551 endian-agnostic, and so that we can use LDR and STR for
2552 big-endian targets. */
2553 return VNx16QImode;
2554
2555 case ARM_PCS_TLSDESC:
2556 case ARM_PCS_UNKNOWN:
2557 break;
2558 }
2559
2560 if (PR_REGNUM_P (regno))
2561 /* Save the full predicate register. */
2562 return VNx16BImode;
2563
2564 gcc_unreachable ();
2565 }
2566
2567 /* Implement TARGET_INSN_CALLEE_ABI. */
2568
2569 const predefined_function_abi &
2570 aarch64_insn_callee_abi (const rtx_insn *insn)
2571 {
2572 rtx pat = PATTERN (insn);
2573 gcc_assert (GET_CODE (pat) == PARALLEL);
2574 rtx unspec = XVECEXP (pat, 0, 1);
2575 gcc_assert (GET_CODE (unspec) == UNSPEC
2576 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2577 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2578 }
2579
2580 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2581 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2582 clobbers the top 64 bits when restoring the bottom 64 bits. */
2583
2584 static bool
2585 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2586 unsigned int regno,
2587 machine_mode mode)
2588 {
2589 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2590 {
2591 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2592 unsigned int nregs = hard_regno_nregs (regno, mode);
2593 if (nregs > 1)
2594 per_register_size = exact_div (per_register_size, nregs);
2595 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2596 return maybe_gt (per_register_size, 16);
2597 return maybe_gt (per_register_size, 8);
2598 }
2599 return false;
2600 }
2601
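/* For example, under the base PCS a 128-bit value such as V4SImode in
   v8 is treated as partially clobbered (only the low 64 bits survive a
   call), under the vector PCS it is not, and under the SVE PCS FP
   registers are never considered partially clobbered by this hook.  */
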
2602 /* Implement REGMODE_NATURAL_SIZE. */
2603 poly_uint64
2604 aarch64_regmode_natural_size (machine_mode mode)
2605 {
2606 /* The natural size for SVE data modes is one SVE data vector,
2607 and similarly for predicates. We can't independently modify
2608 anything smaller than that. */
2609 /* ??? For now, only do this for variable-width SVE registers.
2610 Doing it for constant-sized registers breaks lower-subreg.c. */
2611 /* ??? And once that's fixed, we should probably have similar
2612 code for Advanced SIMD. */
2613 if (!aarch64_sve_vg.is_constant ())
2614 {
2615 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2616 if (vec_flags & VEC_SVE_PRED)
2617 return BYTES_PER_SVE_PRED;
2618 if (vec_flags & VEC_SVE_DATA)
2619 return BYTES_PER_SVE_VECTOR;
2620 }
2621 return UNITS_PER_WORD;
2622 }
2623
2624 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2625 machine_mode
2626 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2627 machine_mode mode)
2628 {
2629 /* The predicate mode determines which bits are significant and
2630 which are "don't care". Decreasing the number of lanes would
2631 lose data while increasing the number of lanes would make bits
2632 unnecessarily significant. */
2633 if (PR_REGNUM_P (regno))
2634 return mode;
2635 if (known_ge (GET_MODE_SIZE (mode), 4))
2636 return mode;
2637 else
2638 return SImode;
2639 }
2640
2641 /* Return true if I's bits are consecutive ones from the MSB. */
2642 bool
2643 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2644 {
2645 return exact_log2 (-i) != HOST_WIDE_INT_M1;
2646 }
2647
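/* For example, 0xffffffffffff0000 negates to 0x10000, a power of two,
   so the function above returns true, whereas 0xffff0000ffff0000 does
   not negate to a power of two and so returns false.  */
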
2648 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2649 that strcpy from constants will be faster. */
2650
2651 static HOST_WIDE_INT
2652 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2653 {
2654 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2655 return MAX (align, BITS_PER_WORD);
2656 return align;
2657 }
2658
2659 /* Return true if calls to DECL should be treated as
2660 long-calls (i.e. called via a register). */
2661 static bool
2662 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2663 {
2664 return false;
2665 }
2666
2667 /* Return true if calls to symbol-ref SYM should be treated as
2668 long-calls (i.e. called via a register). */
2669 bool
2670 aarch64_is_long_call_p (rtx sym)
2671 {
2672 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2673 }
2674
2675 /* Return true if calls to symbol-ref SYM should not go through
2676 plt stubs. */
2677
2678 bool
2679 aarch64_is_noplt_call_p (rtx sym)
2680 {
2681 const_tree decl = SYMBOL_REF_DECL (sym);
2682
2683 if (flag_pic
2684 && decl
2685 && (!flag_plt
2686 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2687 && !targetm.binds_local_p (decl))
2688 return true;
2689
2690 return false;
2691 }
2692
2693 /* Return true if the offsets to a zero/sign-extract operation
2694 represent an expression that matches an extend operation. The
2695 operands represent the parameters from
2696
2697 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2698 bool
2699 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2700 rtx extract_imm)
2701 {
2702 HOST_WIDE_INT mult_val, extract_val;
2703
2704 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2705 return false;
2706
2707 mult_val = INTVAL (mult_imm);
2708 extract_val = INTVAL (extract_imm);
2709
2710 if (extract_val > 8
2711 && extract_val < GET_MODE_BITSIZE (mode)
2712 && exact_log2 (extract_val & ~7) > 0
2713 && (extract_val & 7) <= 4
2714 && mult_val == (1 << (extract_val & 7)))
2715 return true;
2716
2717 return false;
2718 }
2719
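/* For example (a sketch of the intended use rather than a quote from
   the machine description): MULT_IMM == 4 and EXTRACT_IMM == 34 pass
   the checks above; the low 34 bits of (reg * 4) are the low 32 bits
   of the register shifted left by 2, i.e. an extend of a 32-bit value
   combined with a shift by 2, as in extended-register operands such as
   "uxtw #2".  */
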
2720 /* Emit an insn that's a simple single-set. Both the operands must be
2721 known to be valid. */
2722 inline static rtx_insn *
2723 emit_set_insn (rtx x, rtx y)
2724 {
2725 return emit_insn (gen_rtx_SET (x, y));
2726 }
2727
2728 /* X and Y are two things to compare using CODE. Emit the compare insn and
2729 return the rtx for register 0 in the proper mode. */
2730 rtx
2731 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2732 {
2733 machine_mode cmp_mode = GET_MODE (x);
2734 machine_mode cc_mode;
2735 rtx cc_reg;
2736
2737 if (cmp_mode == TImode)
2738 {
2739 gcc_assert (code == NE);
2740
2741 cc_mode = CCmode;
2742 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2743
2744 rtx x_lo = operand_subword (x, 0, 0, TImode);
2745 rtx y_lo = operand_subword (y, 0, 0, TImode);
2746 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2747
2748 rtx x_hi = operand_subword (x, 1, 0, TImode);
2749 rtx y_hi = operand_subword (y, 1, 0, TImode);
2750 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2751 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2752 GEN_INT (AARCH64_EQ)));
2753 }
2754 else
2755 {
2756 cc_mode = SELECT_CC_MODE (code, x, y);
2757 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2758 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2759 }
2760 return cc_reg;
2761 }
2762
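/* For illustration, the TImode NE case above typically assembles to a
   compare of the low halves followed by a conditional compare of the
   high halves, along the lines of (register allocation is an example
   only):

     cmp   x0, x2
     ccmp  x1, x3, 0, eq

   so that the Z flag is set only if both halves compare equal.  */
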
2763 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2764
2765 static rtx
2766 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2767 machine_mode y_mode)
2768 {
2769 if (y_mode == E_QImode || y_mode == E_HImode)
2770 {
2771 if (CONST_INT_P (y))
2772 {
2773 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2774 y_mode = SImode;
2775 }
2776 else
2777 {
2778 rtx t, cc_reg;
2779 machine_mode cc_mode;
2780
2781 t = gen_rtx_ZERO_EXTEND (SImode, y);
2782 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2783 cc_mode = CC_SWPmode;
2784 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2785 emit_set_insn (cc_reg, t);
2786 return cc_reg;
2787 }
2788 }
2789
2790 if (!aarch64_plus_operand (y, y_mode))
2791 y = force_reg (y_mode, y);
2792
2793 return aarch64_gen_compare_reg (code, x, y);
2794 }
2795
2796 /* Build the SYMBOL_REF for __tls_get_addr. */
2797
2798 static GTY(()) rtx tls_get_addr_libfunc;
2799
2800 rtx
2801 aarch64_tls_get_addr (void)
2802 {
2803 if (!tls_get_addr_libfunc)
2804 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2805 return tls_get_addr_libfunc;
2806 }
2807
2808 /* Return the TLS model to use for ADDR. */
2809
2810 static enum tls_model
2811 tls_symbolic_operand_type (rtx addr)
2812 {
2813 enum tls_model tls_kind = TLS_MODEL_NONE;
2814 if (GET_CODE (addr) == CONST)
2815 {
2816 poly_int64 addend;
2817 rtx sym = strip_offset (addr, &addend);
2818 if (GET_CODE (sym) == SYMBOL_REF)
2819 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2820 }
2821 else if (GET_CODE (addr) == SYMBOL_REF)
2822 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2823
2824 return tls_kind;
2825 }
2826
2827 /* We allow lo_sum's in our legitimate addresses so that combine
2828 can take care of combining addresses where necessary, but for
2829 generation purposes, we generate the address
2830 as follows:
2831 RTL Absolute
2832 tmp = hi (symbol_ref); adrp x1, foo
2833 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2834 nop
2835
2836 PIC TLS
2837 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2838 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2839 bl __tls_get_addr
2840 nop
2841
2842 Load TLS symbol, depending on TLS mechanism and TLS access model.
2843
2844 Global Dynamic - Traditional TLS:
2845 adrp tmp, :tlsgd:imm
2846 add dest, tmp, #:tlsgd_lo12:imm
2847 bl __tls_get_addr
2848
2849 Global Dynamic - TLS Descriptors:
2850 adrp dest, :tlsdesc:imm
2851 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2852 add dest, dest, #:tlsdesc_lo12:imm
2853 blr tmp
2854 mrs tp, tpidr_el0
2855 add dest, dest, tp
2856
2857 Initial Exec:
2858 mrs tp, tpidr_el0
2859 adrp tmp, :gottprel:imm
2860 ldr dest, [tmp, #:gottprel_lo12:imm]
2861 add dest, dest, tp
2862
2863 Local Exec:
2864 mrs tp, tpidr_el0
2865 add t0, tp, #:tprel_hi12:imm, lsl #12
2866 add t0, t0, #:tprel_lo12_nc:imm
2867 */
2868
2869 static void
2870 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2871 enum aarch64_symbol_type type)
2872 {
2873 switch (type)
2874 {
2875 case SYMBOL_SMALL_ABSOLUTE:
2876 {
2877 /* In ILP32, the mode of dest can be either SImode or DImode. */
2878 rtx tmp_reg = dest;
2879 machine_mode mode = GET_MODE (dest);
2880
2881 gcc_assert (mode == Pmode || mode == ptr_mode);
2882
2883 if (can_create_pseudo_p ())
2884 tmp_reg = gen_reg_rtx (mode);
2885
2886 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2887 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2888 return;
2889 }
2890
2891 case SYMBOL_TINY_ABSOLUTE:
2892 emit_insn (gen_rtx_SET (dest, imm));
2893 return;
2894
2895 case SYMBOL_SMALL_GOT_28K:
2896 {
2897 machine_mode mode = GET_MODE (dest);
2898 rtx gp_rtx = pic_offset_table_rtx;
2899 rtx insn;
2900 rtx mem;
2901
2902 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2903 here before RTL expansion. Tree IVOPTS will generate RTL patterns to
2904 decide rtx costs, in which case pic_offset_table_rtx is not
2905 initialized. In that case there is no need to generate the first adrp
2906 instruction, since the final cost for global variable access is
2907 one instruction. */
2908 if (gp_rtx != NULL)
2909 {
2910 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2911 use the page base as the GOT base, the first page may be wasted;
2912 in the worst case there is only 28K of space for the GOT).
2913
2914 The generated instruction sequence for accessing a global variable
2915 is:
2916
2917 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2918
2919 Only one instruction is needed. But we must initialize
2920 pic_offset_table_rtx properly. We generate an initialization insn
2921 for every global access, and rely on CSE to remove the redundant ones.
2922
2923 The final instruction sequence will look like the following
2924 for multiple global variable accesses:
2925
2926 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2927
2928 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2929 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2930 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2931 ... */
2932
2933 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2934 crtl->uses_pic_offset_table = 1;
2935 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2936
2937 if (mode != GET_MODE (gp_rtx))
2938 gp_rtx = gen_lowpart (mode, gp_rtx);
2939
2940 }
2941
2942 if (mode == ptr_mode)
2943 {
2944 if (mode == DImode)
2945 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2946 else
2947 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2948
2949 mem = XVECEXP (SET_SRC (insn), 0, 0);
2950 }
2951 else
2952 {
2953 gcc_assert (mode == Pmode);
2954
2955 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2956 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2957 }
2958
2959 /* The operand is expected to be a MEM. Whenever the related insn
2960 pattern changes, the code above that calculates MEM should be
2961 updated. */
2962 gcc_assert (GET_CODE (mem) == MEM);
2963 MEM_READONLY_P (mem) = 1;
2964 MEM_NOTRAP_P (mem) = 1;
2965 emit_insn (insn);
2966 return;
2967 }
2968
2969 case SYMBOL_SMALL_GOT_4G:
2970 {
2971 /* In ILP32, the mode of dest can be either SImode or DImode,
2972 while the got entry is always of SImode size. The mode of
2973 dest depends on how dest is used: if dest is assigned to a
2974 pointer (e.g. stored in memory), it has SImode; it may have
2975 DImode if dest is dereferenced to access the memory.
2976 This is why we have to handle three different ldr_got_small
2977 patterns here (two patterns for ILP32). */
2978
2979 rtx insn;
2980 rtx mem;
2981 rtx tmp_reg = dest;
2982 machine_mode mode = GET_MODE (dest);
2983
2984 if (can_create_pseudo_p ())
2985 tmp_reg = gen_reg_rtx (mode);
2986
2987 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2988 if (mode == ptr_mode)
2989 {
2990 if (mode == DImode)
2991 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2992 else
2993 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2994
2995 mem = XVECEXP (SET_SRC (insn), 0, 0);
2996 }
2997 else
2998 {
2999 gcc_assert (mode == Pmode);
3000
3001 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
3002 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3003 }
3004
3005 gcc_assert (GET_CODE (mem) == MEM);
3006 MEM_READONLY_P (mem) = 1;
3007 MEM_NOTRAP_P (mem) = 1;
3008 emit_insn (insn);
3009 return;
3010 }
3011
3012 case SYMBOL_SMALL_TLSGD:
3013 {
3014 rtx_insn *insns;
3015 /* The return type of __tls_get_addr is the C pointer type
3016 so use ptr_mode. */
3017 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3018 rtx tmp_reg = dest;
3019
3020 if (GET_MODE (dest) != ptr_mode)
3021 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
3022
3023 start_sequence ();
3024 if (ptr_mode == SImode)
3025 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3026 else
3027 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
3028 insns = get_insns ();
3029 end_sequence ();
3030
3031 RTL_CONST_CALL_P (insns) = 1;
3032 emit_libcall_block (insns, tmp_reg, result, imm);
3033 /* Convert back to the mode of dest, adding a zero_extend
3034 from SImode (ptr_mode) to DImode (Pmode). */
3035 if (dest != tmp_reg)
3036 convert_move (dest, tmp_reg, true);
3037 return;
3038 }
3039
3040 case SYMBOL_SMALL_TLSDESC:
3041 {
3042 machine_mode mode = GET_MODE (dest);
3043 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3044 rtx tp;
3045
3046 gcc_assert (mode == Pmode || mode == ptr_mode);
3047
3048 /* In ILP32, the got entry is always of SImode size. Unlike
3049 small GOT, the dest is fixed at reg 0. */
3050 if (TARGET_ILP32)
3051 emit_insn (gen_tlsdesc_small_si (imm));
3052 else
3053 emit_insn (gen_tlsdesc_small_di (imm));
3054 tp = aarch64_load_tp (NULL);
3055
3056 if (mode != Pmode)
3057 tp = gen_lowpart (mode, tp);
3058
3059 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3060 if (REG_P (dest))
3061 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3062 return;
3063 }
3064
3065 case SYMBOL_SMALL_TLSIE:
3066 {
3067 /* In ILP32, the mode of dest can be either SImode or DImode,
3068 while the got entry is always of SImode size. The mode of
3069 dest depends on how dest is used: if dest is assigned to a
3070 pointer (e.g. stored in memory), it has SImode; it may have
3071 DImode if dest is dereferenced to access the memory.
3072 This is why we have to handle three different tlsie_small
3073 patterns here (two patterns for ILP32). */
3074 machine_mode mode = GET_MODE (dest);
3075 rtx tmp_reg = gen_reg_rtx (mode);
3076 rtx tp = aarch64_load_tp (NULL);
3077
3078 if (mode == ptr_mode)
3079 {
3080 if (mode == DImode)
3081 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3082 else
3083 {
3084 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3085 tp = gen_lowpart (mode, tp);
3086 }
3087 }
3088 else
3089 {
3090 gcc_assert (mode == Pmode);
3091 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3092 }
3093
3094 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3095 if (REG_P (dest))
3096 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3097 return;
3098 }
3099
3100 case SYMBOL_TLSLE12:
3101 case SYMBOL_TLSLE24:
3102 case SYMBOL_TLSLE32:
3103 case SYMBOL_TLSLE48:
3104 {
3105 machine_mode mode = GET_MODE (dest);
3106 rtx tp = aarch64_load_tp (NULL);
3107
3108 if (mode != Pmode)
3109 tp = gen_lowpart (mode, tp);
3110
3111 switch (type)
3112 {
3113 case SYMBOL_TLSLE12:
3114 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3115 (dest, tp, imm));
3116 break;
3117 case SYMBOL_TLSLE24:
3118 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3119 (dest, tp, imm));
3120 break;
3121 case SYMBOL_TLSLE32:
3122 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3123 (dest, imm));
3124 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3125 (dest, dest, tp));
3126 break;
3127 case SYMBOL_TLSLE48:
3128 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3129 (dest, imm));
3130 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3131 (dest, dest, tp));
3132 break;
3133 default:
3134 gcc_unreachable ();
3135 }
3136
3137 if (REG_P (dest))
3138 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3139 return;
3140 }
3141
3142 case SYMBOL_TINY_GOT:
3143 {
3144 rtx insn;
3145 machine_mode mode = GET_MODE (dest);
3146
3147 if (mode == ptr_mode)
3148 insn = gen_ldr_got_tiny (mode, dest, imm);
3149 else
3150 {
3151 gcc_assert (mode == Pmode);
3152 insn = gen_ldr_got_tiny_sidi (dest, imm);
3153 }
3154
3155 emit_insn (insn);
3156 return;
3157 }
3158
3159 case SYMBOL_TINY_TLSIE:
3160 {
3161 machine_mode mode = GET_MODE (dest);
3162 rtx tp = aarch64_load_tp (NULL);
3163
3164 if (mode == ptr_mode)
3165 {
3166 if (mode == DImode)
3167 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3168 else
3169 {
3170 tp = gen_lowpart (mode, tp);
3171 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3172 }
3173 }
3174 else
3175 {
3176 gcc_assert (mode == Pmode);
3177 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3178 }
3179
3180 if (REG_P (dest))
3181 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3182 return;
3183 }
3184
3185 default:
3186 gcc_unreachable ();
3187 }
3188 }
3189
3190 /* Emit a move from SRC to DEST. Assume that the move expanders can
3191 handle all moves if !can_create_pseudo_p (). The distinction is
3192 important because, unlike emit_move_insn, the move expanders know
3193 how to force Pmode objects into the constant pool even when the
3194 constant pool address is not itself legitimate. */
3195 static rtx
3196 aarch64_emit_move (rtx dest, rtx src)
3197 {
3198 return (can_create_pseudo_p ()
3199 ? emit_move_insn (dest, src)
3200 : emit_move_insn_1 (dest, src));
3201 }
3202
3203 /* Apply UNOPTAB to OP and store the result in DEST. */
3204
3205 static void
3206 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3207 {
3208 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3209 if (dest != tmp)
3210 emit_move_insn (dest, tmp);
3211 }
3212
3213 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3214
3215 static void
3216 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3217 {
3218 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3219 OPTAB_DIRECT);
3220 if (dest != tmp)
3221 emit_move_insn (dest, tmp);
3222 }
3223
3224 /* Split a 128-bit move operation into two 64-bit move operations,
3225 taking care to handle partial overlap of register to register
3226 copies. Special cases are needed when moving between GP regs and
3227 FP regs. SRC can be a register, constant or memory; DST a register
3228 or memory. If either operand is memory it must not have any side
3229 effects. */
3230 void
3231 aarch64_split_128bit_move (rtx dst, rtx src)
3232 {
3233 rtx dst_lo, dst_hi;
3234 rtx src_lo, src_hi;
3235
3236 machine_mode mode = GET_MODE (dst);
3237
3238 gcc_assert (mode == TImode || mode == TFmode);
3239 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3240 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3241
3242 if (REG_P (dst) && REG_P (src))
3243 {
3244 int src_regno = REGNO (src);
3245 int dst_regno = REGNO (dst);
3246
3247 /* Handle FP <-> GP regs. */
3248 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3249 {
3250 src_lo = gen_lowpart (word_mode, src);
3251 src_hi = gen_highpart (word_mode, src);
3252
3253 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3254 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3255 return;
3256 }
3257 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3258 {
3259 dst_lo = gen_lowpart (word_mode, dst);
3260 dst_hi = gen_highpart (word_mode, dst);
3261
3262 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3263 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3264 return;
3265 }
3266 }
3267
3268 dst_lo = gen_lowpart (word_mode, dst);
3269 dst_hi = gen_highpart (word_mode, dst);
3270 src_lo = gen_lowpart (word_mode, src);
3271 src_hi = gen_highpart_mode (word_mode, mode, src);
3272
3273 /* At most one pairing may overlap. */
3274 if (reg_overlap_mentioned_p (dst_lo, src_hi))
3275 {
3276 aarch64_emit_move (dst_hi, src_hi);
3277 aarch64_emit_move (dst_lo, src_lo);
3278 }
3279 else
3280 {
3281 aarch64_emit_move (dst_lo, src_lo);
3282 aarch64_emit_move (dst_hi, src_hi);
3283 }
3284 }
3285
3286 bool
3287 aarch64_split_128bit_move_p (rtx dst, rtx src)
3288 {
3289 return (! REG_P (src)
3290 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
3291 }
3292
3293 /* Split a complex SIMD combine. */
3294
3295 void
3296 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
3297 {
3298 machine_mode src_mode = GET_MODE (src1);
3299 machine_mode dst_mode = GET_MODE (dst);
3300
3301 gcc_assert (VECTOR_MODE_P (dst_mode));
3302 gcc_assert (register_operand (dst, dst_mode)
3303 && register_operand (src1, src_mode)
3304 && register_operand (src2, src_mode));
3305
3306 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
3307 return;
3308 }
3309
3310 /* Split a complex SIMD move. */
3311
3312 void
3313 aarch64_split_simd_move (rtx dst, rtx src)
3314 {
3315 machine_mode src_mode = GET_MODE (src);
3316 machine_mode dst_mode = GET_MODE (dst);
3317
3318 gcc_assert (VECTOR_MODE_P (dst_mode));
3319
3320 if (REG_P (dst) && REG_P (src))
3321 {
3322 gcc_assert (VECTOR_MODE_P (src_mode));
3323 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3324 }
3325 }
3326
3327 bool
3328 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3329 machine_mode ymode, rtx y)
3330 {
3331 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3332 gcc_assert (r != NULL);
3333 return rtx_equal_p (x, r);
3334 }
3335
3336 /* Return TARGET if it is nonnull and a register of mode MODE.
3337 Otherwise, return a fresh register of mode MODE if we can,
3338 or TARGET reinterpreted as MODE if we can't. */
3339
3340 static rtx
3341 aarch64_target_reg (rtx target, machine_mode mode)
3342 {
3343 if (target && REG_P (target) && GET_MODE (target) == mode)
3344 return target;
3345 if (!can_create_pseudo_p ())
3346 {
3347 gcc_assert (target);
3348 return gen_lowpart (mode, target);
3349 }
3350 return gen_reg_rtx (mode);
3351 }
3352
3353 /* Return a register that contains the constant in BUILDER, given that
3354 the constant is a legitimate move operand. Use TARGET as the register
3355 if it is nonnull and convenient. */
3356
3357 static rtx
3358 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3359 {
3360 rtx src = builder.build ();
3361 target = aarch64_target_reg (target, GET_MODE (src));
3362 emit_insn (gen_rtx_SET (target, src));
3363 return target;
3364 }
3365
3366 static rtx
3367 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3368 {
3369 if (can_create_pseudo_p ())
3370 return force_reg (mode, value);
3371 else
3372 {
3373 gcc_assert (x);
3374 aarch64_emit_move (x, value);
3375 return x;
3376 }
3377 }
3378
3379 /* Return true if predicate value X is a constant in which every element
3380 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3381 value, i.e. as a predicate in which all bits are significant. */
3382
3383 static bool
3384 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3385 {
3386 if (GET_CODE (x) != CONST_VECTOR)
3387 return false;
3388
3389 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3390 GET_MODE_NUNITS (GET_MODE (x)));
3391 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3392 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3393 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3394
3395 unsigned int nelts = const_vector_encoded_nelts (x);
3396 for (unsigned int i = 0; i < nelts; ++i)
3397 {
3398 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3399 if (!CONST_INT_P (elt))
3400 return false;
3401
3402 builder.quick_push (elt);
3403 for (unsigned int j = 1; j < factor; ++j)
3404 builder.quick_push (const0_rtx);
3405 }
3406 builder.finalize ();
3407 return true;
3408 }
3409
3410 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
3411 widest predicate element size it can have (that is, the largest size
3412 for which each element would still be 0 or 1). */
3413
3414 unsigned int
3415 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3416 {
3417 /* Start with the most optimistic assumption: that we only need
3418 one bit per pattern. This is what we will use if only the first
3419 bit in each pattern is ever set. */
3420 unsigned int mask = GET_MODE_SIZE (DImode);
3421 mask |= builder.npatterns ();
3422
3423 /* Look for set bits. */
3424 unsigned int nelts = builder.encoded_nelts ();
3425 for (unsigned int i = 1; i < nelts; ++i)
3426 if (INTVAL (builder.elt (i)) != 0)
3427 {
3428 if (i & 1)
3429 return 1;
3430 mask |= i;
3431 }
3432 return mask & -mask;
3433 }
3434
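/* For example, a VNx16BImode constant with four patterns and one
   element per pattern whose encoded elements are { 1, 0, 0, 0 }
   describes a predicate with one significant bit per four bytes, and
   the function above returns 4 for it.  */
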
3435 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3436 return that predicate mode, otherwise return opt_machine_mode (). */
3437
3438 opt_machine_mode
3439 aarch64_ptrue_all_mode (rtx x)
3440 {
3441 gcc_assert (GET_MODE (x) == VNx16BImode);
3442 if (GET_CODE (x) != CONST_VECTOR
3443 || !CONST_VECTOR_DUPLICATE_P (x)
3444 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3445 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3446 return opt_machine_mode ();
3447
3448 unsigned int nelts = const_vector_encoded_nelts (x);
3449 for (unsigned int i = 1; i < nelts; ++i)
3450 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3451 return opt_machine_mode ();
3452
3453 return aarch64_sve_pred_mode (nelts);
3454 }
3455
3456 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3457 that the constant would have with predicate element size ELT_SIZE
3458 (ignoring the upper bits in each element) and return:
3459
3460 * -1 if all bits are set
3461 * N if the predicate has N leading set bits followed by all clear bits
3462 * 0 if the predicate does not have any of these forms. */
3463
3464 int
3465 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3466 unsigned int elt_size)
3467 {
3468 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3469 followed by set bits. */
3470 if (builder.nelts_per_pattern () == 3)
3471 return 0;
3472
3473 /* Skip over leading set bits. */
3474 unsigned int nelts = builder.encoded_nelts ();
3475 unsigned int i = 0;
3476 for (; i < nelts; i += elt_size)
3477 if (INTVAL (builder.elt (i)) == 0)
3478 break;
3479 unsigned int vl = i / elt_size;
3480
3481 /* Check for the all-true case. */
3482 if (i == nelts)
3483 return -1;
3484
3485 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3486 repeating pattern of set bits followed by clear bits. */
3487 if (builder.nelts_per_pattern () != 2)
3488 return 0;
3489
3490 /* We have a "foreground" value and a duplicated "background" value.
3491 If the background might repeat and the last set bit belongs to it,
3492 we might have set bits followed by clear bits followed by set bits. */
3493 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3494 return 0;
3495
3496 /* Make sure that the rest are all clear. */
3497 for (; i < nelts; i += elt_size)
3498 if (INTVAL (builder.elt (i)) != 0)
3499 return 0;
3500
3501 return vl;
3502 }
3503
3504 /* See if there is an svpattern that encodes an SVE predicate of mode
3505 PRED_MODE in which the first VL bits are set and the rest are clear.
3506 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3507 A VL of -1 indicates an all-true vector. */
3508
3509 aarch64_svpattern
3510 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3511 {
3512 if (vl < 0)
3513 return AARCH64_SV_ALL;
3514
3515 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3516 return AARCH64_NUM_SVPATTERNS;
3517
3518 if (vl >= 1 && vl <= 8)
3519 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3520
3521 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3522 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3523
3524 int max_vl;
3525 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3526 {
3527 if (vl == (max_vl / 3) * 3)
3528 return AARCH64_SV_MUL3;
3529 /* These would only trigger for non-power-of-2 lengths. */
3530 if (vl == (max_vl & -4))
3531 return AARCH64_SV_MUL4;
3532 if (vl == (1 << floor_log2 (max_vl)))
3533 return AARCH64_SV_POW2;
3534 if (vl == max_vl)
3535 return AARCH64_SV_ALL;
3536 }
3537 return AARCH64_NUM_SVPATTERNS;
3538 }
3539
3540 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3541 bits has the lowest bit set and the upper bits clear. This is the
3542 VNx16BImode equivalent of a PTRUE for controlling elements of
3543 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3544 all bits are significant, even the upper zeros. */
3545
3546 rtx
3547 aarch64_ptrue_all (unsigned int elt_size)
3548 {
3549 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3550 builder.quick_push (const1_rtx);
3551 for (unsigned int i = 1; i < elt_size; ++i)
3552 builder.quick_push (const0_rtx);
3553 return builder.build ();
3554 }
3555
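/* For example, aarch64_ptrue_all (4) builds the VNx16BImode constant
   { 1, 0, 0, 0, 1, 0, 0, 0, ... }, i.e. the VNx16BImode view of the
   predicate that a PTRUE on .S elements would produce.  */
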
3556 /* Return an all-true predicate register of mode MODE. */
3557
3558 rtx
3559 aarch64_ptrue_reg (machine_mode mode)
3560 {
3561 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3562 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3563 return gen_lowpart (mode, reg);
3564 }
3565
3566 /* Return an all-false predicate register of mode MODE. */
3567
3568 rtx
3569 aarch64_pfalse_reg (machine_mode mode)
3570 {
3571 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3572 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3573 return gen_lowpart (mode, reg);
3574 }
3575
3576 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3577 true, or alternatively if we know that the operation predicated by
3578 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
3579 aarch64_sve_gp_strictness operand that describes the operation
3580 predicated by PRED1[0]. */
3581
3582 bool
3583 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
3584 {
3585 machine_mode mode = GET_MODE (pred2);
3586 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3587 && mode == GET_MODE (pred1[0])
3588 && aarch64_sve_gp_strictness (pred1[1], SImode));
3589 return (pred1[0] == CONSTM1_RTX (mode)
3590 || INTVAL (pred1[1]) == SVE_RELAXED_GP
3591 || rtx_equal_p (pred1[0], pred2));
3592 }
3593
3594 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3595 for it. PRED2[0] is the predicate for the instruction whose result
3596 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3597 for it. Return true if we can prove that the two predicates are
3598 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3599 with PRED1[0] without changing behavior. */
3600
3601 bool
3602 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3603 {
3604 machine_mode mode = GET_MODE (pred1[0]);
3605 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3606 && mode == GET_MODE (pred2[0])
3607 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3608 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3609
3610 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3611 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3612 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3613 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3614 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3615 }
3616
3617 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
3618 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3619 Use TARGET as the target register if nonnull and convenient. */
3620
3621 static rtx
3622 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3623 machine_mode data_mode, rtx op1, rtx op2)
3624 {
3625 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3626 expand_operand ops[5];
3627 create_output_operand (&ops[0], target, pred_mode);
3628 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3629 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3630 create_input_operand (&ops[3], op1, data_mode);
3631 create_input_operand (&ops[4], op2, data_mode);
3632 expand_insn (icode, 5, ops);
3633 return ops[0].value;
3634 }
3635
3636 /* Use a comparison to convert integer vector SRC into MODE, which is
3637 the corresponding SVE predicate mode. Use TARGET for the result
3638 if it's nonnull and convenient. */
3639
3640 rtx
3641 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3642 {
3643 machine_mode src_mode = GET_MODE (src);
3644 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3645 src, CONST0_RTX (src_mode));
3646 }
3647
3648 /* Return the assembly token for svprfop value PRFOP. */
3649
3650 static const char *
3651 svprfop_token (enum aarch64_svprfop prfop)
3652 {
3653 switch (prfop)
3654 {
3655 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3656 AARCH64_FOR_SVPRFOP (CASE)
3657 #undef CASE
3658 case AARCH64_NUM_SVPRFOPS:
3659 break;
3660 }
3661 gcc_unreachable ();
3662 }
3663
3664 /* Return the assembly string for an SVE prefetch operation with
3665 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3666 and that SUFFIX is the format for the remaining operands. */
3667
3668 char *
3669 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3670 const char *suffix)
3671 {
3672 static char buffer[128];
3673 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3674 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3675 mnemonic, svprfop_token (prfop), suffix);
3676 gcc_assert (written < sizeof (buffer));
3677 return buffer;
3678 }
3679
3680 /* Check whether we can calculate the number of elements in PATTERN
3681 at compile time, given that there are NELTS_PER_VQ elements per
3682 128-bit block. Return the value if so, otherwise return -1. */
3683
3684 HOST_WIDE_INT
3685 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3686 {
3687 unsigned int vl, const_vg;
3688 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3689 vl = 1 + (pattern - AARCH64_SV_VL1);
3690 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3691 vl = 16 << (pattern - AARCH64_SV_VL16);
3692 else if (aarch64_sve_vg.is_constant (&const_vg))
3693 {
3694 /* There are two vector granules per quadword. */
3695 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3696 switch (pattern)
3697 {
3698 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3699 case AARCH64_SV_MUL4: return nelts & -4;
3700 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3701 case AARCH64_SV_ALL: return nelts;
3702 default: gcc_unreachable ();
3703 }
3704 }
3705 else
3706 return -1;
3707
3708 /* There are two vector granules per quadword. */
3709 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3710 if (known_le (vl, nelts_all))
3711 return vl;
3712
3713 /* Requesting more elements than are available results in a PFALSE. */
3714 if (known_gt (vl, nelts_all))
3715 return 0;
3716
3717 return -1;
3718 }
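
/* Worked examples for the function above (illustrative only): with
   NELTS_PER_VQ == 16, AARCH64_SV_VL7 folds to 7 for any vector length,
   since at least 16 byte elements are always available.  With the vector
   length fixed at 256 bits (aarch64_sve_vg == 4) and NELTS_PER_VQ == 4,
   AARCH64_SV_ALL folds to 8, AARCH64_SV_MUL3 folds to 6, and
   AARCH64_SV_VL16 folds to 0, since asking for 16 .S elements when only
   8 are available gives a PFALSE.  */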
3719
3720 /* Return true if we can move VALUE into a register using a single
3721 CNT[BHWD] instruction. */
3722
3723 static bool
3724 aarch64_sve_cnt_immediate_p (poly_int64 value)
3725 {
3726 HOST_WIDE_INT factor = value.coeffs[0];
3727 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3728 return (value.coeffs[1] == factor
3729 && IN_RANGE (factor, 2, 16 * 16)
3730 && (factor & 1) == 0
3731 && factor <= 16 * (factor & -factor));
3732 }
3733
3734 /* Likewise for rtx X. */
3735
3736 bool
3737 aarch64_sve_cnt_immediate_p (rtx x)
3738 {
3739 poly_int64 value;
3740 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3741 }
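
/* For example, the poly_int64 values (2, 2) and (64, 64), the results of
   CNTD and of CNTB with MUL #4, are valid CNT immediates.  (3, 3) is
   rejected because it is odd, (7, 3) because its coefficients differ,
   and (512, 512) because it exceeds the upper bound of 16 * 16.  */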
3742
3743 /* Return the asm string for an instruction with a CNT-like vector size
3744 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3745 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3746 first part of the operands template (the part that comes before the
3747 vector size itself). PATTERN is the pattern to use. FACTOR is the
3748 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3749 in each quadword. If it is zero, we can use any element size. */
3750
3751 static char *
3752 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3753 aarch64_svpattern pattern,
3754 unsigned int factor,
3755 unsigned int nelts_per_vq)
3756 {
3757 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3758
3759 if (nelts_per_vq == 0)
3760 /* There is some overlap in the ranges of the four CNT instructions.
3761 Here we always use the smallest possible element size, so that the
3762 multiplier is 1 wherever possible. */
3763 nelts_per_vq = factor & -factor;
3764 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3765 gcc_assert (IN_RANGE (shift, 1, 4));
3766 char suffix = "dwhb"[shift - 1];
3767
3768 factor >>= shift;
3769 unsigned int written;
3770 if (pattern == AARCH64_SV_ALL && factor == 1)
3771 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3772 prefix, suffix, operands);
3773 else if (factor == 1)
3774 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3775 prefix, suffix, operands, svpattern_token (pattern));
3776 else
3777 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3778 prefix, suffix, operands, svpattern_token (pattern),
3779 factor);
3780 gcc_assert (written < sizeof (buffer));
3781 return buffer;
3782 }
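
/* For example, with PREFIX "cnt", OPERANDS "%x0" and PATTERN
   AARCH64_SV_ALL: a FACTOR of 2 with NELTS_PER_VQ of 0 selects .D
   elements and prints as "cntd\t%x0", while a FACTOR of 48 with
   NELTS_PER_VQ of 0 selects .B elements and prints as
   "cntb\t%x0, all, mul #3".  */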
3783
3784 /* Return the asm string for an instruction with a CNT-like vector size
3785 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3786 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3787 first part of the operands template (the part that comes before the
3788 vector size itself). X is the value of the vector size operand,
3789 as a polynomial integer rtx; we need to convert this into an "all"
3790 pattern with a multiplier. */
3791
3792 char *
3793 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3794 rtx x)
3795 {
3796 poly_int64 value = rtx_to_poly_int64 (x);
3797 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3798 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3799 value.coeffs[1], 0);
3800 }
3801
3802 /* Return the asm string for an instruction with a CNT-like vector size
3803 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3804 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3805 first part of the operands template (the part that comes before the
3806 vector size itself). CNT_PAT[0..2] are the operands of the
3807 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3808
3809 char *
3810 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3811 const char *operands, rtx *cnt_pat)
3812 {
3813 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3814 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3815 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3816 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3817 factor, nelts_per_vq);
3818 }
3819
3820 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3821
3822 bool
3823 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3824 {
3825 poly_int64 value;
3826 return (poly_int_rtx_p (x, &value)
3827 && (aarch64_sve_cnt_immediate_p (value)
3828 || aarch64_sve_cnt_immediate_p (-value)));
3829 }
3830
3831 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3832 operand 0. */
3833
3834 char *
3835 aarch64_output_sve_scalar_inc_dec (rtx offset)
3836 {
3837 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3838 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3839 if (offset_value.coeffs[1] > 0)
3840 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3841 offset_value.coeffs[1], 0);
3842 else
3843 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3844 -offset_value.coeffs[1], 0);
3845 }
3846
3847 /* Return true if we can add VALUE to a register using a single ADDVL
3848 or ADDPL instruction. */
3849
3850 static bool
3851 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3852 {
3853 HOST_WIDE_INT factor = value.coeffs[0];
3854 if (factor == 0 || value.coeffs[1] != factor)
3855 return false;
3856 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3857 and a value of 16 is one vector width. */
3858 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3859 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3860 }
3861
3862 /* Likewise for rtx X. */
3863
3864 bool
3865 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3866 {
3867 poly_int64 value;
3868 return (poly_int_rtx_p (x, &value)
3869 && aarch64_sve_addvl_addpl_immediate_p (value));
3870 }
3871
3872 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3873 to operand 1 and storing the result in operand 0. */
3874
3875 char *
3876 aarch64_output_sve_addvl_addpl (rtx offset)
3877 {
3878 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3879 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3880 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3881
3882 int factor = offset_value.coeffs[1];
3883 if ((factor & 15) == 0)
3884 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3885 else
3886 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3887 return buffer;
3888 }
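
/* For example, an OFFSET of (16, 16), i.e. one full vector length in
   bytes, prints as "addvl\t%x0, %x1, #1", while an OFFSET of (-2, -2),
   i.e. minus one predicate length in bytes, prints as
   "addpl\t%x0, %x1, #-1".  */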
3889
3890 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3891 instruction. If it is, store the number of elements in each vector
3892 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3893 factor in *FACTOR_OUT (if nonnull). */
3894
3895 bool
3896 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3897 unsigned int *nelts_per_vq_out)
3898 {
3899 rtx elt;
3900 poly_int64 value;
3901
3902 if (!const_vec_duplicate_p (x, &elt)
3903 || !poly_int_rtx_p (elt, &value))
3904 return false;
3905
3906 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3907 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3908 /* There's no vector INCB. */
3909 return false;
3910
3911 HOST_WIDE_INT factor = value.coeffs[0];
3912 if (value.coeffs[1] != factor)
3913 return false;
3914
3915 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3916 if ((factor % nelts_per_vq) != 0
3917 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3918 return false;
3919
3920 if (factor_out)
3921 *factor_out = factor;
3922 if (nelts_per_vq_out)
3923 *nelts_per_vq_out = nelts_per_vq;
3924 return true;
3925 }
3926
3927 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3928 instruction. */
3929
3930 bool
3931 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3932 {
3933 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3934 }
3935
3936 /* Return the asm template for an SVE vector INC or DEC instruction.
3937 OPERANDS gives the operands before the vector count and X is the
3938 value of the vector count operand itself. */
3939
3940 char *
3941 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3942 {
3943 int factor;
3944 unsigned int nelts_per_vq;
3945 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3946 gcc_unreachable ();
3947 if (factor < 0)
3948 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3949 -factor, nelts_per_vq);
3950 else
3951 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3952 factor, nelts_per_vq);
3953 }
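
/* For example, adding a VNx8HImode constant in which every element is
   (16, 16), i.e. twice the number of .H elements in the vector, is
   accepted by aarch64_sve_vector_inc_dec_immediate_p with FACTOR == 16
   and NELTS_PER_VQ == 8, and prints as "inch\t<operands>, all, mul #2".  */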
3954
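/* Return the number of instructions (at most four) required to move
   immediate IMM into DEST, which has scalar integer mode MODE.
   Emit the instructions if GENERATE is true, otherwise just count
   them.  */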
3955 static int
3956 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3957 scalar_int_mode mode)
3958 {
3959 int i;
3960 unsigned HOST_WIDE_INT val, val2, mask;
3961 int one_match, zero_match;
3962 int num_insns;
3963
3964 val = INTVAL (imm);
3965
3966 if (aarch64_move_imm (val, mode))
3967 {
3968 if (generate)
3969 emit_insn (gen_rtx_SET (dest, imm));
3970 return 1;
3971 }
3972
3973 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3974 (with XXXX non-zero). In that case check to see if the move can be done in
3975 a smaller mode. */
3976 val2 = val & 0xffffffff;
3977 if (mode == DImode
3978 && aarch64_move_imm (val2, SImode)
3979 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3980 {
3981 if (generate)
3982 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3983
3984 /* Check if we have to emit a second instruction by checking to see
3985 if any of the upper 32 bits of the original DI mode value is set. */
3986 if (val == val2)
3987 return 1;
3988
3989 i = (val >> 48) ? 48 : 32;
3990
3991 if (generate)
3992 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3993 GEN_INT ((val >> i) & 0xffff)));
3994
3995 return 2;
3996 }
3997
3998 if ((val >> 32) == 0 || mode == SImode)
3999 {
4000 if (generate)
4001 {
4002 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4003 if (mode == SImode)
4004 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4005 GEN_INT ((val >> 16) & 0xffff)));
4006 else
4007 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4008 GEN_INT ((val >> 16) & 0xffff)));
4009 }
4010 return 2;
4011 }
4012
4013 /* Remaining cases are all for DImode. */
4014
4015 mask = 0xffff;
4016 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4017 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4018 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4019 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4020
4021 if (zero_match != 2 && one_match != 2)
4022 {
4023 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
4024 For a 64-bit bitmask try whether changing 16 bits to all ones or
4025 zeroes creates a valid bitmask. To check any repeated bitmask,
4026 try using 16 bits from the other 32-bit half of val. */
4027
4028 for (i = 0; i < 64; i += 16, mask <<= 16)
4029 {
4030 val2 = val & ~mask;
4031 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4032 break;
4033 val2 = val | mask;
4034 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4035 break;
4036 val2 = val2 & ~mask;
4037 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
4038 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4039 break;
4040 }
4041 if (i != 64)
4042 {
4043 if (generate)
4044 {
4045 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4046 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4047 GEN_INT ((val >> i) & 0xffff)));
4048 }
4049 return 2;
4050 }
4051 }
4052
4053 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4054 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4055 otherwise skip zero bits. */
4056
4057 num_insns = 1;
4058 mask = 0xffff;
4059 val2 = one_match > zero_match ? ~val : val;
4060 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4061
4062 if (generate)
4063 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4064 ? (val | ~(mask << i))
4065 : (val & (mask << i)))));
4066 for (i += 16; i < 64; i += 16)
4067 {
4068 if ((val2 & (mask << i)) == 0)
4069 continue;
4070 if (generate)
4071 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4072 GEN_INT ((val >> i) & 0xffff)));
4073 num_insns++;
4074 }
4075
4076 return num_insns;
4077 }
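
/* Worked examples for the function above (illustrative only):
   0x1234000000005678 takes the SImode shortcut and needs two
   instructions, a MOV of 0x5678 followed by a MOVK of 0x1234 at bit 48.
   0x1234ffff5678ffff reaches the final loop, where a single MOVN-class
   move covers everything except the 0x1234 halfword, which then needs
   one MOVK, again giving two instructions.  */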
4078
4079 /* Return whether imm is a 128-bit immediate which is simple enough to
4080 expand inline. */
4081 bool
4082 aarch64_mov128_immediate (rtx imm)
4083 {
4084 if (GET_CODE (imm) == CONST_INT)
4085 return true;
4086
4087 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4088
4089 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4090 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4091
4092 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4093 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4094 }
4095
4096
4097 /* Return the number of temporary registers that aarch64_add_offset_1
4098 would need to add OFFSET to a register. */
4099
4100 static unsigned int
4101 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4102 {
4103 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
4104 }
4105
4106 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4107 a non-polynomial OFFSET. MODE is the mode of the addition.
4108 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4109 be set and CFA adjustments added to the generated instructions.
4110
4111 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4112 temporary if register allocation is already complete. This temporary
4113 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4114 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4115 the immediate again.
4116
4117 Since this function may be used to adjust the stack pointer, we must
4118 ensure that it cannot cause transient stack deallocation (for example
4119 by first incrementing SP and then decrementing when adjusting by a
4120 large immediate). */
4121
4122 static void
4123 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4124 rtx src, HOST_WIDE_INT offset, rtx temp1,
4125 bool frame_related_p, bool emit_move_imm)
4126 {
4127 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4128 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4129
4130 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4131 rtx_insn *insn;
4132
4133 if (!moffset)
4134 {
4135 if (!rtx_equal_p (dest, src))
4136 {
4137 insn = emit_insn (gen_rtx_SET (dest, src));
4138 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4139 }
4140 return;
4141 }
4142
4143 /* Single instruction adjustment. */
4144 if (aarch64_uimm12_shift (moffset))
4145 {
4146 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4147 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4148 return;
4149 }
4150
4151 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4152 and either:
4153
4154 a) the offset cannot be loaded by a 16-bit move or
4155 b) there is no spare register into which we can move it. */
4156 if (moffset < 0x1000000
4157 && ((!temp1 && !can_create_pseudo_p ())
4158 || !aarch64_move_imm (moffset, mode)))
4159 {
4160 HOST_WIDE_INT low_off = moffset & 0xfff;
4161
4162 low_off = offset < 0 ? -low_off : low_off;
4163 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4164 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4165 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4166 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4167 return;
4168 }
4169
4170 /* Emit a move immediate if required and an addition/subtraction. */
4171 if (emit_move_imm)
4172 {
4173 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4174 temp1 = aarch64_force_temporary (mode, temp1,
4175 gen_int_mode (moffset, mode));
4176 }
4177 insn = emit_insn (offset < 0
4178 ? gen_sub3_insn (dest, src, temp1)
4179 : gen_add3_insn (dest, src, temp1));
4180 if (frame_related_p)
4181 {
4182 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4183 rtx adj = plus_constant (mode, src, offset);
4184 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
4185 }
4186 }
4187
4188 /* Return the number of temporary registers that aarch64_add_offset
4189 would need to move OFFSET into a register or add OFFSET to a register;
4190 ADD_P is true if we want the latter rather than the former. */
4191
4192 static unsigned int
4193 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4194 {
4195 /* This follows the same structure as aarch64_add_offset. */
4196 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4197 return 0;
4198
4199 unsigned int count = 0;
4200 HOST_WIDE_INT factor = offset.coeffs[1];
4201 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4202 poly_int64 poly_offset (factor, factor);
4203 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4204 /* Need one register for the ADDVL/ADDPL result. */
4205 count += 1;
4206 else if (factor != 0)
4207 {
4208 factor = abs (factor);
4209 if (factor > 16 * (factor & -factor))
4210 /* Need one register for the CNT result and one for the multiplication
4211 factor. If necessary, the second temporary can be reused for the
4212 constant part of the offset. */
4213 return 2;
4214 /* Need one register for the CNT result (which might then
4215 be shifted). */
4216 count += 1;
4217 }
4218 return count + aarch64_add_offset_1_temporaries (constant);
4219 }
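
/* For example, an offset of (34, 32) needs one temporary (for the ADDVL
   result that handles the (32, 32) part), while a purely constant offset
   of 34 needs none.  */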
4220
4221 /* If X can be represented as a poly_int64, return the number
4222 of temporaries that are required to add it to a register.
4223 Return -1 otherwise. */
4224
4225 int
4226 aarch64_add_offset_temporaries (rtx x)
4227 {
4228 poly_int64 offset;
4229 if (!poly_int_rtx_p (x, &offset))
4230 return -1;
4231 return aarch64_offset_temporaries (true, offset);
4232 }
4233
4234 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4235 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4236 be set and CFA adjustments added to the generated instructions.
4237
4238 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4239 temporary if register allocation is already complete. This temporary
4240 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4241 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4242 false to avoid emitting the immediate again.
4243
4244 TEMP2, if nonnull, is a second temporary register that doesn't
4245 overlap either DEST or SRC.
4246
4247 Since this function may be used to adjust the stack pointer, we must
4248 ensure that it cannot cause transient stack deallocation (for example
4249 by first incrementing SP and then decrementing when adjusting by a
4250 large immediate). */
4251
4252 static void
4253 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4254 poly_int64 offset, rtx temp1, rtx temp2,
4255 bool frame_related_p, bool emit_move_imm = true)
4256 {
4257 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4258 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4259 gcc_assert (temp1 == NULL_RTX
4260 || !frame_related_p
4261 || !reg_overlap_mentioned_p (temp1, dest));
4262 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4263
4264 /* Try using ADDVL or ADDPL to add the whole value. */
4265 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4266 {
4267 rtx offset_rtx = gen_int_mode (offset, mode);
4268 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4269 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4270 return;
4271 }
4272
4273 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4274 SVE vector register, over and above the minimum size of 128 bits.
4275 This is equivalent to half the value returned by CNTD with a
4276 vector shape of ALL. */
4277 HOST_WIDE_INT factor = offset.coeffs[1];
4278 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4279
4280 /* Try using ADDVL or ADDPL to add the VG-based part. */
4281 poly_int64 poly_offset (factor, factor);
4282 if (src != const0_rtx
4283 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4284 {
4285 rtx offset_rtx = gen_int_mode (poly_offset, mode);
4286 if (frame_related_p)
4287 {
4288 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4289 RTX_FRAME_RELATED_P (insn) = true;
4290 src = dest;
4291 }
4292 else
4293 {
4294 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4295 src = aarch64_force_temporary (mode, temp1, addr);
4296 temp1 = temp2;
4297 temp2 = NULL_RTX;
4298 }
4299 }
4300 /* Otherwise use a CNT-based sequence. */
4301 else if (factor != 0)
4302 {
4303 /* Use a subtraction if we have a negative factor. */
4304 rtx_code code = PLUS;
4305 if (factor < 0)
4306 {
4307 factor = -factor;
4308 code = MINUS;
4309 }
4310
4311 /* Calculate CNTD * FACTOR / 2. First try to fold the division
4312 into the multiplication. */
4313 rtx val;
4314 int shift = 0;
4315 if (factor & 1)
4316 /* Use a right shift by 1. */
4317 shift = -1;
4318 else
4319 factor /= 2;
4320 HOST_WIDE_INT low_bit = factor & -factor;
4321 if (factor <= 16 * low_bit)
4322 {
4323 if (factor > 16 * 8)
4324 {
4325 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
4326 the value with the minimum multiplier and shift it into
4327 position. */
4328 int extra_shift = exact_log2 (low_bit);
4329 shift += extra_shift;
4330 factor >>= extra_shift;
4331 }
4332 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
4333 }
4334 else
4335 {
4336 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
4337 directly, since that should increase the chances of being
4338 able to use a shift and add sequence. If LOW_BIT itself
4339 is out of range, just use CNTD. */
4340 if (low_bit <= 16 * 8)
4341 factor /= low_bit;
4342 else
4343 low_bit = 1;
4344
4345 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
4346 val = aarch64_force_temporary (mode, temp1, val);
4347
4348 if (can_create_pseudo_p ())
4349 {
4350 rtx coeff1 = gen_int_mode (factor, mode);
4351 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
4352 }
4353 else
4354 {
4355 /* Go back to using a negative multiplication factor if we have
4356 no register from which to subtract. */
4357 if (code == MINUS && src == const0_rtx)
4358 {
4359 factor = -factor;
4360 code = PLUS;
4361 }
4362 rtx coeff1 = gen_int_mode (factor, mode);
4363 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4364 val = gen_rtx_MULT (mode, val, coeff1);
4365 }
4366 }
4367
4368 if (shift > 0)
4369 {
4370 /* Multiply by 1 << SHIFT. */
4371 val = aarch64_force_temporary (mode, temp1, val);
4372 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4373 }
4374 else if (shift == -1)
4375 {
4376 /* Divide by 2. */
4377 val = aarch64_force_temporary (mode, temp1, val);
4378 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
4379 }
4380
4381 /* Calculate SRC +/- CNTD * FACTOR / 2. */
4382 if (src != const0_rtx)
4383 {
4384 val = aarch64_force_temporary (mode, temp1, val);
4385 val = gen_rtx_fmt_ee (code, mode, src, val);
4386 }
4387 else if (code == MINUS)
4388 {
4389 val = aarch64_force_temporary (mode, temp1, val);
4390 val = gen_rtx_NEG (mode, val);
4391 }
4392
4393 if (constant == 0 || frame_related_p)
4394 {
4395 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4396 if (frame_related_p)
4397 {
4398 RTX_FRAME_RELATED_P (insn) = true;
4399 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4400 gen_rtx_SET (dest, plus_constant (Pmode, src,
4401 poly_offset)));
4402 }
4403 src = dest;
4404 if (constant == 0)
4405 return;
4406 }
4407 else
4408 {
4409 src = aarch64_force_temporary (mode, temp1, val);
4410 temp1 = temp2;
4411 temp2 = NULL_RTX;
4412 }
4413
4414 emit_move_imm = true;
4415 }
4416
4417 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4418 frame_related_p, emit_move_imm);
4419 }
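
/* Worked example for the function above: an OFFSET of (34, 32) splits
   into a VG-based part of (32, 32), i.e. two full vector lengths, and a
   constant part of 2, so the addition becomes an ADDVL #2 followed by an
   ADD #2.  The function below is a minimal sketch of such a call; the
   wrapper is hypothetical and simply feeds fixed values to
   aarch64_add_offset.  */
static ATTRIBUTE_UNUSED void
aarch64_sketch_add_two_vl_plus_two (rtx dest, rtx src)
{
  /* No temporaries are supplied, so this relies on being used before
     register allocation, where new pseudos can be created.  */
  aarch64_add_offset (Pmode, dest, src, poly_int64 (34, 32),
		      NULL_RTX, NULL_RTX, false);
}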
4420
4421 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4422 than a poly_int64. */
4423
4424 void
4425 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4426 rtx offset_rtx, rtx temp1, rtx temp2)
4427 {
4428 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4429 temp1, temp2, false);
4430 }
4431
4432 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4433 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4434 if TEMP1 already contains abs (DELTA). */
4435
4436 static inline void
4437 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
4438 {
4439 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4440 temp1, temp2, true, emit_move_imm);
4441 }
4442
4443 /* Subtract DELTA from the stack pointer, marking the instructions
4444 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4445 if nonnull. */
4446
4447 static inline void
4448 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4449 bool emit_move_imm = true)
4450 {
4451 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4452 temp1, temp2, frame_related_p, emit_move_imm);
4453 }
4454
4455 /* Set DEST to (vec_series BASE STEP). */
4456
4457 static void
4458 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
4459 {
4460 machine_mode mode = GET_MODE (dest);
4461 scalar_mode inner = GET_MODE_INNER (mode);
4462
4463 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4464 if (!aarch64_sve_index_immediate_p (base))
4465 base = force_reg (inner, base);
4466 if (!aarch64_sve_index_immediate_p (step))
4467 step = force_reg (inner, step);
4468
4469 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4470 }
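
/* For example, an INDEX with BASE 0 and STEP 1 keeps both operands as
   immediates, since they fit the [-16, 15] range, whereas a STEP of 17
   is first forced into a register.  */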
4471
4472 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4473 register of mode MODE. Use TARGET for the result if it's nonnull
4474 and convenient.
4475
4476 The two vector modes must have the same element mode. The behavior
4477 is to duplicate architectural lane N of SRC into architectural lanes
4478 N + I * STEP of the result. On big-endian targets, architectural
4479 lane 0 of an Advanced SIMD vector is the last element of the vector
4480 in memory layout, so for big-endian targets this operation has the
4481 effect of reversing SRC before duplicating it. Callers need to
4482 account for this. */
4483
4484 rtx
4485 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4486 {
4487 machine_mode src_mode = GET_MODE (src);
4488 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4489 insn_code icode = (BYTES_BIG_ENDIAN
4490 ? code_for_aarch64_vec_duplicate_vq_be (mode)
4491 : code_for_aarch64_vec_duplicate_vq_le (mode));
4492
4493 unsigned int i = 0;
4494 expand_operand ops[3];
4495 create_output_operand (&ops[i++], target, mode);
4496 create_input_operand (&ops[i++], src, src_mode);
4497 if (BYTES_BIG_ENDIAN)
4498 {
4499 /* Create a PARALLEL describing the reversal of SRC. */
4500 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4501 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4502 nelts_per_vq - 1, -1);
4503 create_fixed_operand (&ops[i++], sel);
4504 }
4505 expand_insn (icode, i, ops);
4506 return ops[0].value;
4507 }
4508
4509 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4510 the memory image into DEST. Return true on success. */
4511
4512 static bool
4513 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4514 {
4515 src = force_const_mem (GET_MODE (src), src);
4516 if (!src)
4517 return false;
4518
4519 /* Make sure that the address is legitimate. */
4520 if (!aarch64_sve_ld1rq_operand_p (src))
4521 {
4522 rtx addr = force_reg (Pmode, XEXP (src, 0));
4523 src = replace_equiv_address (src, addr);
4524 }
4525
4526 machine_mode mode = GET_MODE (dest);
4527 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4528 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4529 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4530 return true;
4531 }
4532
4533 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4534 SVE data mode and isn't a legitimate constant. Use TARGET for the
4535 result if convenient.
4536
4537 The returned register can have whatever mode seems most natural
4538 given the contents of SRC. */
4539
4540 static rtx
4541 aarch64_expand_sve_const_vector (rtx target, rtx src)
4542 {
4543 machine_mode mode = GET_MODE (src);
4544 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4545 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4546 scalar_mode elt_mode = GET_MODE_INNER (mode);
4547 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4548 unsigned int container_bits = aarch64_sve_container_bits (mode);
4549 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4550
4551 if (nelts_per_pattern == 1
4552 && encoded_bits <= 128
4553 && container_bits != elt_bits)
4554 {
4555 /* We have a partial vector mode and a constant whose full-vector
4556 equivalent would occupy a repeating 128-bit sequence. Build that
4557 full-vector equivalent instead, so that we have the option of
4558 using LD1RQ and Advanced SIMD operations. */
4559 unsigned int repeat = container_bits / elt_bits;
4560 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4561 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4562 for (unsigned int i = 0; i < npatterns; ++i)
4563 for (unsigned int j = 0; j < repeat; ++j)
4564 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4565 target = aarch64_target_reg (target, full_mode);
4566 return aarch64_expand_sve_const_vector (target, builder.build ());
4567 }
4568
4569 if (nelts_per_pattern == 1 && encoded_bits == 128)
4570 {
4571 /* The constant is a duplicated quadword but can't be narrowed
4572 beyond a quadword. Get the memory image of the first quadword
4573 as a 128-bit vector and try using LD1RQ to load it from memory.
4574
4575 The effect for both endiannesses is to load memory lane N into
4576 architectural lanes N + I * STEP of the result. On big-endian
4577 targets, the layout of the 128-bit vector in an Advanced SIMD
4578 register would be different from its layout in an SVE register,
4579 but this 128-bit vector is a memory value only. */
4580 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4581 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4582 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4583 return target;
4584 }
4585
4586 if (nelts_per_pattern == 1 && encoded_bits < 128)
4587 {
4588 /* The vector is a repeating sequence of 64 bits or fewer.
4589 See if we can load them using an Advanced SIMD move and then
4590 duplicate it to fill a vector. This is better than using a GPR
4591 move because it keeps everything in the same register file. */
4592 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4593 rtx_vector_builder builder (vq_mode, npatterns, 1);
4594 for (unsigned int i = 0; i < npatterns; ++i)
4595 {
4596 /* We want memory lane N to go into architectural lane N,
4597 so reverse for big-endian targets. The DUP .Q pattern
4598 has a compensating reverse built-in. */
4599 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4600 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4601 }
4602 rtx vq_src = builder.build ();
4603 if (aarch64_simd_valid_immediate (vq_src, NULL))
4604 {
4605 vq_src = force_reg (vq_mode, vq_src);
4606 return aarch64_expand_sve_dupq (target, mode, vq_src);
4607 }
4608
4609 /* Get an integer representation of the repeating part of Advanced
4610 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4611 which for big-endian targets is lane-swapped wrt a normal
4612 Advanced SIMD vector. This means that for both endiannesses,
4613 memory lane N of SVE vector SRC corresponds to architectural
4614 lane N of a register holding VQ_SRC. This in turn means that
4615 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4616 as a single 128-bit value) and thus that memory lane 0 of SRC is
4617 in the lsb of the integer. Duplicating the integer therefore
4618 ensures that memory lane N of SRC goes into architectural lane
4619 N + I * INDEX of the SVE register. */
4620 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4621 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4622 if (elt_value)
4623 {
4624 /* Pretend that we had a vector of INT_MODE to start with. */
4625 elt_mode = int_mode;
4626 mode = aarch64_full_sve_mode (int_mode).require ();
4627
4628 /* If the integer can be moved into a general register by a
4629 single instruction, do that and duplicate the result. */
4630 if (CONST_INT_P (elt_value)
4631 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4632 {
4633 elt_value = force_reg (elt_mode, elt_value);
4634 return expand_vector_broadcast (mode, elt_value);
4635 }
4636 }
4637 else if (npatterns == 1)
4638 /* We're duplicating a single value, but can't do better than
4639 force it to memory and load from there. This handles things
4640 like symbolic constants. */
4641 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4642
4643 if (elt_value)
4644 {
4645 /* Load the element from memory if we can, otherwise move it into
4646 a register and use a DUP. */
4647 rtx op = force_const_mem (elt_mode, elt_value);
4648 if (!op)
4649 op = force_reg (elt_mode, elt_value);
4650 return expand_vector_broadcast (mode, op);
4651 }
4652 }
4653
4654 /* Try using INDEX. */
4655 rtx base, step;
4656 if (const_vec_series_p (src, &base, &step))
4657 {
4658 aarch64_expand_vec_series (target, base, step);
4659 return target;
4660 }
4661
4662 /* From here on, it's better to force the whole constant to memory
4663 if we can. */
4664 if (GET_MODE_NUNITS (mode).is_constant ())
4665 return NULL_RTX;
4666
4667 /* Expand each pattern individually. */
4668 gcc_assert (npatterns > 1);
4669 rtx_vector_builder builder;
4670 auto_vec<rtx, 16> vectors (npatterns);
4671 for (unsigned int i = 0; i < npatterns; ++i)
4672 {
4673 builder.new_vector (mode, 1, nelts_per_pattern);
4674 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4675 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4676 vectors.quick_push (force_reg (mode, builder.build ()));
4677 }
4678
4679 /* Use permutes to interleave the separate vectors. */
4680 while (npatterns > 1)
4681 {
4682 npatterns /= 2;
4683 for (unsigned int i = 0; i < npatterns; ++i)
4684 {
4685 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4686 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4687 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4688 vectors[i] = tmp;
4689 }
4690 }
4691 gcc_assert (vectors[0] == target);
4692 return target;
4693 }
4694
4695 /* Use WHILE to set a predicate register of mode MODE in which the first
4696 VL bits are set and the rest are clear. Use TARGET for the register
4697 if it's nonnull and convenient. */
4698
4699 static rtx
4700 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4701 unsigned int vl)
4702 {
4703 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4704 target = aarch64_target_reg (target, mode);
4705 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
4706 target, const0_rtx, limit));
4707 return target;
4708 }
4709
4710 static rtx
4711 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4712
4713 /* BUILDER is a constant predicate in which the index of every set bit
4714 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4715 by inverting every element at a multiple of ELT_SIZE and EORing the
4716 result with an ELT_SIZE PTRUE.
4717
4718 Return a register that contains the constant on success, otherwise
4719 return null. Use TARGET as the register if it is nonnull and
4720 convenient. */
4721
4722 static rtx
4723 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4724 unsigned int elt_size)
4725 {
4726 /* Invert every element at a multiple of ELT_SIZE, keeping the
4727 other bits zero. */
4728 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4729 builder.nelts_per_pattern ());
4730 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4731 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4732 inv_builder.quick_push (const1_rtx);
4733 else
4734 inv_builder.quick_push (const0_rtx);
4735 inv_builder.finalize ();
4736
4737 /* See if we can load the constant cheaply. */
4738 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4739 if (!inv)
4740 return NULL_RTX;
4741
4742 /* EOR the result with an ELT_SIZE PTRUE. */
4743 rtx mask = aarch64_ptrue_all (elt_size);
4744 mask = force_reg (VNx16BImode, mask);
4745 target = aarch64_target_reg (target, VNx16BImode);
4746 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4747 return target;
4748 }
4749
4750 /* BUILDER is a constant predicate in which the index of every set bit
4751 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4752 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4753 register on success, otherwise return null. Use TARGET as the register
4754 if nonnull and convenient. */
4755
4756 static rtx
4757 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4758 unsigned int elt_size,
4759 unsigned int permute_size)
4760 {
4761 /* We're going to split the constant into two new constants A and B,
4762 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4763 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4764
4765 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4766 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4767
4768 where _ indicates elements that will be discarded by the permute.
4769
4770 First calculate the ELT_SIZEs for A and B. */
4771 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4772 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4773 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4774 if (INTVAL (builder.elt (i)) != 0)
4775 {
4776 if (i & permute_size)
4777 b_elt_size |= i - permute_size;
4778 else
4779 a_elt_size |= i;
4780 }
4781 a_elt_size &= -a_elt_size;
4782 b_elt_size &= -b_elt_size;
4783
4784 /* Now construct the vectors themselves. */
4785 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
4786 builder.nelts_per_pattern ());
4787 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
4788 builder.nelts_per_pattern ());
4789 unsigned int nelts = builder.encoded_nelts ();
4790 for (unsigned int i = 0; i < nelts; ++i)
4791 if (i & (elt_size - 1))
4792 {
4793 a_builder.quick_push (const0_rtx);
4794 b_builder.quick_push (const0_rtx);
4795 }
4796 else if ((i & permute_size) == 0)
4797 {
4798 /* The A and B elements are significant. */
4799 a_builder.quick_push (builder.elt (i));
4800 b_builder.quick_push (builder.elt (i + permute_size));
4801 }
4802 else
4803 {
4804 /* The A and B elements are going to be discarded, so pick whatever
4805 is likely to give a nice constant. We are targeting element
4806 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4807 with the aim of each being a sequence of ones followed by
4808 a sequence of zeros. So:
4809
4810 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4811 duplicate the last X_ELT_SIZE element, to extend the
4812 current sequence of ones or zeros.
4813
4814 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4815 zero, so that the constant really does have X_ELT_SIZE and
4816 not a smaller size. */
4817 if (a_elt_size > permute_size)
4818 a_builder.quick_push (const0_rtx);
4819 else
4820 a_builder.quick_push (a_builder.elt (i - a_elt_size));
4821 if (b_elt_size > permute_size)
4822 b_builder.quick_push (const0_rtx);
4823 else
4824 b_builder.quick_push (b_builder.elt (i - b_elt_size));
4825 }
4826 a_builder.finalize ();
4827 b_builder.finalize ();
4828
4829 /* Try loading A into a register. */
4830 rtx_insn *last = get_last_insn ();
4831 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4832 if (!a)
4833 return NULL_RTX;
4834
4835 /* Try loading B into a register. */
4836 rtx b = a;
4837 if (a_builder != b_builder)
4838 {
4839 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4840 if (!b)
4841 {
4842 delete_insns_since (last);
4843 return NULL_RTX;
4844 }
4845 }
4846
4847 /* Emit the TRN1 itself. */
4848 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4849 target = aarch64_target_reg (target, mode);
4850 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4851 gen_lowpart (mode, a),
4852 gen_lowpart (mode, b)));
4853 return target;
4854 }
4855
4856 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4857 constant in BUILDER into an SVE predicate register. Return the register
4858 on success, otherwise return null. Use TARGET for the register if
4859 nonnull and convenient.
4860
4861 ALLOW_RECURSE_P is true if we can use methods that would call this
4862 function recursively. */
4863
4864 static rtx
4865 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4866 bool allow_recurse_p)
4867 {
4868 if (builder.encoded_nelts () == 1)
4869 /* A PFALSE or a PTRUE .B ALL. */
4870 return aarch64_emit_set_immediate (target, builder);
4871
4872 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4873 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4874 {
4875 /* If we can load the constant using PTRUE, use it as-is. */
4876 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4877 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4878 return aarch64_emit_set_immediate (target, builder);
4879
4880 /* Otherwise use WHILE to set the first VL bits. */
4881 return aarch64_sve_move_pred_via_while (target, mode, vl);
4882 }
4883
4884 if (!allow_recurse_p)
4885 return NULL_RTX;
4886
4887 /* Try inverting the vector in element size ELT_SIZE and then EORing
4888 the result with an ELT_SIZE PTRUE. */
4889 if (INTVAL (builder.elt (0)) == 0)
4890 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4891 elt_size))
4892 return res;
4893
4894 /* Try using TRN1 to permute two simpler constants. */
4895 for (unsigned int i = elt_size; i <= 8; i *= 2)
4896 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4897 elt_size, i))
4898 return res;
4899
4900 return NULL_RTX;
4901 }
4902
4903 /* Return an SVE predicate register that contains the VNx16BImode
4904 constant in BUILDER, without going through the move expanders.
4905
4906 The returned register can have whatever mode seems most natural
4907 given the contents of BUILDER. Use TARGET for the result if
4908 convenient. */
4909
4910 static rtx
4911 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4912 {
4913 /* Try loading the constant using pure predicate operations. */
4914 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4915 return res;
4916
4917 /* Try forcing the constant to memory. */
4918 if (builder.full_nelts ().is_constant ())
4919 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4920 {
4921 target = aarch64_target_reg (target, VNx16BImode);
4922 emit_move_insn (target, mem);
4923 return target;
4924 }
4925
4926 /* The last resort is to load the constant as an integer and then
4927 compare it against zero. Use -1 for set bits in order to increase
4928 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4929 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4930 builder.nelts_per_pattern ());
4931 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4932 int_builder.quick_push (INTVAL (builder.elt (i))
4933 ? constm1_rtx : const0_rtx);
4934 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4935 int_builder.build ());
4936 }
4937
4938 /* Set DEST to immediate IMM. */
4939
4940 void
4941 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4942 {
4943 machine_mode mode = GET_MODE (dest);
4944
4945 /* Check on what type of symbol it is. */
4946 scalar_int_mode int_mode;
4947 if ((GET_CODE (imm) == SYMBOL_REF
4948 || GET_CODE (imm) == LABEL_REF
4949 || GET_CODE (imm) == CONST
4950 || GET_CODE (imm) == CONST_POLY_INT)
4951 && is_a <scalar_int_mode> (mode, &int_mode))
4952 {
4953 rtx mem;
4954 poly_int64 offset;
4955 HOST_WIDE_INT const_offset;
4956 enum aarch64_symbol_type sty;
4957
4958 /* If we have (const (plus symbol offset)), separate out the offset
4959 before we start classifying the symbol. */
4960 rtx base = strip_offset (imm, &offset);
4961
4962 /* We must always add an offset involving VL separately, rather than
4963 folding it into the relocation. */
4964 if (!offset.is_constant (&const_offset))
4965 {
4966 if (!TARGET_SVE)
4967 {
4968 aarch64_report_sve_required ();
4969 return;
4970 }
4971 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4972 emit_insn (gen_rtx_SET (dest, imm));
4973 else
4974 {
4975 /* Do arithmetic on 32-bit values if the result is smaller
4976 than that. */
4977 if (partial_subreg_p (int_mode, SImode))
4978 {
4979 /* It is invalid to do symbol calculations in modes
4980 narrower than SImode. */
4981 gcc_assert (base == const0_rtx);
4982 dest = gen_lowpart (SImode, dest);
4983 int_mode = SImode;
4984 }
4985 if (base != const0_rtx)
4986 {
4987 base = aarch64_force_temporary (int_mode, dest, base);
4988 aarch64_add_offset (int_mode, dest, base, offset,
4989 NULL_RTX, NULL_RTX, false);
4990 }
4991 else
4992 aarch64_add_offset (int_mode, dest, base, offset,
4993 dest, NULL_RTX, false);
4994 }
4995 return;
4996 }
4997
4998 sty = aarch64_classify_symbol (base, const_offset);
4999 switch (sty)
5000 {
5001 case SYMBOL_FORCE_TO_MEM:
5002 if (const_offset != 0
5003 && targetm.cannot_force_const_mem (int_mode, imm))
5004 {
5005 gcc_assert (can_create_pseudo_p ());
5006 base = aarch64_force_temporary (int_mode, dest, base);
5007 aarch64_add_offset (int_mode, dest, base, const_offset,
5008 NULL_RTX, NULL_RTX, false);
5009 return;
5010 }
5011
5012 mem = force_const_mem (ptr_mode, imm);
5013 gcc_assert (mem);
5014
5015 /* If we aren't generating PC relative literals, then
5016 we need to expand the literal pool access carefully.
5017 This is something that needs to be done in a number
5018 of places, so could well live as a separate function. */
5019 if (!aarch64_pcrelative_literal_loads)
5020 {
5021 gcc_assert (can_create_pseudo_p ());
5022 base = gen_reg_rtx (ptr_mode);
5023 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
5024 if (ptr_mode != Pmode)
5025 base = convert_memory_address (Pmode, base);
5026 mem = gen_rtx_MEM (ptr_mode, base);
5027 }
5028
5029 if (int_mode != ptr_mode)
5030 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
5031
5032 emit_insn (gen_rtx_SET (dest, mem));
5033
5034 return;
5035
5036 case SYMBOL_SMALL_TLSGD:
5037 case SYMBOL_SMALL_TLSDESC:
5038 case SYMBOL_SMALL_TLSIE:
5039 case SYMBOL_SMALL_GOT_28K:
5040 case SYMBOL_SMALL_GOT_4G:
5041 case SYMBOL_TINY_GOT:
5042 case SYMBOL_TINY_TLSIE:
5043 if (const_offset != 0)
5044 {
5045 gcc_assert (can_create_pseudo_p ());
5046 base = aarch64_force_temporary (int_mode, dest, base);
5047 aarch64_add_offset (int_mode, dest, base, const_offset,
5048 NULL_RTX, NULL_RTX, false);
5049 return;
5050 }
5051 /* FALLTHRU */
5052
5053 case SYMBOL_SMALL_ABSOLUTE:
5054 case SYMBOL_TINY_ABSOLUTE:
5055 case SYMBOL_TLSLE12:
5056 case SYMBOL_TLSLE24:
5057 case SYMBOL_TLSLE32:
5058 case SYMBOL_TLSLE48:
5059 aarch64_load_symref_appropriately (dest, imm, sty);
5060 return;
5061
5062 default:
5063 gcc_unreachable ();
5064 }
5065 }
5066
5067 if (!CONST_INT_P (imm))
5068 {
5069 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
5070 {
5071 /* Only the low bit of each .H, .S and .D element is defined,
5072 so we can set the upper bits to whatever we like. If the
5073 predicate is all-true in MODE, prefer to set all the undefined
5074 bits as well, so that we can share a single .B predicate for
5075 all modes. */
5076 if (imm == CONSTM1_RTX (mode))
5077 imm = CONSTM1_RTX (VNx16BImode);
5078
5079 /* All methods for constructing predicate modes wider than VNx16BI
5080 will set the upper bits of each element to zero. Expose this
5081 by moving such constants as a VNx16BI, so that all bits are
5082 significant and so that constants for different modes can be
5083 shared. The wider constant will still be available as a
5084 REG_EQUAL note. */
5085 rtx_vector_builder builder;
5086 if (aarch64_get_sve_pred_bits (builder, imm))
5087 {
5088 rtx res = aarch64_expand_sve_const_pred (dest, builder);
5089 if (dest != res)
5090 emit_move_insn (dest, gen_lowpart (mode, res));
5091 return;
5092 }
5093 }
5094
5095 if (GET_CODE (imm) == HIGH
5096 || aarch64_simd_valid_immediate (imm, NULL))
5097 {
5098 emit_insn (gen_rtx_SET (dest, imm));
5099 return;
5100 }
5101
5102 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
5103 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
5104 {
5105 if (dest != res)
5106 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
5107 return;
5108 }
5109
5110 rtx mem = force_const_mem (mode, imm);
5111 gcc_assert (mem);
5112 emit_move_insn (dest, mem);
5113 return;
5114 }
5115
5116 aarch64_internal_mov_immediate (dest, imm, true,
5117 as_a <scalar_int_mode> (mode));
5118 }
5119
5120 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
5121 that is known to contain PTRUE. */
5122
5123 void
5124 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
5125 {
5126 expand_operand ops[3];
5127 machine_mode mode = GET_MODE (dest);
5128 create_output_operand (&ops[0], dest, mode);
5129 create_input_operand (&ops[1], pred, GET_MODE (pred));
5130 create_input_operand (&ops[2], src, mode);
5131 temporary_volatile_ok v (true);
5132 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
5133 }
5134
5135 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
5136 operand is in memory. In this case we need to use the predicated LD1
5137 and ST1 instead of LDR and STR, both for correctness on big-endian
5138 targets and because LD1 and ST1 support a wider range of addressing modes.
5139 PRED_MODE is the mode of the predicate.
5140
5141 See the comment at the head of aarch64-sve.md for details about the
5142 big-endian handling. */
5143
5144 void
5145 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
5146 {
5147 machine_mode mode = GET_MODE (dest);
5148 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5149 if (!register_operand (src, mode)
5150 && !register_operand (dest, mode))
5151 {
5152 rtx tmp = gen_reg_rtx (mode);
5153 if (MEM_P (src))
5154 aarch64_emit_sve_pred_move (tmp, ptrue, src);
5155 else
5156 emit_move_insn (tmp, src);
5157 src = tmp;
5158 }
5159 aarch64_emit_sve_pred_move (dest, ptrue, src);
5160 }
5161
5162 /* Called only on big-endian targets. See whether an SVE vector move
5163 from SRC to DEST is effectively a REV[BHW] instruction, because at
5164 least one operand is a subreg of an SVE vector that has wider or
5165 narrower elements. Return true and emit the instruction if so.
5166
5167 For example:
5168
5169 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
5170
5171 represents a VIEW_CONVERT between the following vectors, viewed
5172 in memory order:
5173
5174 R2: { [0].high, [0].low, [1].high, [1].low, ... }
5175 R1: { [0], [1], [2], [3], ... }
5176
5177 The high part of lane X in R2 should therefore correspond to lane X*2
5178 of R1, but the register representations are:
5179
5180 msb lsb
5181 R2: ...... [1].high [1].low [0].high [0].low
5182 R1: ...... [3] [2] [1] [0]
5183
5184 where the low part of lane X in R2 corresponds to lane X*2 in R1.
5185 We therefore need a reverse operation to swap the high and low values
5186 around.
5187
5188 This is purely an optimization. Without it we would spill the
5189 subreg operand to the stack in one mode and reload it in the
5190 other mode, which has the same effect as the REV. */
5191
5192 bool
5193 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
5194 {
5195 gcc_assert (BYTES_BIG_ENDIAN);
5196 if (GET_CODE (dest) == SUBREG)
5197 dest = SUBREG_REG (dest);
5198 if (GET_CODE (src) == SUBREG)
5199 src = SUBREG_REG (src);
5200
5201 /* The optimization handles two single SVE REGs with different element
5202 sizes. */
5203 if (!REG_P (dest)
5204 || !REG_P (src)
5205 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
5206 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
5207 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
5208 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
5209 return false;
5210
5211 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
5212 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
5213 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
5214 UNSPEC_REV_SUBREG);
5215 emit_insn (gen_rtx_SET (dest, unspec));
5216 return true;
5217 }
5218
5219 /* Return a copy of X with mode MODE, without changing its other
5220 attributes. Unlike gen_lowpart, this doesn't care whether the
5221 mode change is valid. */
5222
5223 rtx
5224 aarch64_replace_reg_mode (rtx x, machine_mode mode)
5225 {
5226 if (GET_MODE (x) == mode)
5227 return x;
5228
5229 x = shallow_copy_rtx (x);
5230 set_mode_and_regno (x, mode, REGNO (x));
5231 return x;
5232 }
5233
5234 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
5235 stored in wider integer containers. */
5236
5237 static unsigned int
5238 aarch64_sve_rev_unspec (machine_mode mode)
5239 {
5240 switch (GET_MODE_UNIT_SIZE (mode))
5241 {
5242 case 1: return UNSPEC_REVB;
5243 case 2: return UNSPEC_REVH;
5244 case 4: return UNSPEC_REVW;
5245 }
5246 gcc_unreachable ();
5247 }
5248
5249 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
5250 operands. */
5251
5252 void
5253 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
5254 {
5255 /* Decide which REV operation we need. The mode with wider elements
5256 determines the mode of the operands and the mode with the narrower
5257 elements determines the reverse width. */
5258 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
5259 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
5260 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
5261 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
5262 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
5263
5264 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
5265 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
5266
5267 /* Get the operands in the appropriate modes and emit the instruction. */
5268 ptrue = gen_lowpart (pred_mode, ptrue);
5269 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
5270 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
5271 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
5272 dest, ptrue, src));
5273 }
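/* For example (an illustrative sketch, register numbers arbitrary), when
   the original move was

       (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))

   the mode with the wider elements is VNx8HI and the mode with the
   narrower elements is VNx16QI, so the split emits a byte reversal
   within each halfword container, roughly:

       revb    z1.h, p0/m, z2.h  */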
5274
5275 static bool
5276 aarch64_function_ok_for_sibcall (tree, tree exp)
5277 {
5278 if (crtl->abi->id () != expr_callee_abi (exp).id ())
5279 return false;
5280
5281 return true;
5282 }
5283
5284 /* Subroutine of aarch64_pass_by_reference for arguments that are not
5285 passed in SVE registers. */
5286
5287 static bool
5288 aarch64_pass_by_reference_1 (const function_arg_info &arg)
5289 {
5290 HOST_WIDE_INT size;
5291 machine_mode dummymode;
5292 int nregs;
5293
5294 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
5295 if (arg.mode == BLKmode && arg.type)
5296 size = int_size_in_bytes (arg.type);
5297 else
5298 /* No frontends can create types with variable-sized modes, so we
5299 shouldn't be asked to pass or return them. */
5300 size = GET_MODE_SIZE (arg.mode).to_constant ();
5301
5302 /* Aggregates are passed by reference based on their size. */
5303 if (arg.aggregate_type_p ())
5304 size = int_size_in_bytes (arg.type);
5305
5306 /* Variable-sized arguments are always passed by reference. */
5307 if (size < 0)
5308 return true;
5309
5310 /* Can this be a candidate to be passed in fp/simd register(s)? */
5311 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
5312 &dummymode, &nregs,
5313 NULL))
5314 return false;
5315
5316 /* Arguments which are variable-sized or larger than 2 registers are
5317 passed by reference unless they are a homogeneous floating-point
5318 aggregate. */
5319 return size > 2 * UNITS_PER_WORD;
5320 }
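/* For example, a plain 24-byte structure of three 64-bit integers exceeds
   2 * UNITS_PER_WORD and so is passed by reference, whereas a 32-byte
   structure of four doubles is a homogeneous floating-point aggregate,
   is caught by the fp/simd candidate check above, and is therefore
   passed by value (in vector registers or on the stack). */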
5321
5322 /* Implement TARGET_PASS_BY_REFERENCE. */
5323
5324 static bool
5325 aarch64_pass_by_reference (cumulative_args_t pcum_v,
5326 const function_arg_info &arg)
5327 {
5328 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5329
5330 if (!arg.type)
5331 return aarch64_pass_by_reference_1 (arg);
5332
5333 pure_scalable_type_info pst_info;
5334 switch (pst_info.analyze (arg.type))
5335 {
5336 case pure_scalable_type_info::IS_PST:
5337 if (pcum && !pcum->silent_p && !TARGET_SVE)
5338 /* We can't gracefully recover at this point, so make this a
5339 fatal error. */
5340 fatal_error (input_location, "arguments of type %qT require"
5341 " the SVE ISA extension", arg.type);
5342
5343 /* Variadic SVE types are passed by reference. Normal non-variadic
5344 arguments are too if we've run out of registers. */
5345 return (!arg.named
5346 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
5347 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
5348
5349 case pure_scalable_type_info::DOESNT_MATTER:
5350 gcc_assert (aarch64_pass_by_reference_1 (arg));
5351 return true;
5352
5353 case pure_scalable_type_info::NO_ABI_IDENTITY:
5354 case pure_scalable_type_info::ISNT_PST:
5355 return aarch64_pass_by_reference_1 (arg);
5356 }
5357 gcc_unreachable ();
5358 }
5359
5360 /* Return TRUE if VALTYPE is padded to its least significant bits. */
5361 static bool
5362 aarch64_return_in_msb (const_tree valtype)
5363 {
5364 machine_mode dummy_mode;
5365 int dummy_int;
5366
5367 /* Never happens in little-endian mode. */
5368 if (!BYTES_BIG_ENDIAN)
5369 return false;
5370
5371 /* Only composite types smaller than or equal to 16 bytes can
5372 potentially be returned in registers. */
5373 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
5374 || int_size_in_bytes (valtype) <= 0
5375 || int_size_in_bytes (valtype) > 16)
5376 return false;
5377
5378 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
5379 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
5380 is always passed/returned in the least significant bits of fp/simd
5381 register(s). */
5382 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
5383 &dummy_mode, &dummy_int, NULL))
5384 return false;
5385
5386 /* Likewise pure scalable types for SVE vector and predicate registers. */
5387 pure_scalable_type_info pst_info;
5388 if (pst_info.analyze_registers (valtype))
5389 return false;
5390
5391 return true;
5392 }
5393
5394 /* Implement TARGET_FUNCTION_VALUE.
5395 Define how to find the value returned by a function. */
5396
5397 static rtx
5398 aarch64_function_value (const_tree type, const_tree func,
5399 bool outgoing ATTRIBUTE_UNUSED)
5400 {
5401 machine_mode mode;
5402 int unsignedp;
5403
5404 mode = TYPE_MODE (type);
5405 if (INTEGRAL_TYPE_P (type))
5406 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
5407
5408 pure_scalable_type_info pst_info;
5409 if (type && pst_info.analyze_registers (type))
5410 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
5411
5412 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5413 are returned in memory, not by value. */
5414 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5415 bool sve_p = (vec_flags & VEC_ANY_SVE);
5416
5417 if (aarch64_return_in_msb (type))
5418 {
5419 HOST_WIDE_INT size = int_size_in_bytes (type);
5420
5421 if (size % UNITS_PER_WORD != 0)
5422 {
5423 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
5424 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
5425 }
5426 }
5427
5428 int count;
5429 machine_mode ag_mode;
5430 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
5431 &ag_mode, &count, NULL))
5432 {
5433 gcc_assert (!sve_p);
5434 if (!aarch64_composite_type_p (type, mode))
5435 {
5436 gcc_assert (count == 1 && mode == ag_mode);
5437 return gen_rtx_REG (mode, V0_REGNUM);
5438 }
5439 else
5440 {
5441 int i;
5442 rtx par;
5443
5444 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
5445 for (i = 0; i < count; i++)
5446 {
5447 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
5448 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5449 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5450 XVECEXP (par, 0, i) = tmp;
5451 }
5452 return par;
5453 }
5454 }
5455 else
5456 {
5457 if (sve_p)
5458 {
5459 /* Vector types can acquire a partial SVE mode using things like
5460 __attribute__((vector_size(N))), and this is potentially useful.
5461 However, the choice of mode doesn't affect the type's ABI
5462 identity, so we should treat the types as though they had
5463 the associated integer mode, just like they did before SVE
5464 was introduced.
5465
5466 We know that the vector must be 128 bits or smaller,
5467 otherwise we'd have returned it in memory instead. */
5468 gcc_assert (type
5469 && (aarch64_some_values_include_pst_objects_p (type)
5470 || (vec_flags & VEC_PARTIAL)));
5471
5472 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5473 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
5474 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5475 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5476 }
5477 return gen_rtx_REG (mode, R0_REGNUM);
5478 }
5479 }
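/* For instance, a function returning a homogeneous floating-point
   aggregate of four floats reaches the composite case above and yields
   a (parallel [...]) of V0-V3 in SFmode at byte offsets 0, 4, 8 and 12,
   whereas a plain int is returned directly in R0 (after any promotion
   applied by promote_function_mode above). */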
5480
5481 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5482 Return true if REGNO is the number of a hard register in which the values
5483 of a called function may come back. */
5484
5485 static bool
5486 aarch64_function_value_regno_p (const unsigned int regno)
5487 {
5488 /* A maximum of 16 bytes can be returned in the general registers. Examples
5489 of 16-byte return values are 128-bit integers and 16-byte small
5490 structures (excluding homogeneous floating-point aggregates). */
5491 if (regno == R0_REGNUM || regno == R1_REGNUM)
5492 return true;
5493
5494 /* Up to four fp/simd registers can return a function value, e.g. a
5495 homogeneous floating-point aggregate having four members. */
5496 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
5497 return TARGET_FLOAT;
5498
5499 return false;
5500 }
5501
5502 /* Subroutine for aarch64_return_in_memory for types that are not returned
5503 in SVE registers. */
5504
5505 static bool
5506 aarch64_return_in_memory_1 (const_tree type)
5507 {
5508 HOST_WIDE_INT size;
5509 machine_mode ag_mode;
5510 int count;
5511
5512 if (!AGGREGATE_TYPE_P (type)
5513 && TREE_CODE (type) != COMPLEX_TYPE
5514 && TREE_CODE (type) != VECTOR_TYPE)
5515 /* Simple scalar types are always returned in registers. */
5516 return false;
5517
5518 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
5519 type,
5520 &ag_mode,
5521 &count,
5522 NULL))
5523 return false;
5524
5525 /* Types larger than 2 registers are returned in memory. */
5526 size = int_size_in_bytes (type);
5527 return (size < 0 || size > 2 * UNITS_PER_WORD);
5528 }
5529
5530 /* Implement TARGET_RETURN_IN_MEMORY.
5531
5532 If the type T of the result of a function is such that
5533 void func (T arg)
5534 would require that arg be passed as a value in a register (or set of
5535 registers) according to the parameter passing rules, then the result
5536 is returned in the same registers as would be used for such an
5537 argument. */
5538
5539 static bool
5540 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5541 {
5542 pure_scalable_type_info pst_info;
5543 switch (pst_info.analyze (type))
5544 {
5545 case pure_scalable_type_info::IS_PST:
5546 return (pst_info.num_zr () > NUM_FP_ARG_REGS
5547 || pst_info.num_pr () > NUM_PR_ARG_REGS);
5548
5549 case pure_scalable_type_info::DOESNT_MATTER:
5550 gcc_assert (aarch64_return_in_memory_1 (type));
5551 return true;
5552
5553 case pure_scalable_type_info::NO_ABI_IDENTITY:
5554 case pure_scalable_type_info::ISNT_PST:
5555 return aarch64_return_in_memory_1 (type);
5556 }
5557 gcc_unreachable ();
5558 }
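/* For example, a 32-byte structure of four 64-bit integers is neither a
   pure scalable type nor an fp/simd return candidate, and it exceeds
   2 * UNITS_PER_WORD, so it is returned in memory; under AAPCS64 the
   caller then supplies the result address in the indirect result
   register X8. */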
5559
5560 static bool
5561 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
5562 const_tree type, int *nregs)
5563 {
5564 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5565 return aarch64_vfp_is_call_or_return_candidate (mode,
5566 type,
5567 &pcum->aapcs_vfp_rmode,
5568 nregs,
5569 NULL);
5570 }
5571
5572 /* Given MODE and TYPE of a function argument, return the alignment in
5573 bits. The idea is to suppress any stronger alignment requested by
5574 the user and opt for the natural alignment (specified in AAPCS64 \S
5575 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5576 calculated in versions of GCC prior to GCC-9. This is a helper
5577 function for local use only. */
5578
5579 static unsigned int
5580 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5581 bool *abi_break)
5582 {
5583 *abi_break = false;
5584 if (!type)
5585 return GET_MODE_ALIGNMENT (mode);
5586
5587 if (integer_zerop (TYPE_SIZE (type)))
5588 return 0;
5589
5590 gcc_assert (TYPE_MODE (type) == mode);
5591
5592 if (!AGGREGATE_TYPE_P (type))
5593 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
5594
5595 if (TREE_CODE (type) == ARRAY_TYPE)
5596 return TYPE_ALIGN (TREE_TYPE (type));
5597
5598 unsigned int alignment = 0;
5599 unsigned int bitfield_alignment = 0;
5600 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5601 if (TREE_CODE (field) == FIELD_DECL)
5602 {
5603 alignment = std::max (alignment, DECL_ALIGN (field));
5604 if (DECL_BIT_FIELD_TYPE (field))
5605 bitfield_alignment
5606 = std::max (bitfield_alignment,
5607 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5608 }
5609
5610 if (bitfield_alignment > alignment)
5611 {
5612 *abi_break = true;
5613 return bitfield_alignment;
5614 }
5615
5616 return alignment;
5617 }
5618
5619 /* Layout a function argument according to the AAPCS64 rules. The rule
5620 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5621 mode that was originally given to us by the target hook, whereas the
5622 mode in ARG might be the result of replacing partial SVE modes with
5623 the equivalent integer mode. */
5624
5625 static void
5626 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5627 {
5628 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5629 tree type = arg.type;
5630 machine_mode mode = arg.mode;
5631 int ncrn, nvrn, nregs;
5632 bool allocate_ncrn, allocate_nvrn;
5633 HOST_WIDE_INT size;
5634 bool abi_break;
5635
5636 /* We need to do this once per argument. */
5637 if (pcum->aapcs_arg_processed)
5638 return;
5639
5640 pcum->aapcs_arg_processed = true;
5641
5642 pure_scalable_type_info pst_info;
5643 if (type && pst_info.analyze_registers (type))
5644 {
5645 /* The PCS says that it is invalid to pass an SVE value to an
5646 unprototyped function. There is no ABI-defined location we
5647 can return in this case, so we have no real choice but to raise
5648 an error immediately, even though this is only a query function. */
5649 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5650 {
5651 gcc_assert (!pcum->silent_p);
5652 error ("SVE type %qT cannot be passed to an unprototyped function",
5653 arg.type);
5654 /* Avoid repeating the message, and avoid tripping the assert
5655 below. */
5656 pcum->pcs_variant = ARM_PCS_SVE;
5657 }
5658
5659 /* We would have converted the argument into pass-by-reference
5660 form if it didn't fit in registers. */
5661 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
5662 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
5663 gcc_assert (arg.named
5664 && pcum->pcs_variant == ARM_PCS_SVE
5665 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5666 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5667 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
5668 P0_REGNUM + pcum->aapcs_nprn);
5669 return;
5670 }
5671
5672 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5673 are passed by reference, not by value. */
5674 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5675 bool sve_p = (vec_flags & VEC_ANY_SVE);
5676 if (sve_p)
5677 /* Vector types can acquire a partial SVE mode using things like
5678 __attribute__((vector_size(N))), and this is potentially useful.
5679 However, the choice of mode doesn't affect the type's ABI
5680 identity, so we should treat the types as though they had
5681 the associated integer mode, just like they did before SVE
5682 was introduced.
5683
5684 We know that the vector must be 128 bits or smaller,
5685 otherwise we'd have passed it in memory instead. */
5686 gcc_assert (type
5687 && (aarch64_some_values_include_pst_objects_p (type)
5688 || (vec_flags & VEC_PARTIAL)));
5689
5690 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
5691 if (type)
5692 size = int_size_in_bytes (type);
5693 else
5694 /* No frontends can create types with variable-sized modes, so we
5695 shouldn't be asked to pass or return them. */
5696 size = GET_MODE_SIZE (mode).to_constant ();
5697 size = ROUND_UP (size, UNITS_PER_WORD);
5698
5699 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
5700 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
5701 mode,
5702 type,
5703 &nregs);
5704 gcc_assert (!sve_p || !allocate_nvrn);
5705
5706 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
5707 The following code thus handles passing by SIMD/FP registers first. */
5708
5709 nvrn = pcum->aapcs_nvrn;
5710
5711 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
5712 and homogeneous short-vector aggregates (HVA). */
5713 if (allocate_nvrn)
5714 {
5715 if (!pcum->silent_p && !TARGET_FLOAT)
5716 aarch64_err_no_fpadvsimd (mode);
5717
5718 if (nvrn + nregs <= NUM_FP_ARG_REGS)
5719 {
5720 pcum->aapcs_nextnvrn = nvrn + nregs;
5721 if (!aarch64_composite_type_p (type, mode))
5722 {
5723 gcc_assert (nregs == 1);
5724 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
5725 }
5726 else
5727 {
5728 rtx par;
5729 int i;
5730 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5731 for (i = 0; i < nregs; i++)
5732 {
5733 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
5734 V0_REGNUM + nvrn + i);
5735 rtx offset = gen_int_mode
5736 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
5737 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5738 XVECEXP (par, 0, i) = tmp;
5739 }
5740 pcum->aapcs_reg = par;
5741 }
5742 return;
5743 }
5744 else
5745 {
5746 /* C.3 NSRN is set to 8. */
5747 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
5748 goto on_stack;
5749 }
5750 }
5751
5752 ncrn = pcum->aapcs_ncrn;
5753 nregs = size / UNITS_PER_WORD;
5754
5755 /* C6 - C9, though the sign and zero extension semantics are
5756 handled elsewhere. This is the case where the argument fits
5757 entirely in general registers. */
5758 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
5759 {
5760 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
5761
5762 /* C.8 if the argument has an alignment of 16 then the NGRN is
5763 rounded up to the next even number. */
5764 if (nregs == 2
5765 && ncrn % 2
5766 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5767 comparison is there because for > 16 * BITS_PER_UNIT
5768 alignment nregs should be > 2 and therefore it should be
5769 passed by reference rather than value. */
5770 && (aarch64_function_arg_alignment (mode, type, &abi_break)
5771 == 16 * BITS_PER_UNIT))
5772 {
5773 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5774 inform (input_location, "parameter passing for argument of type "
5775 "%qT changed in GCC 9.1", type);
5776 ++ncrn;
5777 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
5778 }
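/* For example, for the argument list (int, __int128) the int leaves
   NGRN at 1, and the 16-byte-aligned __int128 then skips X1 and is
   passed in the even-numbered pair X2/X3. */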
5779
5780 /* If an argument with an SVE mode needs to be shifted up to the
5781 high part of the register, treat it as though it had an integer mode.
5782 Using the normal (parallel [...]) would suppress the shifting. */
5783 if (sve_p
5784 && BYTES_BIG_ENDIAN
5785 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
5786 && aarch64_pad_reg_upward (mode, type, false))
5787 {
5788 mode = int_mode_for_mode (mode).require ();
5789 sve_p = false;
5790 }
5791
5792 /* NREGS can be 0 when e.g. an empty structure is to be passed.
5793 A reg is still generated for it, but the caller should be smart
5794 enough not to use it. */
5795 if (nregs == 0
5796 || (nregs == 1 && !sve_p)
5797 || GET_MODE_CLASS (mode) == MODE_INT)
5798 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
5799 else
5800 {
5801 rtx par;
5802 int i;
5803
5804 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5805 for (i = 0; i < nregs; i++)
5806 {
5807 scalar_int_mode reg_mode = word_mode;
5808 if (nregs == 1)
5809 reg_mode = int_mode_for_mode (mode).require ();
5810 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
5811 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
5812 GEN_INT (i * UNITS_PER_WORD));
5813 XVECEXP (par, 0, i) = tmp;
5814 }
5815 pcum->aapcs_reg = par;
5816 }
5817
5818 pcum->aapcs_nextncrn = ncrn + nregs;
5819 return;
5820 }
5821
5822 /* C.11 */
5823 pcum->aapcs_nextncrn = NUM_ARG_REGS;
5824
5825 /* The argument is passed on the stack; record the needed number of words for
5826 this argument and align the total size if necessary. */
5827 on_stack:
5828 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
5829
5830 if (aarch64_function_arg_alignment (mode, type, &abi_break)
5831 == 16 * BITS_PER_UNIT)
5832 {
5833 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
5834 if (pcum->aapcs_stack_size != new_size)
5835 {
5836 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5837 inform (input_location, "parameter passing for argument of type "
5838 "%qT changed in GCC 9.1", type);
5839 pcum->aapcs_stack_size = new_size;
5840 }
5841 }
5842 return;
5843 }
5844
5845 /* Implement TARGET_FUNCTION_ARG. */
5846
5847 static rtx
5848 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5849 {
5850 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5851 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
5852 || pcum->pcs_variant == ARM_PCS_SIMD
5853 || pcum->pcs_variant == ARM_PCS_SVE);
5854
5855 if (arg.end_marker_p ())
5856 return gen_int_mode (pcum->pcs_variant, DImode);
5857
5858 aarch64_layout_arg (pcum_v, arg);
5859 return pcum->aapcs_reg;
5860 }
5861
5862 void
5863 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
5864 const_tree fntype,
5865 rtx libname ATTRIBUTE_UNUSED,
5866 const_tree fndecl ATTRIBUTE_UNUSED,
5867 unsigned n_named ATTRIBUTE_UNUSED,
5868 bool silent_p)
5869 {
5870 pcum->aapcs_ncrn = 0;
5871 pcum->aapcs_nvrn = 0;
5872 pcum->aapcs_nprn = 0;
5873 pcum->aapcs_nextncrn = 0;
5874 pcum->aapcs_nextnvrn = 0;
5875 pcum->aapcs_nextnprn = 0;
5876 if (fntype)
5877 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
5878 else
5879 pcum->pcs_variant = ARM_PCS_AAPCS64;
5880 pcum->aapcs_reg = NULL_RTX;
5881 pcum->aapcs_arg_processed = false;
5882 pcum->aapcs_stack_words = 0;
5883 pcum->aapcs_stack_size = 0;
5884 pcum->silent_p = silent_p;
5885
5886 if (!silent_p
5887 && !TARGET_FLOAT
5888 && fndecl && TREE_PUBLIC (fndecl)
5889 && fntype && fntype != error_mark_node)
5890 {
5891 const_tree type = TREE_TYPE (fntype);
5892 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
5893 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
5894 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5895 &mode, &nregs, NULL))
5896 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
5897 }
5898
5899 if (!silent_p
5900 && !TARGET_SVE
5901 && pcum->pcs_variant == ARM_PCS_SVE)
5902 {
5903 /* We can't gracefully recover at this point, so make this a
5904 fatal error. */
5905 if (fndecl)
5906 fatal_error (input_location, "%qE requires the SVE ISA extension",
5907 fndecl);
5908 else
5909 fatal_error (input_location, "calls to functions of type %qT require"
5910 " the SVE ISA extension", fntype);
5911 }
5912 }
5913
5914 static void
5915 aarch64_function_arg_advance (cumulative_args_t pcum_v,
5916 const function_arg_info &arg)
5917 {
5918 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5919 if (pcum->pcs_variant == ARM_PCS_AAPCS64
5920 || pcum->pcs_variant == ARM_PCS_SIMD
5921 || pcum->pcs_variant == ARM_PCS_SVE)
5922 {
5923 aarch64_layout_arg (pcum_v, arg);
5924 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
5925 != (pcum->aapcs_stack_words != 0));
5926 pcum->aapcs_arg_processed = false;
5927 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
5928 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
5929 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
5930 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
5931 pcum->aapcs_stack_words = 0;
5932 pcum->aapcs_reg = NULL_RTX;
5933 }
5934 }
5935
5936 bool
5937 aarch64_function_arg_regno_p (unsigned regno)
5938 {
5939 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
5940 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
5941 }
5942
5943 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
5944 PARM_BOUNDARY bits of alignment, but will be given anything up
5945 to STACK_BOUNDARY bits if the type requires it. This makes sure
5946 that both before and after the layout of each argument, the Next
5947 Stacked Argument Address (NSAA) will have a minimum alignment of
5948 8 bytes. */
5949
5950 static unsigned int
5951 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
5952 {
5953 bool abi_break;
5954 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
5955 &abi_break);
5956 if (abi_break && warn_psabi)
5957 inform (input_location, "parameter passing for argument of type "
5958 "%qT changed in GCC 9.1", type);
5959
5960 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
5961 }
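/* On AArch64, PARM_BOUNDARY is 64 bits and STACK_BOUNDARY is 128 bits,
   so, for example, a char argument is still given 64-bit alignment on
   the stack, while a type requesting 256-bit alignment is clamped to
   128 bits here. */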
5962
5963 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
5964
5965 static fixed_size_mode
5966 aarch64_get_reg_raw_mode (int regno)
5967 {
5968 if (TARGET_SVE && FP_REGNUM_P (regno))
5969 /* Don't use the SVE part of the register for __builtin_apply and
5970 __builtin_return. The SVE registers aren't used by the normal PCS,
5971 so using them there would be a waste of time. The PCS extensions
5972 for SVE types are fundamentally incompatible with the
5973 __builtin_return/__builtin_apply interface. */
5974 return as_a <fixed_size_mode> (V16QImode);
5975 return default_get_reg_raw_mode (regno);
5976 }
5977
5978 /* Implement TARGET_FUNCTION_ARG_PADDING.
5979
5980 Small aggregate types are placed in the lowest memory address.
5981
5982 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
5983
5984 static pad_direction
5985 aarch64_function_arg_padding (machine_mode mode, const_tree type)
5986 {
5987 /* On little-endian targets, the least significant byte of every stack
5988 argument is passed at the lowest byte address of the stack slot. */
5989 if (!BYTES_BIG_ENDIAN)
5990 return PAD_UPWARD;
5991
5992 /* Otherwise, integral, floating-point and pointer types are padded downward:
5993 the least significant byte of a stack argument is passed at the highest
5994 byte address of the stack slot. */
5995 if (type
5996 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
5997 || POINTER_TYPE_P (type))
5998 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
5999 return PAD_DOWNWARD;
6000
6001 /* Everything else padded upward, i.e. data in first byte of stack slot. */
6002 return PAD_UPWARD;
6003 }
6004
6005 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
6006
6007 It specifies padding for the last (and possibly the only)
6008 element of a block move between registers and memory. Assuming
6009 the block is in memory, padding upward means that the last
6010 element is padded after its most significant byte, while with
6011 downward padding the last element is padded on its least
6012 significant byte side.
6013
6014 Small aggregates and small complex types are always padded
6015 upwards.
6016
6017 We don't need to worry about homogeneous floating-point or
6018 short-vector aggregates; their move is not affected by the
6019 padding direction determined here. Regardless of endianness,
6020 each element of such an aggregate is put in the least
6021 significant bits of a fp/simd register.
6022
6023 Return !BYTES_BIG_ENDIAN if the least significant byte of the
6024 register has useful data, and return the opposite if the most
6025 significant byte does. */
6026
6027 bool
6028 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
6029 bool first ATTRIBUTE_UNUSED)
6030 {
6031
6032 /* Aside from pure scalable types, small composite types are always
6033 padded upward. */
6034 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
6035 {
6036 HOST_WIDE_INT size;
6037 if (type)
6038 size = int_size_in_bytes (type);
6039 else
6040 /* No frontends can create types with variable-sized modes, so we
6041 shouldn't be asked to pass or return them. */
6042 size = GET_MODE_SIZE (mode).to_constant ();
6043 if (size < 2 * UNITS_PER_WORD)
6044 {
6045 pure_scalable_type_info pst_info;
6046 if (pst_info.analyze_registers (type))
6047 return false;
6048 return true;
6049 }
6050 }
6051
6052 /* Otherwise, use the default padding. */
6053 return !BYTES_BIG_ENDIAN;
6054 }
6055
6056 static scalar_int_mode
6057 aarch64_libgcc_cmp_return_mode (void)
6058 {
6059 return SImode;
6060 }
6061
6062 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
6063
6064 /* We use the 12-bit shifted immediate arithmetic instructions so values
6065 must be a multiple of (1 << 12), i.e. 4096. */
6066 #define ARITH_FACTOR 4096
6067
6068 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
6069 #error Cannot use simple address calculation for stack probing
6070 #endif
6071
6072 /* The pair of scratch registers used for stack probing. */
6073 #define PROBE_STACK_FIRST_REG R9_REGNUM
6074 #define PROBE_STACK_SECOND_REG R10_REGNUM
6075
6076 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
6077 inclusive. These are offsets from the current stack pointer. */
6078
6079 static void
6080 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
6081 {
6082 HOST_WIDE_INT size;
6083 if (!poly_size.is_constant (&size))
6084 {
6085 sorry ("stack probes for SVE frames");
6086 return;
6087 }
6088
6089 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
6090
6091 /* See the same assertion on PROBE_INTERVAL above. */
6092 gcc_assert ((first % ARITH_FACTOR) == 0);
6093
6094 /* See if we have a constant small number of probes to generate. If so,
6095 that's the easy case. */
6096 if (size <= PROBE_INTERVAL)
6097 {
6098 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
6099
6100 emit_set_insn (reg1,
6101 plus_constant (Pmode,
6102 stack_pointer_rtx, -(first + base)));
6103 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
6104 }
6105
6106 /* The run-time loop is made up of 8 insns in the generic case while the
6107 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
6108 else if (size <= 4 * PROBE_INTERVAL)
6109 {
6110 HOST_WIDE_INT i, rem;
6111
6112 emit_set_insn (reg1,
6113 plus_constant (Pmode,
6114 stack_pointer_rtx,
6115 -(first + PROBE_INTERVAL)));
6116 emit_stack_probe (reg1);
6117
6118 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
6119 it exceeds SIZE. If only two probes are needed, this will not
6120 generate any code. Then probe at FIRST + SIZE. */
6121 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
6122 {
6123 emit_set_insn (reg1,
6124 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
6125 emit_stack_probe (reg1);
6126 }
6127
6128 rem = size - (i - PROBE_INTERVAL);
6129 if (rem > 256)
6130 {
6131 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6132
6133 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
6134 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
6135 }
6136 else
6137 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
6138 }
6139
6140 /* Otherwise, do the same as above, but in a loop. Note that we must be
6141 extra careful with variables wrapping around because we might be at
6142 the very top (or the very bottom) of the address space and we have
6143 to be able to handle this case properly; in particular, we use an
6144 equality test for the loop condition. */
6145 else
6146 {
6147 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
6148
6149 /* Step 1: round SIZE to the previous multiple of the interval. */
6150
6151 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
6152
6153
6154 /* Step 2: compute initial and final value of the loop counter. */
6155
6156 /* TEST_ADDR = SP + FIRST. */
6157 emit_set_insn (reg1,
6158 plus_constant (Pmode, stack_pointer_rtx, -first));
6159
6160 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
6161 HOST_WIDE_INT adjustment = - (first + rounded_size);
6162 if (! aarch64_uimm12_shift (adjustment))
6163 {
6164 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
6165 true, Pmode);
6166 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
6167 }
6168 else
6169 emit_set_insn (reg2,
6170 plus_constant (Pmode, stack_pointer_rtx, adjustment));
6171
6172 /* Step 3: the loop
6173
6174 do
6175 {
6176 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
6177 probe at TEST_ADDR
6178 }
6179 while (TEST_ADDR != LAST_ADDR)
6180
6181 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
6182 until it is equal to ROUNDED_SIZE. */
6183
6184 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
6185
6186
6187 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
6188 that SIZE is equal to ROUNDED_SIZE. */
6189
6190 if (size != rounded_size)
6191 {
6192 HOST_WIDE_INT rem = size - rounded_size;
6193
6194 if (rem > 256)
6195 {
6196 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6197
6198 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
6199 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
6200 }
6201 else
6202 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
6203 }
6204 }
6205
6206 /* Make sure nothing is scheduled before we are done. */
6207 emit_insn (gen_blockage ());
6208 }
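/* As a worked example (assuming a 4096-byte PROBE_INTERVAL), FIRST = 0
   and SIZE = 10000 takes the second branch above: probes are emitted at
   SP - 4096 and SP - 8192, the residual is 10000 - 8192 = 1808 bytes,
   and since that exceeds 256 the scratch register is stepped down by
   ROUND_UP (1808, 4096) = 4096 and the final probe lands at offset
   4096 - 1808 from it, i.e. exactly at SP - 10000. */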
6209
6210 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
6211 absolute addresses. */
6212
6213 const char *
6214 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
6215 {
6216 static int labelno = 0;
6217 char loop_lab[32];
6218 rtx xops[2];
6219
6220 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
6221
6222 /* Loop. */
6223 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
6224
6225 HOST_WIDE_INT stack_clash_probe_interval
6226 = 1 << param_stack_clash_protection_guard_size;
6227
6228 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
6229 xops[0] = reg1;
6230 HOST_WIDE_INT interval;
6231 if (flag_stack_clash_protection)
6232 interval = stack_clash_probe_interval;
6233 else
6234 interval = PROBE_INTERVAL;
6235
6236 gcc_assert (aarch64_uimm12_shift (interval));
6237 xops[1] = GEN_INT (interval);
6238
6239 output_asm_insn ("sub\t%0, %0, %1", xops);
6240
6241 /* If doing stack clash protection then we probe up by the ABI specified
6242 amount. We do this because we're dropping full pages at a time in the
6243 loop. But if we're doing non-stack clash probing, probe at SP 0. */
6244 if (flag_stack_clash_protection)
6245 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
6246 else
6247 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
6248
6249 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
6250 by this amount for each iteration. */
6251 output_asm_insn ("str\txzr, [%0, %1]", xops);
6252
6253 /* Test if TEST_ADDR == LAST_ADDR. */
6254 xops[1] = reg2;
6255 output_asm_insn ("cmp\t%0, %1", xops);
6256
6257 /* Branch. */
6258 fputs ("\tb.ne\t", asm_out_file);
6259 assemble_name_raw (asm_out_file, loop_lab);
6260 fputc ('\n', asm_out_file);
6261
6262 return "";
6263 }
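/* With stack clash protection disabled and the scratch registers chosen
   by aarch64_emit_probe_stack_range, the loop printed above comes out
   roughly as:

       .LPSRL0:
               sub     x9, x9, 4096
               str     xzr, [x9, 0]
               cmp     x9, x10
               b.ne    .LPSRL0  */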
6264
6265 /* Emit the probe loop for doing stack clash probes and stack adjustments for
6266 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
6267 of GUARD_SIZE. When a probe is emitted it is done at most
6268 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
6269 at most MIN_PROBE_THRESHOLD. By the end of this function
6270 BASE = BASE - ADJUSTMENT. */
6271
6272 const char *
6273 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
6274 rtx min_probe_threshold, rtx guard_size)
6275 {
6276 /* This function is not allowed to use any instruction generation function
6277 like gen_ and friends. If you do, you'll likely ICE during CFG validation,
6278 so instead emit the code you want using output_asm_insn. */
6279 gcc_assert (flag_stack_clash_protection);
6280 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
6281 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
6282
6283 /* The minimum required allocation before the residual requires probing. */
6284 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
6285
6286 /* Clamp the value down to the nearest value that can be used with a cmp. */
6287 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
6288 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
6289
6290 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
6291 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
6292
6293 static int labelno = 0;
6294 char loop_start_lab[32];
6295 char loop_end_lab[32];
6296 rtx xops[2];
6297
6298 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
6299 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
6300
6301 /* Emit loop start label. */
6302 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
6303
6304 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
6305 xops[0] = adjustment;
6306 xops[1] = probe_offset_value_rtx;
6307 output_asm_insn ("cmp\t%0, %1", xops);
6308
6309 /* Branch to end if not enough adjustment to probe. */
6310 fputs ("\tb.lt\t", asm_out_file);
6311 assemble_name_raw (asm_out_file, loop_end_lab);
6312 fputc ('\n', asm_out_file);
6313
6314 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
6315 xops[0] = base;
6316 xops[1] = probe_offset_value_rtx;
6317 output_asm_insn ("sub\t%0, %0, %1", xops);
6318
6319 /* Probe at BASE. */
6320 xops[1] = const0_rtx;
6321 output_asm_insn ("str\txzr, [%0, %1]", xops);
6322
6323 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
6324 xops[0] = adjustment;
6325 xops[1] = probe_offset_value_rtx;
6326 output_asm_insn ("sub\t%0, %0, %1", xops);
6327
6328 /* Branch to start if still more bytes to allocate. */
6329 fputs ("\tb\t", asm_out_file);
6330 assemble_name_raw (asm_out_file, loop_start_lab);
6331 fputc ('\n', asm_out_file);
6332
6333 /* No probe needed; leave the loop. */
6334 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
6335
6336 /* BASE = BASE - ADJUSTMENT. */
6337 xops[0] = base;
6338 xops[1] = adjustment;
6339 output_asm_insn ("sub\t%0, %0, %1", xops);
6340 return "";
6341 }
6342
6343 /* Determine whether a frame chain needs to be generated. */
6344 static bool
6345 aarch64_needs_frame_chain (void)
6346 {
6347 /* Force a frame chain for EH returns so the return address is at FP+8. */
6348 if (frame_pointer_needed || crtl->calls_eh_return)
6349 return true;
6350
6351 /* A leaf function cannot have calls or write LR. */
6352 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
6353
6354 /* Don't use a frame chain in leaf functions if leaf frame pointers
6355 are disabled. */
6356 if (flag_omit_leaf_frame_pointer && is_leaf)
6357 return false;
6358
6359 return aarch64_use_frame_pointer;
6360 }
6361
6362 /* Mark the registers that need to be saved by the callee and calculate
6363 the size of the callee-saved registers area and frame record (both FP
6364 and LR may be omitted). */
6365 static void
6366 aarch64_layout_frame (void)
6367 {
6368 poly_int64 offset = 0;
6369 int regno, last_fp_reg = INVALID_REGNUM;
6370 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
6371 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
6372 bool frame_related_fp_reg_p = false;
6373 aarch64_frame &frame = cfun->machine->frame;
6374
6375 frame.emit_frame_chain = aarch64_needs_frame_chain ();
6376
6377 /* Adjust the outgoing arguments size if required. Keep it in sync with what
6378 the mid-end is doing. */
6379 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
6380
6381 #define SLOT_NOT_REQUIRED (-2)
6382 #define SLOT_REQUIRED (-1)
6383
6384 frame.wb_candidate1 = INVALID_REGNUM;
6385 frame.wb_candidate2 = INVALID_REGNUM;
6386 frame.spare_pred_reg = INVALID_REGNUM;
6387
6388 /* First mark all the registers that really need to be saved... */
6389 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6390 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
6391
6392 /* ... that includes the eh data registers (if needed)... */
6393 if (crtl->calls_eh_return)
6394 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
6395 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
6396
6397 /* ... and any callee saved register that dataflow says is live. */
6398 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6399 if (df_regs_ever_live_p (regno)
6400 && !fixed_regs[regno]
6401 && (regno == R30_REGNUM
6402 || !crtl->abi->clobbers_full_reg_p (regno)))
6403 frame.reg_offset[regno] = SLOT_REQUIRED;
6404
6405 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6406 if (df_regs_ever_live_p (regno)
6407 && !fixed_regs[regno]
6408 && !crtl->abi->clobbers_full_reg_p (regno))
6409 {
6410 frame.reg_offset[regno] = SLOT_REQUIRED;
6411 last_fp_reg = regno;
6412 if (aarch64_emit_cfi_for_reg_p (regno))
6413 frame_related_fp_reg_p = true;
6414 }
6415
6416 /* Big-endian SVE frames need a spare predicate register in order
6417 to save Z8-Z15. Decide which register they should use. Prefer
6418 an unused argument register if possible, so that we don't force P4
6419 to be saved unnecessarily. */
6420 if (frame_related_fp_reg_p
6421 && crtl->abi->id () == ARM_PCS_SVE
6422 && BYTES_BIG_ENDIAN)
6423 {
6424 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
6425 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
6426 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
6427 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
6428 break;
6429 gcc_assert (regno <= P7_REGNUM);
6430 frame.spare_pred_reg = regno;
6431 df_set_regs_ever_live (regno, true);
6432 }
6433
6434 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6435 if (df_regs_ever_live_p (regno)
6436 && !fixed_regs[regno]
6437 && !crtl->abi->clobbers_full_reg_p (regno))
6438 frame.reg_offset[regno] = SLOT_REQUIRED;
6439
6440 /* With stack-clash, LR must be saved in non-leaf functions. */
6441 gcc_assert (crtl->is_leaf
6442 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
6443
6444 /* Now assign stack slots for the registers. Start with the predicate
6445 registers, since predicate LDR and STR have a relatively small
6446 offset range. These saves happen below the hard frame pointer. */
6447 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6448 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6449 {
6450 frame.reg_offset[regno] = offset;
6451 offset += BYTES_PER_SVE_PRED;
6452 }
6453
6454 if (maybe_ne (offset, 0))
6455 {
6456 /* If we have any vector registers to save above the predicate registers,
6457 the offset of the vector register save slots need to be a multiple
6458 of the vector size. This lets us use the immediate forms of LDR/STR
6459 (or LD1/ST1 for big-endian).
6460
6461 A vector register is 8 times the size of a predicate register,
6462 and we need to save a maximum of 12 predicate registers, so the
6463 first vector register will be at either #1, MUL VL or #2, MUL VL.
6464
6465 If we don't have any vector registers to save, and we know how
6466 big the predicate save area is, we can just round it up to the
6467 next 16-byte boundary. */
6468 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
6469 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6470 else
6471 {
6472 if (known_le (offset, vector_save_size))
6473 offset = vector_save_size;
6474 else if (known_le (offset, vector_save_size * 2))
6475 offset = vector_save_size * 2;
6476 else
6477 gcc_unreachable ();
6478 }
6479 }
6480
6481 /* If we need to save any SVE vector registers, add them next. */
6482 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6483 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6484 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6485 {
6486 frame.reg_offset[regno] = offset;
6487 offset += vector_save_size;
6488 }
6489
6490 /* OFFSET is now the offset of the hard frame pointer from the bottom
6491 of the callee save area. */
6492 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6493 frame.below_hard_fp_saved_regs_size = offset;
6494 if (frame.emit_frame_chain)
6495 {
6496 /* FP and LR are placed in the linkage record. */
6497 frame.reg_offset[R29_REGNUM] = offset;
6498 frame.wb_candidate1 = R29_REGNUM;
6499 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
6500 frame.wb_candidate2 = R30_REGNUM;
6501 offset += 2 * UNITS_PER_WORD;
6502 }
6503
6504 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6505 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6506 {
6507 frame.reg_offset[regno] = offset;
6508 if (frame.wb_candidate1 == INVALID_REGNUM)
6509 frame.wb_candidate1 = regno;
6510 else if (frame.wb_candidate2 == INVALID_REGNUM)
6511 frame.wb_candidate2 = regno;
6512 offset += UNITS_PER_WORD;
6513 }
6514
6515 poly_int64 max_int_offset = offset;
6516 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6517 bool has_align_gap = maybe_ne (offset, max_int_offset);
6518
6519 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6520 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6521 {
6522 /* If there is an alignment gap between integer and fp callee-saves,
6523 allocate the last fp register to it if possible. */
6524 if (regno == last_fp_reg
6525 && has_align_gap
6526 && known_eq (vector_save_size, 8)
6527 && multiple_p (offset, 16))
6528 {
6529 frame.reg_offset[regno] = max_int_offset;
6530 break;
6531 }
6532
6533 frame.reg_offset[regno] = offset;
6534 if (frame.wb_candidate1 == INVALID_REGNUM)
6535 frame.wb_candidate1 = regno;
6536 else if (frame.wb_candidate2 == INVALID_REGNUM
6537 && frame.wb_candidate1 >= V0_REGNUM)
6538 frame.wb_candidate2 = regno;
6539 offset += vector_save_size;
6540 }
6541
6542 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6543
6544 frame.saved_regs_size = offset;
6545
6546 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
6547
6548 poly_int64 above_outgoing_args
6549 = aligned_upper_bound (varargs_and_saved_regs_size
6550 + get_frame_size (),
6551 STACK_BOUNDARY / BITS_PER_UNIT);
6552
6553 frame.hard_fp_offset
6554 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6555
6556 /* Both these values are already aligned. */
6557 gcc_assert (multiple_p (crtl->outgoing_args_size,
6558 STACK_BOUNDARY / BITS_PER_UNIT));
6559 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
6560
6561 frame.locals_offset = frame.saved_varargs_size;
6562
6563 frame.initial_adjust = 0;
6564 frame.final_adjust = 0;
6565 frame.callee_adjust = 0;
6566 frame.sve_callee_adjust = 0;
6567 frame.callee_offset = 0;
6568
6569 HOST_WIDE_INT max_push_offset = 0;
6570 if (frame.wb_candidate2 != INVALID_REGNUM)
6571 max_push_offset = 512;
6572 else if (frame.wb_candidate1 != INVALID_REGNUM)
6573 max_push_offset = 256;
6574
6575 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
6576 HOST_WIDE_INT const_saved_regs_size;
6577 if (frame.frame_size.is_constant (&const_size)
6578 && const_size < max_push_offset
6579 && known_eq (frame.hard_fp_offset, const_size))
6580 {
6581 /* Simple, small frame with no outgoing arguments:
6582
6583 stp reg1, reg2, [sp, -frame_size]!
6584 stp reg3, reg4, [sp, 16] */
6585 frame.callee_adjust = const_size;
6586 }
6587 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
6588 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6589 && const_outgoing_args_size + const_saved_regs_size < 512
6590 /* We could handle this case even with outgoing args, provided
6591 that the number of args left us with valid offsets for all
6592 predicate and vector save slots. It's such a rare case that
6593 it hardly seems worth the effort though. */
6594 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
6595 && !(cfun->calls_alloca
6596 && frame.hard_fp_offset.is_constant (&const_fp_offset)
6597 && const_fp_offset < max_push_offset))
6598 {
6599 /* Frame with small outgoing arguments:
6600
6601 sub sp, sp, frame_size
6602 stp reg1, reg2, [sp, outgoing_args_size]
6603 stp reg3, reg4, [sp, outgoing_args_size + 16] */
6604 frame.initial_adjust = frame.frame_size;
6605 frame.callee_offset = const_outgoing_args_size;
6606 }
6607 else if (saves_below_hard_fp_p
6608 && known_eq (frame.saved_regs_size,
6609 frame.below_hard_fp_saved_regs_size))
6610 {
6611 /* Frame in which all saves are SVE saves:
6612
6613 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6614 save SVE registers relative to SP
6615 sub sp, sp, outgoing_args_size */
6616 frame.initial_adjust = (frame.hard_fp_offset
6617 + frame.below_hard_fp_saved_regs_size);
6618 frame.final_adjust = crtl->outgoing_args_size;
6619 }
6620 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6621 && const_fp_offset < max_push_offset)
6622 {
6623 /* Frame with large outgoing arguments or SVE saves, but with
6624 a small local area:
6625
6626 stp reg1, reg2, [sp, -hard_fp_offset]!
6627 stp reg3, reg4, [sp, 16]
6628 [sub sp, sp, below_hard_fp_saved_regs_size]
6629 [save SVE registers relative to SP]
6630 sub sp, sp, outgoing_args_size */
6631 frame.callee_adjust = const_fp_offset;
6632 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6633 frame.final_adjust = crtl->outgoing_args_size;
6634 }
6635 else
6636 {
6637 /* Frame with large local area and outgoing arguments or SVE saves,
6638 using frame pointer:
6639
6640 sub sp, sp, hard_fp_offset
6641 stp x29, x30, [sp, 0]
6642 add x29, sp, 0
6643 stp reg3, reg4, [sp, 16]
6644 [sub sp, sp, below_hard_fp_saved_regs_size]
6645 [save SVE registers relative to SP]
6646 sub sp, sp, outgoing_args_size */
6647 frame.initial_adjust = frame.hard_fp_offset;
6648 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6649 frame.final_adjust = crtl->outgoing_args_size;
6650 }
6651
6652 /* Make sure the individual adjustments add up to the full frame size. */
6653 gcc_assert (known_eq (frame.initial_adjust
6654 + frame.callee_adjust
6655 + frame.sve_callee_adjust
6656 + frame.final_adjust, frame.frame_size));
6657
6658 frame.laid_out = true;
6659 }
6660
6661 /* Return true if the register REGNO is saved on entry to
6662 the current function. */
6663
6664 static bool
6665 aarch64_register_saved_on_entry (int regno)
6666 {
6667 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6668 }
6669
6670 /* Return the next register from REGNO up to LIMIT that the callee
6671 needs to save. */
6672
6673 static unsigned
6674 aarch64_next_callee_save (unsigned regno, unsigned limit)
6675 {
6676 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6677 regno++;
6678 return regno;
6679 }
6680
6681 /* Push the register number REGNO of mode MODE to the stack with write-back
6682 adjusting the stack by ADJUSTMENT. */
6683
6684 static void
6685 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6686 HOST_WIDE_INT adjustment)
6687 {
6688 rtx base_rtx = stack_pointer_rtx;
6689 rtx insn, reg, mem;
6690
6691 reg = gen_rtx_REG (mode, regno);
6692 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
6693 plus_constant (Pmode, base_rtx, -adjustment));
6694 mem = gen_frame_mem (mode, mem);
6695
6696 insn = emit_move_insn (mem, reg);
6697 RTX_FRAME_RELATED_P (insn) = 1;
6698 }
6699
6700 /* Generate and return an instruction to store the pair of registers
6701 REG and REG2 of mode MODE to location BASE with write-back adjusting
6702 the stack location BASE by ADJUSTMENT. */
6703
6704 static rtx
6705 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6706 HOST_WIDE_INT adjustment)
6707 {
6708 switch (mode)
6709 {
6710 case E_DImode:
6711 return gen_storewb_pairdi_di (base, base, reg, reg2,
6712 GEN_INT (-adjustment),
6713 GEN_INT (UNITS_PER_WORD - adjustment));
6714 case E_DFmode:
6715 return gen_storewb_pairdf_di (base, base, reg, reg2,
6716 GEN_INT (-adjustment),
6717 GEN_INT (UNITS_PER_WORD - adjustment));
6718 case E_TFmode:
6719 return gen_storewb_pairtf_di (base, base, reg, reg2,
6720 GEN_INT (-adjustment),
6721 GEN_INT (UNITS_PER_VREG - adjustment));
6722 default:
6723 gcc_unreachable ();
6724 }
6725 }
6726
6727 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6728 stack pointer by ADJUSTMENT. */
6729
6730 static void
6731 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
6732 {
6733 rtx_insn *insn;
6734 machine_mode mode = aarch64_reg_save_mode (regno1);
6735
6736 if (regno2 == INVALID_REGNUM)
6737 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
6738
6739 rtx reg1 = gen_rtx_REG (mode, regno1);
6740 rtx reg2 = gen_rtx_REG (mode, regno2);
6741
6742 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
6743 reg2, adjustment));
6744 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
6745 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6746 RTX_FRAME_RELATED_P (insn) = 1;
6747 }
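/* For example, pushing the frame record with an adjustment of 96 bytes
   emits a store pair with write-back along the lines of:

       stp     x29, x30, [sp, -96]!  */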
6748
6749 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
6750 adjusting it by ADJUSTMENT afterwards. */
6751
6752 static rtx
6753 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6754 HOST_WIDE_INT adjustment)
6755 {
6756 switch (mode)
6757 {
6758 case E_DImode:
6759 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
6760 GEN_INT (UNITS_PER_WORD));
6761 case E_DFmode:
6762 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
6763 GEN_INT (UNITS_PER_WORD));
6764 case E_TFmode:
6765 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
6766 GEN_INT (UNITS_PER_VREG));
6767 default:
6768 gcc_unreachable ();
6769 }
6770 }
6771
6772 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6773 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6774 into CFI_OPS. */
6775
6776 static void
6777 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
6778 rtx *cfi_ops)
6779 {
6780 machine_mode mode = aarch64_reg_save_mode (regno1);
6781 rtx reg1 = gen_rtx_REG (mode, regno1);
6782
6783 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
6784
6785 if (regno2 == INVALID_REGNUM)
6786 {
6787 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
6788 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
6789 emit_move_insn (reg1, gen_frame_mem (mode, mem));
6790 }
6791 else
6792 {
6793 rtx reg2 = gen_rtx_REG (mode, regno2);
6794 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6795 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
6796 reg2, adjustment));
6797 }
6798 }
6799
6800 /* Generate and return a store pair instruction of mode MODE to store
6801 register REG1 to MEM1 and register REG2 to MEM2. */
6802
6803 static rtx
6804 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
6805 rtx reg2)
6806 {
6807 switch (mode)
6808 {
6809 case E_DImode:
6810 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
6811
6812 case E_DFmode:
6813 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
6814
6815 case E_TFmode:
6816 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
6817
6818 default:
6819 gcc_unreachable ();
6820 }
6821 }
6822
6823 /* Generate and return a load pair instruction of mode MODE to load register
6824 REG1 from MEM1 and register REG2 from MEM2. */
6825
6826 static rtx
6827 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
6828 rtx mem2)
6829 {
6830 switch (mode)
6831 {
6832 case E_DImode:
6833 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
6834
6835 case E_DFmode:
6836 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
6837
6838 case E_TFmode:
6839 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
6840
6841 default:
6842 gcc_unreachable ();
6843 }
6844 }
6845
6846 /* Return TRUE if return address signing should be enabled for the current
6847 function, otherwise return FALSE. */
6848
6849 bool
6850 aarch64_return_address_signing_enabled (void)
6851 {
6852 /* This function should only be called after the frame has been laid out. */
6853 gcc_assert (cfun->machine->frame.laid_out);
6854
6855 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we sign a leaf function
6856 only if its LR is pushed onto the stack. */
6857 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
6858 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
6859 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
6860 }
6861
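/* A rough mapping, for reference (the authoritative handling is in the
   option-parsing code elsewhere in this file): -mbranch-protection=pac-ret
   selects AARCH64_FUNCTION_NON_LEAF, while -mbranch-protection=pac-ret+leaf
   selects AARCH64_FUNCTION_ALL.  */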
6862 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
6863 bool
6864 aarch64_bti_enabled (void)
6865 {
6866 return (aarch64_enable_bti == 1);
6867 }
6868
6869 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6870 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6871 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6872
6873 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6874 or LD1D address
6875
6876 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
6877 if the variable isn't already nonnull
6878
6879 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6880 Handle this case using a temporary base register that is suitable for
6881 all offsets in that range. Use ANCHOR_REG as this base register if it
6882 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
6883
6884 static inline void
6885 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
6886 rtx &anchor_reg, poly_int64 &offset,
6887 rtx &ptrue)
6888 {
6889 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
6890 {
6891 /* This is the maximum valid offset of the anchor from the base.
6892 Lower values would be valid too. */
6893 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
6894 if (!anchor_reg)
6895 {
6896 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6897 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6898 gen_int_mode (anchor_offset, Pmode)));
6899 }
6900 base_rtx = anchor_reg;
6901 offset -= anchor_offset;
6902 }
6903 if (!ptrue)
6904 {
6905 int pred_reg = cfun->machine->frame.spare_pred_reg;
6906 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
6907 CONSTM1_RTX (VNx16BImode));
6908 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
6909 }
6910 }
6911
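/* Illustrative shape of what the adjustment above enables (details vary
   with the mode and offset): once the anchor register and the spare
   predicate have been set up, the caller can emit a single predicated
   store or load such as

	ptrue	p4.b, all
	st1d	z8.d, p4, [x11, #-4, mul vl]

   where x11 is the anchor register and the immediate is back within the
   signed 4-bit "mul vl" range that ST1D/LD1D accept.  */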
6912 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6913 is saved at BASE + OFFSET. */
6914
6915 static void
6916 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
6917 rtx base, poly_int64 offset)
6918 {
6919 rtx mem = gen_frame_mem (GET_MODE (reg),
6920 plus_constant (Pmode, base, offset));
6921 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
6922 }
6923
6924 /* Emit code to save the callee-saved registers from register number START
6925 to LIMIT to the stack at the location starting at offset START_OFFSET,
6926 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
6927 is true if the hard frame pointer has been set up. */
6928
6929 static void
6930 aarch64_save_callee_saves (poly_int64 start_offset,
6931 unsigned start, unsigned limit, bool skip_wb,
6932 bool hard_fp_valid_p)
6933 {
6934 rtx_insn *insn;
6935 unsigned regno;
6936 unsigned regno2;
6937 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6938
6939 for (regno = aarch64_next_callee_save (start, limit);
6940 regno <= limit;
6941 regno = aarch64_next_callee_save (regno + 1, limit))
6942 {
6943 rtx reg, mem;
6944 poly_int64 offset;
6945 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6946
6947 if (skip_wb
6948 && (regno == cfun->machine->frame.wb_candidate1
6949 || regno == cfun->machine->frame.wb_candidate2))
6950 continue;
6951
6952 if (cfun->machine->reg_is_wrapped_separately[regno])
6953 continue;
6954
6955 machine_mode mode = aarch64_reg_save_mode (regno);
6956 reg = gen_rtx_REG (mode, regno);
6957 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6958 rtx base_rtx = stack_pointer_rtx;
6959 poly_int64 sp_offset = offset;
6960
6961 HOST_WIDE_INT const_offset;
6962 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6963 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6964 offset, ptrue);
6965 else if (GP_REGNUM_P (regno)
6966 && (!offset.is_constant (&const_offset) || const_offset >= 512))
6967 {
6968 gcc_assert (known_eq (start_offset, 0));
6969 poly_int64 fp_offset
6970 = cfun->machine->frame.below_hard_fp_saved_regs_size;
6971 if (hard_fp_valid_p)
6972 base_rtx = hard_frame_pointer_rtx;
6973 else
6974 {
6975 if (!anchor_reg)
6976 {
6977 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6978 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6979 gen_int_mode (fp_offset, Pmode)));
6980 }
6981 base_rtx = anchor_reg;
6982 }
6983 offset -= fp_offset;
6984 }
6985 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6986 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
6987
6988 if (!aarch64_sve_mode_p (mode)
6989 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6990 && !cfun->machine->reg_is_wrapped_separately[regno2]
6991 && known_eq (GET_MODE_SIZE (mode),
6992 cfun->machine->frame.reg_offset[regno2]
6993 - cfun->machine->frame.reg_offset[regno]))
6994 {
6995 rtx reg2 = gen_rtx_REG (mode, regno2);
6996 rtx mem2;
6997
6998 offset += GET_MODE_SIZE (mode);
6999 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7000 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
7001 reg2));
7002
7003 /* The first part of a frame-related parallel insn is
7004 always assumed to be relevant to the frame
7005 calculations; subsequent parts are only
7006 frame-related if explicitly marked. */
7007 if (aarch64_emit_cfi_for_reg_p (regno2))
7008 {
7009 if (need_cfa_note_p)
7010 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
7011 sp_offset + GET_MODE_SIZE (mode));
7012 else
7013 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7014 }
7015
7016 regno = regno2;
7017 }
7018 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7019 {
7020 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
7021 need_cfa_note_p = true;
7022 }
7023 else if (aarch64_sve_mode_p (mode))
7024 insn = emit_insn (gen_rtx_SET (mem, reg));
7025 else
7026 insn = emit_move_insn (mem, reg);
7027
7028 RTX_FRAME_RELATED_P (insn) = frame_related_p;
7029 if (frame_related_p && need_cfa_note_p)
7030 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
7031 }
7032 }
7033
7034 /* Emit code to restore the callee registers from register number START
7035 up to and including LIMIT. Restore from the stack offset START_OFFSET,
7036 skipping any write-back candidates if SKIP_WB is true. Write the
7037 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
7038
7039 static void
7040 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
7041 unsigned limit, bool skip_wb, rtx *cfi_ops)
7042 {
7043 unsigned regno;
7044 unsigned regno2;
7045 poly_int64 offset;
7046 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7047
7048 for (regno = aarch64_next_callee_save (start, limit);
7049 regno <= limit;
7050 regno = aarch64_next_callee_save (regno + 1, limit))
7051 {
7052 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7053 if (cfun->machine->reg_is_wrapped_separately[regno])
7054 continue;
7055
7056 rtx reg, mem;
7057
7058 if (skip_wb
7059 && (regno == cfun->machine->frame.wb_candidate1
7060 || regno == cfun->machine->frame.wb_candidate2))
7061 continue;
7062
7063 machine_mode mode = aarch64_reg_save_mode (regno);
7064 reg = gen_rtx_REG (mode, regno);
7065 offset = start_offset + cfun->machine->frame.reg_offset[regno];
7066 rtx base_rtx = stack_pointer_rtx;
7067 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7068 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7069 offset, ptrue);
7070 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7071
7072 if (!aarch64_sve_mode_p (mode)
7073 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7074 && !cfun->machine->reg_is_wrapped_separately[regno2]
7075 && known_eq (GET_MODE_SIZE (mode),
7076 cfun->machine->frame.reg_offset[regno2]
7077 - cfun->machine->frame.reg_offset[regno]))
7078 {
7079 rtx reg2 = gen_rtx_REG (mode, regno2);
7080 rtx mem2;
7081
7082 offset += GET_MODE_SIZE (mode);
7083 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7084 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7085
7086 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7087 regno = regno2;
7088 }
7089 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7090 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
7091 else if (aarch64_sve_mode_p (mode))
7092 emit_insn (gen_rtx_SET (reg, mem));
7093 else
7094 emit_move_insn (reg, mem);
7095 if (frame_related_p)
7096 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
7097 }
7098 }
7099
7100 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
7101 of MODE. */
7102
7103 static inline bool
7104 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7105 {
7106 HOST_WIDE_INT multiple;
7107 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7108 && IN_RANGE (multiple, -8, 7));
7109 }
7110
7111 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
7112 of MODE. */
7113
7114 static inline bool
7115 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7116 {
7117 HOST_WIDE_INT multiple;
7118 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7119 && IN_RANGE (multiple, 0, 63));
7120 }
7121
7122 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
7123 of MODE. */
7124
7125 bool
7126 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7127 {
7128 HOST_WIDE_INT multiple;
7129 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7130 && IN_RANGE (multiple, -64, 63));
7131 }
7132
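/* For example, with MODE == DImode (8-byte units) this accepts byte offsets
   in the range [-512, 504] that are multiples of 8, matching the immediate
   range of LDP/STP with X registers.  */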
7133 /* Return true if OFFSET is a signed 9-bit value. */
7134
7135 bool
7136 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
7137 poly_int64 offset)
7138 {
7139 HOST_WIDE_INT const_offset;
7140 return (offset.is_constant (&const_offset)
7141 && IN_RANGE (const_offset, -256, 255));
7142 }
7143
7144 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
7145 of MODE. */
7146
7147 static inline bool
7148 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7149 {
7150 HOST_WIDE_INT multiple;
7151 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7152 && IN_RANGE (multiple, -256, 255));
7153 }
7154
7155 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
7156 of MODE. */
7157
7158 static inline bool
7159 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7160 {
7161 HOST_WIDE_INT multiple;
7162 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7163 && IN_RANGE (multiple, 0, 4095));
7164 }
7165
7166 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
7167
7168 static sbitmap
7169 aarch64_get_separate_components (void)
7170 {
7171 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7172 bitmap_clear (components);
7173
7174 /* The registers we need saved to the frame. */
7175 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7176 if (aarch64_register_saved_on_entry (regno))
7177 {
7178 /* Punt on saves and restores that use ST1D and LD1D. We could
7179 try to be smarter, but it would involve making sure that the
7180 spare predicate register itself is safe to use at the save
7181 and restore points. Also, when a frame pointer is being used,
7182 the slots are often out of reach of ST1D and LD1D anyway. */
7183 machine_mode mode = aarch64_reg_save_mode (regno);
7184 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7185 continue;
7186
7187 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7188
7189 /* If the register is saved in the first SVE save slot, we use
7190 it as a stack probe for -fstack-clash-protection. */
7191 if (flag_stack_clash_protection
7192 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
7193 && known_eq (offset, 0))
7194 continue;
7195
7196 /* Get the offset relative to the register we'll use. */
7197 if (frame_pointer_needed)
7198 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7199 else
7200 offset += crtl->outgoing_args_size;
7201
7202 /* Check that we can access the stack slot of the register with one
7203 direct load with no adjustments needed. */
7204 if (aarch64_sve_mode_p (mode)
7205 ? offset_9bit_signed_scaled_p (mode, offset)
7206 : offset_12bit_unsigned_scaled_p (mode, offset))
7207 bitmap_set_bit (components, regno);
7208 }
7209
7210 /* Don't mess with the hard frame pointer. */
7211 if (frame_pointer_needed)
7212 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
7213
7214 /* If the spare predicate register used by big-endian SVE code
7215 is call-preserved, it must be saved in the main prologue
7216 before any saves that use it. */
7217 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
7218 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
7219
7220 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7221 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7222 /* If registers have been chosen to be stored/restored with
7223 writeback, don't interfere with them to avoid having to output explicit
7224 stack adjustment instructions. */
7225 if (reg2 != INVALID_REGNUM)
7226 bitmap_clear_bit (components, reg2);
7227 if (reg1 != INVALID_REGNUM)
7228 bitmap_clear_bit (components, reg1);
7229
7230 bitmap_clear_bit (components, LR_REGNUM);
7231 bitmap_clear_bit (components, SP_REGNUM);
7232
7233 return components;
7234 }
7235
7236 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
7237
7238 static sbitmap
7239 aarch64_components_for_bb (basic_block bb)
7240 {
7241 bitmap in = DF_LIVE_IN (bb);
7242 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
7243 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
7244
7245 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7246 bitmap_clear (components);
7247
7248 /* Clobbered registers don't generate values in any meaningful sense,
7249 since nothing after the clobber can rely on their value. And we can't
7250 say that partially-clobbered registers are unconditionally killed,
7251 because whether they're killed or not depends on the mode of the
7252 value they're holding. Thus partially call-clobbered registers
7253 appear in neither the kill set nor the gen set.
7254
7255 Check manually for any calls that clobber more of a register than the
7256 current function can. */
7257 function_abi_aggregator callee_abis;
7258 rtx_insn *insn;
7259 FOR_BB_INSNS (bb, insn)
7260 if (CALL_P (insn))
7261 callee_abis.note_callee_abi (insn_callee_abi (insn));
7262 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
7263
7264 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
7265 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7266 if (!fixed_regs[regno]
7267 && !crtl->abi->clobbers_full_reg_p (regno)
7268 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
7269 || bitmap_bit_p (in, regno)
7270 || bitmap_bit_p (gen, regno)
7271 || bitmap_bit_p (kill, regno)))
7272 {
7273 bitmap_set_bit (components, regno);
7274
7275 /* If there is a callee-save at an adjacent offset, add it too
7276 to increase the use of LDP/STP. */
7277 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7278 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
7279
7280 if (regno2 <= LAST_SAVED_REGNUM)
7281 {
7282 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7283 if (regno < regno2
7284 ? known_eq (offset + 8, offset2)
7285 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
7286 bitmap_set_bit (components, regno2);
7287 }
7288 }
7289
7290 return components;
7291 }
7292
7293 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
7294 Nothing to do for aarch64. */
7295
7296 static void
7297 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
7298 {
7299 }
7300
7301 /* Return the next set bit in BMP from START onwards. Return the total number
7302 of bits in BMP if no set bit is found at or after START. */
7303
7304 static unsigned int
7305 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
7306 {
7307 unsigned int nbits = SBITMAP_SIZE (bmp);
7308 if (start == nbits)
7309 return start;
7310
7311 gcc_assert (start < nbits);
7312 for (unsigned int i = start; i < nbits; i++)
7313 if (bitmap_bit_p (bmp, i))
7314 return i;
7315
7316 return nbits;
7317 }
7318
7319 /* Do the work for aarch64_emit_prologue_components and
7320 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
7321 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
7322 for these components or the epilogue sequence. That is, it determines
7323 whether we should emit stores or loads and what kind of CFA notes to attach
7324 to the insns. Otherwise the logic for the two sequences is very
7325 similar. */
7326
7327 static void
7328 aarch64_process_components (sbitmap components, bool prologue_p)
7329 {
7330 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
7331 ? HARD_FRAME_POINTER_REGNUM
7332 : STACK_POINTER_REGNUM);
7333
7334 unsigned last_regno = SBITMAP_SIZE (components);
7335 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
7336 rtx_insn *insn = NULL;
7337
7338 while (regno != last_regno)
7339 {
7340 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7341 machine_mode mode = aarch64_reg_save_mode (regno);
7342
7343 rtx reg = gen_rtx_REG (mode, regno);
7344 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7345 if (frame_pointer_needed)
7346 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7347 else
7348 offset += crtl->outgoing_args_size;
7349
7350 rtx addr = plus_constant (Pmode, ptr_reg, offset);
7351 rtx mem = gen_frame_mem (mode, addr);
7352
7353 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
7354 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
7355 /* No more registers to handle after REGNO.
7356 Emit a single save/restore and exit. */
7357 if (regno2 == last_regno)
7358 {
7359 insn = emit_insn (set);
7360 if (frame_related_p)
7361 {
7362 RTX_FRAME_RELATED_P (insn) = 1;
7363 if (prologue_p)
7364 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7365 else
7366 add_reg_note (insn, REG_CFA_RESTORE, reg);
7367 }
7368 break;
7369 }
7370
7371 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7372 /* The next register is not of the same class or its offset is not
7373 mergeable with the current one into a pair. */
7374 if (aarch64_sve_mode_p (mode)
7375 || !satisfies_constraint_Ump (mem)
7376 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
7377 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
7378 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
7379 GET_MODE_SIZE (mode)))
7380 {
7381 insn = emit_insn (set);
7382 if (frame_related_p)
7383 {
7384 RTX_FRAME_RELATED_P (insn) = 1;
7385 if (prologue_p)
7386 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7387 else
7388 add_reg_note (insn, REG_CFA_RESTORE, reg);
7389 }
7390
7391 regno = regno2;
7392 continue;
7393 }
7394
7395 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
7396
7397 /* REGNO2 can be saved/restored in a pair with REGNO. */
7398 rtx reg2 = gen_rtx_REG (mode, regno2);
7399 if (frame_pointer_needed)
7400 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7401 else
7402 offset2 += crtl->outgoing_args_size;
7403 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
7404 rtx mem2 = gen_frame_mem (mode, addr2);
7405 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
7406 : gen_rtx_SET (reg2, mem2);
7407
7408 if (prologue_p)
7409 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
7410 else
7411 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7412
7413 if (frame_related_p || frame_related2_p)
7414 {
7415 RTX_FRAME_RELATED_P (insn) = 1;
7416 if (prologue_p)
7417 {
7418 if (frame_related_p)
7419 add_reg_note (insn, REG_CFA_OFFSET, set);
7420 if (frame_related2_p)
7421 add_reg_note (insn, REG_CFA_OFFSET, set2);
7422 }
7423 else
7424 {
7425 if (frame_related_p)
7426 add_reg_note (insn, REG_CFA_RESTORE, reg);
7427 if (frame_related2_p)
7428 add_reg_note (insn, REG_CFA_RESTORE, reg2);
7429 }
7430 }
7431
7432 regno = aarch64_get_next_set_bit (components, regno2 + 1);
7433 }
7434 }
7435
7436 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
7437
7438 static void
7439 aarch64_emit_prologue_components (sbitmap components)
7440 {
7441 aarch64_process_components (components, true);
7442 }
7443
7444 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
7445
7446 static void
7447 aarch64_emit_epilogue_components (sbitmap components)
7448 {
7449 aarch64_process_components (components, false);
7450 }
7451
7452 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
7453
7454 static void
7455 aarch64_set_handled_components (sbitmap components)
7456 {
7457 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7458 if (bitmap_bit_p (components, regno))
7459 cfun->machine->reg_is_wrapped_separately[regno] = true;
7460 }
7461
7462 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
7463 determine the probe offset for alloca. */
7464
7465 static HOST_WIDE_INT
7466 aarch64_stack_clash_protection_alloca_probe_range (void)
7467 {
7468 return STACK_CLASH_CALLER_GUARD;
7469 }
7470
7471
7472 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7473 registers. If POLY_SIZE is not large enough to require a probe this function
7474 will only adjust the stack. When allocating the stack space
7475 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
7476 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7477 arguments. If we are then we ensure that any allocation larger than the ABI
7478 defined buffer needs a probe so that the invariant of having a 1KB buffer is
7479 maintained.
7480
7481 We emit barriers after each stack adjustment to prevent optimizations from
7482 breaking the invariant that we never drop the stack more than a page. This
7483 invariant is needed to make it easier to correctly handle asynchronous
7484 events: if we were to allow the stack to be dropped by more than a page,
7485 with several probes still pending, and we took a signal somewhere in
7486 between, the signal handler would not know the state of the stack and
7487 could make no assumptions about which pages have been probed. */
7488
7489 static void
7490 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7491 poly_int64 poly_size,
7492 bool frame_related_p,
7493 bool final_adjustment_p)
7494 {
7495 HOST_WIDE_INT guard_size
7496 = 1 << param_stack_clash_protection_guard_size;
7497 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7498 HOST_WIDE_INT min_probe_threshold
7499 = (final_adjustment_p
7500 ? guard_used_by_caller
7501 : guard_size - guard_used_by_caller);
7502 /* When doing the final adjustment for the outgoing arguments, take into
7503 account any unprobed space there is above the current SP. There are
7504 two cases:
7505
7506 - When saving SVE registers below the hard frame pointer, we force
7507 the lowest save to take place in the prologue before doing the final
7508 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7509 This acts as a probe at SP, so there is no unprobed space.
7510
7511 - When there are no SVE register saves, we use the store of the link
7512 register as a probe. We can't assume that LR was saved at position 0
7513 though, so treat any space below it as unprobed. */
7514 if (final_adjustment_p
7515 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7516 {
7517 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7518 if (known_ge (lr_offset, 0))
7519 min_probe_threshold -= lr_offset.to_constant ();
7520 else
7521 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7522 }
7523
7524 poly_int64 frame_size = cfun->machine->frame.frame_size;
7525
7526 /* We should always have a positive probe threshold. */
7527 gcc_assert (min_probe_threshold > 0);
7528
7529 if (flag_stack_clash_protection && !final_adjustment_p)
7530 {
7531 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7532 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7533 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7534
7535 if (known_eq (frame_size, 0))
7536 {
7537 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7538 }
7539 else if (known_lt (initial_adjust + sve_callee_adjust,
7540 guard_size - guard_used_by_caller)
7541 && known_lt (final_adjust, guard_used_by_caller))
7542 {
7543 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7544 }
7545 }
7546
7547 /* If SIZE is not large enough to require probing, just adjust the stack and
7548 exit. */
7549 if (known_lt (poly_size, min_probe_threshold)
7550 || !flag_stack_clash_protection)
7551 {
7552 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7553 return;
7554 }
7555
7556 HOST_WIDE_INT size;
7557 /* Handle the SVE non-constant case first. */
7558 if (!poly_size.is_constant (&size))
7559 {
7560 if (dump_file)
7561 {
7562 fprintf (dump_file, "Stack clash SVE prologue: ");
7563 print_dec (poly_size, dump_file);
7564 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7565 }
7566
7567 /* First calculate the amount of bytes we're actually spilling. */
7568 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7569 poly_size, temp1, temp2, false, true);
7570
7571 rtx_insn *insn = get_last_insn ();
7572
7573 if (frame_related_p)
7574 {
7575 /* This is done to provide unwinding information for the stack
7576 adjustments we're about to do. However, to prevent the optimizers
7577 from removing the R11 move and leaving the CFA note (which would be
7578 very wrong), we tie the old and new stack pointers together.
7579 The tie will expand to nothing but the optimizers will not touch
7580 the instruction. */
7581 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7582 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7583 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7584
7585 /* We want the CFA independent of the stack pointer for the
7586 duration of the loop. */
7587 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7588 RTX_FRAME_RELATED_P (insn) = 1;
7589 }
7590
7591 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7592 rtx guard_const = gen_int_mode (guard_size, Pmode);
7593
7594 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7595 stack_pointer_rtx, temp1,
7596 probe_const, guard_const));
7597
7598 /* Now reset the CFA register if needed. */
7599 if (frame_related_p)
7600 {
7601 add_reg_note (insn, REG_CFA_DEF_CFA,
7602 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7603 gen_int_mode (poly_size, Pmode)));
7604 RTX_FRAME_RELATED_P (insn) = 1;
7605 }
7606
7607 return;
7608 }
7609
7610 if (dump_file)
7611 fprintf (dump_file,
7612 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7613 " bytes, probing will be required.\n", size);
7614
7615 /* Round size to the nearest multiple of guard_size, and calculate the
7616 residual as the difference between the original size and the rounded
7617 size. */
7618 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7619 HOST_WIDE_INT residual = size - rounded_size;
7620
7621 /* We can handle a small number of allocations/probes inline. Otherwise
7622 punt to a loop. */
7623 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7624 {
7625 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7626 {
7627 aarch64_sub_sp (NULL, temp2, guard_size, true);
7628 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7629 guard_used_by_caller));
7630 emit_insn (gen_blockage ());
7631 }
7632 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7633 }
7634 else
7635 {
7636 /* Compute the ending address. */
7637 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7638 temp1, NULL, false, true);
7639 rtx_insn *insn = get_last_insn ();
7640
7641 /* For the initial allocation, we don't have a frame pointer
7642 set up, so we always need CFI notes. If we're doing the
7643 final allocation, then we may have a frame pointer, in which
7644 case it is the CFA, otherwise we need CFI notes.
7645
7646 We can determine which allocation we are doing by looking at
7647 the value of FRAME_RELATED_P since the final allocations are not
7648 frame related. */
7649 if (frame_related_p)
7650 {
7651 /* We want the CFA independent of the stack pointer for the
7652 duration of the loop. */
7653 add_reg_note (insn, REG_CFA_DEF_CFA,
7654 plus_constant (Pmode, temp1, rounded_size));
7655 RTX_FRAME_RELATED_P (insn) = 1;
7656 }
7657
7658 /* This allocates and probes the stack. Note that this re-uses some of
7659 the existing Ada stack protection code. However, we are guaranteed not
7660 to enter the non-loop or residual branches of that code.
7661
7662 The non-loop part won't be entered because if our allocation amount
7663 doesn't require a loop, the case above would handle it.
7664
7665 The residual amount won't be entered because TEMP1 is a multiple of
7666 the allocation size. The residual will always be 0. As such, the only
7667 part we are actually using from that code is the loop setup. The
7668 actual probing is done in aarch64_output_probe_stack_range. */
7669 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7670 stack_pointer_rtx, temp1));
7671
7672 /* Now reset the CFA register if needed. */
7673 if (frame_related_p)
7674 {
7675 add_reg_note (insn, REG_CFA_DEF_CFA,
7676 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
7677 RTX_FRAME_RELATED_P (insn) = 1;
7678 }
7679
7680 emit_insn (gen_blockage ());
7681 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
7682 }
7683
7684 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7685 be probed. This maintains the requirement that each page is probed at
7686 least once. For initial probing we probe only if the allocation is
7687 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7688 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
7689 GUARD_SIZE. This means that for any allocation that is large enough to
7690 trigger a probe here, we'll have at least one, while for any allocation
7691 that is not large enough for this code to emit anything, the page will
7692 already have been probed by the save of FP/LR, either in this function or
7693 in one of its callees. If we don't have any callees then we won't have any
7694 further stack adjustments and so are still safe. */
7695 if (residual)
7696 {
7697 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
7698 /* If we're doing final adjustments, and we've done any full page
7699 allocations then any residual needs to be probed. */
7700 if (final_adjustment_p && rounded_size != 0)
7701 min_probe_threshold = 0;
7702 /* If doing a small final adjustment, we always probe at offset 0.
7703 This is done to avoid issues when LR is not at position 0 or when
7704 the final adjustment is smaller than the probing offset. */
7705 else if (final_adjustment_p && rounded_size == 0)
7706 residual_probe_offset = 0;
7707
7708 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
7709 if (residual >= min_probe_threshold)
7710 {
7711 if (dump_file)
7712 fprintf (dump_file,
7713 "Stack clash AArch64 prologue residuals: "
7714 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
7715 "\n", residual);
7716
7717 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7718 residual_probe_offset));
7719 emit_insn (gen_blockage ());
7720 }
7721 }
7722 }
7723
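/* As a sketch of the inline case above (assuming the default 64KiB guard
   and the 1KiB STACK_CLASH_CALLER_GUARD), allocating a little over two
   pages produces roughly:

	sub	sp, sp, #65536
	str	xzr, [sp, #1024]
	sub	sp, sp, #65536
	str	xzr, [sp, #1024]
	sub	sp, sp, #residual
	str	xzr, [sp, #1024]	// only if the residual itself needs a probe

   The loop-based variant replaces the unrolled subtractions with
   probe_stack_range.  */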
7724 /* Return 1 if the register is used by the epilogue. We need to say the
7725 return register is used, but only after epilogue generation is complete.
7726 Note that in the case of sibcalls, the values "used by the epilogue" are
7727 considered live at the start of the called function.
7728
7729 For SIMD functions we need to return 1 for FP registers that are saved and
7730 restored by a function but are not zero in call_used_regs. If we do not do
7731 this, optimizations may remove the restore of the register. */
7732
7733 int
7734 aarch64_epilogue_uses (int regno)
7735 {
7736 if (epilogue_completed)
7737 {
7738 if (regno == LR_REGNUM)
7739 return 1;
7740 }
7741 return 0;
7742 }
7743
7744 /* AArch64 stack frames generated by this compiler look like:
7745
7746 +-------------------------------+
7747 | |
7748 | incoming stack arguments |
7749 | |
7750 +-------------------------------+
7751 | | <-- incoming stack pointer (aligned)
7752 | callee-allocated save area |
7753 | for register varargs |
7754 | |
7755 +-------------------------------+
7756 | local variables | <-- frame_pointer_rtx
7757 | |
7758 +-------------------------------+
7759 | padding | \
7760 +-------------------------------+ |
7761 | callee-saved registers | | frame.saved_regs_size
7762 +-------------------------------+ |
7763 | LR' | |
7764 +-------------------------------+ |
7765 | FP' | |
7766 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
7767 | SVE vector registers | | \
7768 +-------------------------------+ | | below_hard_fp_saved_regs_size
7769 | SVE predicate registers | / /
7770 +-------------------------------+
7771 | dynamic allocation |
7772 +-------------------------------+
7773 | padding |
7774 +-------------------------------+
7775 | outgoing stack arguments | <-- arg_pointer
7776 | |
7777 +-------------------------------+
7778 | | <-- stack_pointer_rtx (aligned)
7779
7780 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7781 but leave frame_pointer_rtx and hard_frame_pointer_rtx
7782 unchanged.
7783
7784 By default for stack-clash we assume the guard is at least 64KB, but this
7785 value is configurable to either 4KB or 64KB. We also force the guard size to
7786 be the same as the probing interval and both values are kept in sync.
7787
7788 With those assumptions the callee can allocate up to 63KB (or 3KB depending
7789 on the guard size) of stack space without probing.
7790
7791 When probing is needed, we emit a probe at the start of the prologue
7792 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7793
7794 We have to track how much space has been allocated and the only stores
7795 to the stack we track as implicit probes are the FP/LR stores.
7796
7797 For outgoing arguments we probe if the size is larger than 1KB, such that
7798 the ABI specified buffer is maintained for the next callee.
7799
7800 The following registers are reserved during frame layout and should not be
7801 used for any other purpose:
7802
7803 - r11: Used by stack clash protection when SVE is enabled, and also
7804 as an anchor register when saving and restoring registers
7805 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7806 - r14 and r15: Used for speculation tracking.
7807 - r16(IP0), r17(IP1): Used by indirect tailcalls.
7808 - r30(LR), r29(FP): Used by standard frame layout.
7809
7810 These registers must be avoided in frame layout related code unless the
7811 explicit intention is to interact with one of the features listed above. */
7812
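/* For a small fixed-size frame this typically boils down to a prologue of
   the form (illustrative only; the exact split of the adjustments is
   decided by aarch64_layout_frame and emitted by aarch64_expand_prologue
   below):

	stp	x29, x30, [sp, #-FRAME]!	// callee_adjust, saves FP/LR
	mov	x29, sp				// set up the frame chain
	str	x19, [sp, #16]			// further callee saves, if any
	sub	sp, sp, #OUTGOING		// final_adjust, if needed

   and a matching epilogue that undoes the adjustments in reverse.  */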
7813 /* Generate the prologue instructions for entry into a function.
7814 Establish the stack frame by decreasing the stack pointer with a
7815 properly calculated size and, if necessary, create a frame record
7816 filled with the values of LR and previous frame pointer. The
7817 current FP is also set up if it is in use. */
7818
7819 void
7820 aarch64_expand_prologue (void)
7821 {
7822 poly_int64 frame_size = cfun->machine->frame.frame_size;
7823 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7824 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7825 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7826 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7827 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7828 poly_int64 below_hard_fp_saved_regs_size
7829 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7830 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7831 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7832 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
7833 rtx_insn *insn;
7834
7835 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
7836 {
7837 /* Fold the SVE allocation into the initial allocation.
7838 We don't do this in aarch64_layout_frame to avoid pessimizing
7839 the epilogue code. */
7840 initial_adjust += sve_callee_adjust;
7841 sve_callee_adjust = 0;
7842 }
7843
7844 /* Sign return address for functions. */
7845 if (aarch64_return_address_signing_enabled ())
7846 {
7847 switch (aarch64_ra_sign_key)
7848 {
7849 case AARCH64_KEY_A:
7850 insn = emit_insn (gen_paciasp ());
7851 break;
7852 case AARCH64_KEY_B:
7853 insn = emit_insn (gen_pacibsp ());
7854 break;
7855 default:
7856 gcc_unreachable ();
7857 }
7858 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7859 RTX_FRAME_RELATED_P (insn) = 1;
7860 }
7861
7862 if (flag_stack_usage_info)
7863 current_function_static_stack_size = constant_lower_bound (frame_size);
7864
7865 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7866 {
7867 if (crtl->is_leaf && !cfun->calls_alloca)
7868 {
7869 if (maybe_gt (frame_size, PROBE_INTERVAL)
7870 && maybe_gt (frame_size, get_stack_check_protect ()))
7871 aarch64_emit_probe_stack_range (get_stack_check_protect (),
7872 (frame_size
7873 - get_stack_check_protect ()));
7874 }
7875 else if (maybe_gt (frame_size, 0))
7876 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
7877 }
7878
7879 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7880 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7881
7882 /* In theory we should never have both an initial adjustment
7883 and a callee save adjustment. Verify that this is the case, since the
7884 code below does not handle it for -fstack-clash-protection. */
7885 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
7886
7887 /* Will only probe if the initial adjustment is larger than the guard
7888 less the amount of the guard reserved for use by the caller's
7889 outgoing args. */
7890 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
7891 true, false);
7892
7893 if (callee_adjust != 0)
7894 aarch64_push_regs (reg1, reg2, callee_adjust);
7895
7896 /* The offset of the frame chain record (if any) from the current SP. */
7897 poly_int64 chain_offset = (initial_adjust + callee_adjust
7898 - cfun->machine->frame.hard_fp_offset);
7899 gcc_assert (known_ge (chain_offset, 0));
7900
7901 /* The offset of the bottom of the save area from the current SP. */
7902 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
7903
7904 if (emit_frame_chain)
7905 {
7906 if (callee_adjust == 0)
7907 {
7908 reg1 = R29_REGNUM;
7909 reg2 = R30_REGNUM;
7910 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
7911 false, false);
7912 }
7913 else
7914 gcc_assert (known_eq (chain_offset, 0));
7915 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
7916 stack_pointer_rtx, chain_offset,
7917 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
7918 if (frame_pointer_needed && !frame_size.is_constant ())
7919 {
7920 /* Variable-sized frames need to describe the save slot
7921 address using DW_CFA_expression rather than DW_CFA_offset.
7922 This means that, without taking further action, the
7923 locations of the registers that we've already saved would
7924 remain based on the stack pointer even after we redefine
7925 the CFA based on the frame pointer. We therefore need new
7926 DW_CFA_expressions to re-express the save slots with addresses
7927 based on the frame pointer. */
7928 rtx_insn *insn = get_last_insn ();
7929 gcc_assert (RTX_FRAME_RELATED_P (insn));
7930
7931 /* Add an explicit CFA definition if this was previously
7932 implicit. */
7933 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
7934 {
7935 rtx src = plus_constant (Pmode, stack_pointer_rtx,
7936 callee_offset);
7937 add_reg_note (insn, REG_CFA_ADJUST_CFA,
7938 gen_rtx_SET (hard_frame_pointer_rtx, src));
7939 }
7940
7941 /* Change the save slot expressions for the registers that
7942 we've already saved. */
7943 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
7944 hard_frame_pointer_rtx, UNITS_PER_WORD);
7945 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
7946 hard_frame_pointer_rtx, 0);
7947 }
7948 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
7949 }
7950
7951 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
7952 callee_adjust != 0 || emit_frame_chain,
7953 emit_frame_chain);
7954 if (maybe_ne (sve_callee_adjust, 0))
7955 {
7956 gcc_assert (!flag_stack_clash_protection
7957 || known_eq (initial_adjust, 0));
7958 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
7959 sve_callee_adjust,
7960 !frame_pointer_needed, false);
7961 saved_regs_offset += sve_callee_adjust;
7962 }
7963 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
7964 false, emit_frame_chain);
7965 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
7966 callee_adjust != 0 || emit_frame_chain,
7967 emit_frame_chain);
7968
7969 /* We may need to probe the final adjustment if it is larger than the guard
7970 that is assumed by the callee. */
7971 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
7972 !frame_pointer_needed, true);
7973 }
7974
7975 /* Return TRUE if we can use a simple_return insn.
7976
7977 This function checks whether the callee-saved stack area is empty, which
7978 means no restore actions are needed. The pro_and_epilogue pass uses this
7979 to check whether the shrink-wrapping optimization is feasible. */
7980
7981 bool
7982 aarch64_use_return_insn_p (void)
7983 {
7984 if (!reload_completed)
7985 return false;
7986
7987 if (crtl->profile)
7988 return false;
7989
7990 return known_eq (cfun->machine->frame.frame_size, 0);
7991 }
7992
7993 /* Generate the epilogue instructions for returning from a function.
7994 This is almost exactly the reverse of the prolog sequence, except
7995 that we need to insert barriers to avoid scheduling loads that read
7996 from a deallocated stack, and we optimize the unwind records by
7997 emitting them all together if possible. */
7998 void
7999 aarch64_expand_epilogue (bool for_sibcall)
8000 {
8001 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8002 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8003 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8004 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8005 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8006 poly_int64 below_hard_fp_saved_regs_size
8007 = cfun->machine->frame.below_hard_fp_saved_regs_size;
8008 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8009 unsigned reg2 = cfun->machine->frame.wb_candidate2;
8010 rtx cfi_ops = NULL;
8011 rtx_insn *insn;
8012 /* A stack clash protection prologue may not have left EP0_REGNUM or
8013 EP1_REGNUM in a usable state. The same is true for allocations
8014 with an SVE component, since we then need both temporary registers
8015 for each allocation. For stack clash we are in a usable state if
8016 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
8017 HOST_WIDE_INT guard_size
8018 = 1 << param_stack_clash_protection_guard_size;
8019 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
8020
8021 /* We can re-use the registers when:
8022
8023 (a) the deallocation amount is the same as the corresponding
8024 allocation amount (which is false if we combine the initial
8025 and SVE callee save allocations in the prologue); and
8026
8027 (b) the allocation amount doesn't need a probe (which is false
8028 if the amount is guard_size - guard_used_by_caller or greater).
8029
8030 In such situations the register should remain live with the correct
8031 value. */
8032 bool can_inherit_p = (initial_adjust.is_constant ()
8033 && final_adjust.is_constant ()
8034 && (!flag_stack_clash_protection
8035 || (known_lt (initial_adjust,
8036 guard_size - guard_used_by_caller)
8037 && known_eq (sve_callee_adjust, 0))));
8038
8039 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
8040 bool need_barrier_p
8041 = maybe_ne (get_frame_size ()
8042 + cfun->machine->frame.saved_varargs_size, 0);
8043
8044 /* Emit a barrier to prevent loads from a deallocated stack. */
8045 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
8046 || cfun->calls_alloca
8047 || crtl->calls_eh_return)
8048 {
8049 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8050 need_barrier_p = false;
8051 }
8052
8053 /* Restore the stack pointer from the frame pointer if it may not
8054 be the same as the stack pointer. */
8055 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8056 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
8057 if (frame_pointer_needed
8058 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
8059 /* If writeback is used when restoring callee-saves, the CFA
8060 is restored on the instruction doing the writeback. */
8061 aarch64_add_offset (Pmode, stack_pointer_rtx,
8062 hard_frame_pointer_rtx,
8063 -callee_offset - below_hard_fp_saved_regs_size,
8064 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
8065 else
8066 /* The case where we need to re-use the register here is very rare, so
8067 avoid the complicated condition and just always emit a move if the
8068 immediate doesn't fit. */
8069 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
8070
8071 /* Restore the vector registers before the predicate registers,
8072 so that we can use P4 as a temporary for big-endian SVE frames. */
8073 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
8074 callee_adjust != 0, &cfi_ops);
8075 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
8076 false, &cfi_ops);
8077 if (maybe_ne (sve_callee_adjust, 0))
8078 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
8079 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
8080 R0_REGNUM, R30_REGNUM,
8081 callee_adjust != 0, &cfi_ops);
8082
8083 if (need_barrier_p)
8084 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8085
8086 if (callee_adjust != 0)
8087 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
8088
8089 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
8090 {
8091 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
8092 insn = get_last_insn ();
8093 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
8094 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
8095 RTX_FRAME_RELATED_P (insn) = 1;
8096 cfi_ops = NULL;
8097 }
8098
8099 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
8100 we restrict the emit_move optimization to leaf functions. */
8101 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
8102 (!can_inherit_p || !crtl->is_leaf
8103 || df_regs_ever_live_p (EP0_REGNUM)));
8104
8105 if (cfi_ops)
8106 {
8107 /* Emit delayed restores and reset the CFA to be SP. */
8108 insn = get_last_insn ();
8109 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
8110 REG_NOTES (insn) = cfi_ops;
8111 RTX_FRAME_RELATED_P (insn) = 1;
8112 }
8113
8114 /* We prefer to emit the combined return/authenticate instruction RETAA;
8115 however, there are three cases in which we must instead emit an explicit
8116 authentication instruction.
8117
8118 1) Sibcalls don't return in a normal way, so if we're about to call one
8119 we must authenticate.
8120
8121 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
8122 generating code for !TARGET_ARMV8_3 we can't use it and must
8123 explicitly authenticate.
8124
8125 3) On an eh_return path we make extra stack adjustments to update the
8126 canonical frame address to be the exception handler's CFA. We want
8127 to authenticate using the CFA of the function which calls eh_return.
8128 */
8129 if (aarch64_return_address_signing_enabled ()
8130 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
8131 {
8132 switch (aarch64_ra_sign_key)
8133 {
8134 case AARCH64_KEY_A:
8135 insn = emit_insn (gen_autiasp ());
8136 break;
8137 case AARCH64_KEY_B:
8138 insn = emit_insn (gen_autibsp ());
8139 break;
8140 default:
8141 gcc_unreachable ();
8142 }
8143 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8144 RTX_FRAME_RELATED_P (insn) = 1;
8145 }
8146
8147 /* Stack adjustment for exception handler. */
8148 if (crtl->calls_eh_return && !for_sibcall)
8149 {
8150 /* We need to unwind the stack by the offset computed by
8151 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
8152 to be SP; letting the CFA move during this adjustment
8153 is just as correct as retaining the CFA from the body
8154 of the function. Therefore, do nothing special. */
8155 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
8156 }
8157
8158 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
8159 if (!for_sibcall)
8160 emit_jump_insn (ret_rtx);
8161 }
8162
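/* The common small-frame case therefore ends with something like
   (illustrative):

	ldp	x29, x30, [sp], #FRAME
	retaa				// or autiasp + ret before Armv8.3-A

   when return address signing is enabled, and a plain RET otherwise.  */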
8163 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
8164 normally or return to a previous frame after unwinding.
8165
8166 An EH return uses a single shared return sequence. The epilogue is
8167 exactly like a normal epilogue except that it has an extra input
8168 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
8169 that must be applied after the frame has been destroyed. An extra label
8170 is inserted before the epilogue which initializes this register to zero,
8171 and this is the entry point for a normal return.
8172
8173 An actual EH return updates the return address, initializes the stack
8174 adjustment and jumps directly into the epilogue (bypassing the zeroing
8175 of the adjustment). Since the return address is typically saved on the
8176 stack when a function makes a call, the saved LR must be updated outside
8177 the epilogue.
8178
8179 This poses problems as the store is generated well before the epilogue,
8180 so the offset of LR is not known yet. Also optimizations will remove the
8181 store as it appears dead, even after the epilogue is generated (as the
8182 base or offset for loading LR is different in many cases).
8183
8184 To avoid these problems this implementation forces the frame pointer
8185 in eh_return functions so that the location of LR is fixed and known early.
8186 It also marks the store volatile, so no optimization is permitted to
8187 remove the store. */
8188 rtx
8189 aarch64_eh_return_handler_rtx (void)
8190 {
8191 rtx tmp = gen_frame_mem (Pmode,
8192 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
8193
8194 /* Mark the store volatile, so no optimization is permitted to remove it. */
8195 MEM_VOLATILE_P (tmp) = true;
8196 return tmp;
8197 }
8198
8199 /* Output code to add DELTA to the first argument, and then jump
8200 to FUNCTION. Used for C++ multiple inheritance. */
8201 static void
8202 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
8203 HOST_WIDE_INT delta,
8204 HOST_WIDE_INT vcall_offset,
8205 tree function)
8206 {
8207 /* The this pointer is always in x0. Note that this differs from
8208 Arm where the this pointer may be bumped to r1 if r0 is required
8209 to return a pointer to an aggregate. On AArch64 a result value
8210 pointer will be in x8. */
8211 int this_regno = R0_REGNUM;
8212 rtx this_rtx, temp0, temp1, addr, funexp;
8213 rtx_insn *insn;
8214 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
8215
8216 if (aarch64_bti_enabled ())
8217 emit_insn (gen_bti_c ());
8218
8219 reload_completed = 1;
8220 emit_note (NOTE_INSN_PROLOGUE_END);
8221
8222 this_rtx = gen_rtx_REG (Pmode, this_regno);
8223 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
8224 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
8225
8226 if (vcall_offset == 0)
8227 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
8228 else
8229 {
8230 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
8231
8232 addr = this_rtx;
8233 if (delta != 0)
8234 {
8235 if (delta >= -256 && delta < 256)
8236 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
8237 plus_constant (Pmode, this_rtx, delta));
8238 else
8239 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
8240 temp1, temp0, false);
8241 }
8242
8243 if (Pmode == ptr_mode)
8244 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
8245 else
8246 aarch64_emit_move (temp0,
8247 gen_rtx_ZERO_EXTEND (Pmode,
8248 gen_rtx_MEM (ptr_mode, addr)));
8249
8250 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
8251 addr = plus_constant (Pmode, temp0, vcall_offset);
8252 else
8253 {
8254 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
8255 Pmode);
8256 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
8257 }
8258
8259 if (Pmode == ptr_mode)
8260 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
8261 else
8262 aarch64_emit_move (temp1,
8263 gen_rtx_SIGN_EXTEND (Pmode,
8264 gen_rtx_MEM (ptr_mode, addr)));
8265
8266 emit_insn (gen_add2_insn (this_rtx, temp1));
8267 }
8268
8269 /* Generate a tail call to the target function. */
8270 if (!TREE_USED (function))
8271 {
8272 assemble_external (function);
8273 TREE_USED (function) = 1;
8274 }
8275 funexp = XEXP (DECL_RTL (function), 0);
8276 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
8277 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
8278 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
8279 SIBLING_CALL_P (insn) = 1;
8280
8281 insn = get_insns ();
8282 shorten_branches (insn);
8283
8284 assemble_start_function (thunk, fnname);
8285 final_start_function (insn, file, 1);
8286 final (insn, file, 1);
8287 final_end_function ();
8288 assemble_end_function (thunk, fnname);
8289
8290 /* Stop pretending to be a post-reload pass. */
8291 reload_completed = 0;
8292 }
8293
8294 static bool
8295 aarch64_tls_referenced_p (rtx x)
8296 {
8297 if (!TARGET_HAVE_TLS)
8298 return false;
8299 subrtx_iterator::array_type array;
8300 FOR_EACH_SUBRTX (iter, array, x, ALL)
8301 {
8302 const_rtx x = *iter;
8303 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
8304 return true;
8305 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
8306 TLS offsets, not real symbol references. */
8307 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8308 iter.skip_subrtxes ();
8309 }
8310 return false;
8311 }
8312
8313
8314 /* Return true if val can be encoded as a 12-bit unsigned immediate with
8315 a left shift of 0 or 12 bits. */
8316 bool
8317 aarch64_uimm12_shift (HOST_WIDE_INT val)
8318 {
8319 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
8320 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
8321 );
8322 }
8323
8324 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
8325 that can be created with a left shift of 0 or 12. */
8326 static HOST_WIDE_INT
8327 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
8328 {
8329 /* Check to see if the value fits in 24 bits, as that is the maximum we can
8330 handle correctly. */
8331 gcc_assert ((val & 0xffffff) == val);
8332
8333 if (((val & 0xfff) << 0) == val)
8334 return val;
8335
8336 return val & (0xfff << 12);
8337 }
8338
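/* For example, 0x123456 does not fit a single shifted 12-bit immediate, so
   it is clamped to 0x123000, which can be added with
   "add xD, xN, #0x123, lsl #12", leaving the remaining 0x456 to be handled
   separately by the caller.  */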
8339 /* Return true if val is an immediate that can be loaded into a
8340 register by a MOVZ instruction. */
8341 static bool
8342 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
8343 {
8344 if (GET_MODE_SIZE (mode) > 4)
8345 {
8346 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
8347 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
8348 return 1;
8349 }
8350 else
8351 {
8352 /* Ignore sign extension. */
8353 val &= (HOST_WIDE_INT) 0xffffffff;
8354 }
8355 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
8356 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
8357 }
8358
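/* For instance, 0x12340000 satisfies this test (a 16-bit value shifted by
   16) and can be loaded with a single MOVZ ("movz wD, #0x1234, lsl #16"),
   whereas 0x12345678 does not and needs a MOVZ/MOVK pair.  */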
8359 /* Test whether:
8360
8361 X = (X & AND_VAL) | IOR_VAL;
8362
8363 can be implemented using:
8364
8365 MOVK X, #(IOR_VAL >> shift), LSL #shift
8366
8367 Return the shift if so, otherwise return -1. */
8368 int
8369 aarch64_movk_shift (const wide_int_ref &and_val,
8370 const wide_int_ref &ior_val)
8371 {
8372 unsigned int precision = and_val.get_precision ();
8373 unsigned HOST_WIDE_INT mask = 0xffff;
8374 for (unsigned int shift = 0; shift < precision; shift += 16)
8375 {
8376 if (and_val == ~mask && (ior_val & mask) == ior_val)
8377 return shift;
8378 mask <<= 16;
8379 }
8380 return -1;
8381 }
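/* Illustrative example: with AND_VAL = 0xffffffff0000ffff (i.e.
   ~(0xffff << 16)) and IOR_VAL = 0x12340000, the function returns 16,
   matching the single instruction MOVK Xd, #0x1234, LSL #16.  */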
8382
8383 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
8384 64-bit (DImode) integer. */
8385
8386 static unsigned HOST_WIDE_INT
8387 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
8388 {
8389 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
8390 while (size < 64)
8391 {
8392 val &= (HOST_WIDE_INT_1U << size) - 1;
8393 val |= val << size;
8394 size *= 2;
8395 }
8396 return val;
8397 }
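/* Illustrative example: for an 8-bit element mode, VAL = 0xab is widened
   to 0xabab, then 0xabababab, then 0xabababababababab.  */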
8398
8399 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
8400
8401 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
8402 {
8403 0x0000000100000001ull,
8404 0x0001000100010001ull,
8405 0x0101010101010101ull,
8406 0x1111111111111111ull,
8407 0x5555555555555555ull,
8408 };
8409
8410
8411 /* Return true if val is a valid bitmask immediate. */
8412
8413 bool
8414 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
8415 {
8416 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
8417 int bits;
8418
8419 /* Check for a single sequence of one bits and return quickly if so.
8420 The special cases of all ones and all zeroes return false. */
8421 val = aarch64_replicate_bitmask_imm (val_in, mode);
8422 tmp = val + (val & -val);
8423
8424 if (tmp == (tmp & -tmp))
8425 return (val + 1) > 1;
8426
8427 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
8428 if (mode == SImode)
8429 val = (val << 32) | (val & 0xffffffff);
8430
8431 /* Invert if the immediate doesn't start with a zero bit - this means we
8432 only need to search for sequences of one bits. */
8433 if (val & 1)
8434 val = ~val;
8435
8436 /* Find the first set bit and set tmp to val with the first sequence of one
8437 bits removed. Return success if there is a single sequence of ones. */
8438 first_one = val & -val;
8439 tmp = val & (val + first_one);
8440
8441 if (tmp == 0)
8442 return true;
8443
8444 /* Find the next set bit and compute the difference in bit position. */
8445 next_one = tmp & -tmp;
8446 bits = clz_hwi (first_one) - clz_hwi (next_one);
8447 mask = val ^ tmp;
8448
8449 /* Check the bit position difference is a power of 2, and that the first
8450 sequence of one bits fits within 'bits' bits. */
8451 if ((mask >> bits) != 0 || bits != (bits & -bits))
8452 return false;
8453
8454 /* Check the sequence of one bits is repeated 64/bits times. */
8455 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
8456 }
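/* Worked example (illustrative): 0x00ff00ff00ff00ff is accepted because
   it is the 16-bit element 0x00ff repeated four times, so it can be
   encoded as a logical (bitmask) immediate.  0x0000000000001234 is
   rejected: after removing the first run of ones, the remaining bits do
   not form a repetition of a single power-of-two-sized element.  */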
8457
8458 /* Create a mask of ones covering the range from the lowest to the highest
8459 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
8460
8461 unsigned HOST_WIDE_INT
8462 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
8463 {
8464 int lowest_bit_set = ctz_hwi (val_in);
8465 int highest_bit_set = floor_log2 (val_in);
8466 gcc_assert (val_in != 0);
8467
8468 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
8469 (HOST_WIDE_INT_1U << lowest_bit_set));
8470 }
8471
8472 /* Create constant where bits outside of lowest bit set to highest bit set
8473 are set to 1. */
8474
8475 unsigned HOST_WIDE_INT
8476 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
8477 {
8478 return val_in | ~aarch64_and_split_imm1 (val_in);
8479 }
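/* Worked example (illustrative): for VAL_IN = 0x00f00f00,
   aarch64_and_split_imm1 returns 0x00ffff00 (ones from the lowest to the
   highest set bit) and aarch64_and_split_imm2 returns 0xfffffffffff00fff
   (VAL_IN with all bits outside that range set).  Since imm1 & imm2 ==
   VAL_IN, an AND with VAL_IN can be split into two ANDs with these
   (potentially encodable) bitmask immediates.  */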
8480
8481 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
8482
8483 bool
8484 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
8485 {
8486 scalar_int_mode int_mode;
8487 if (!is_a <scalar_int_mode> (mode, &int_mode))
8488 return false;
8489
8490 if (aarch64_bitmask_imm (val_in, int_mode))
8491 return false;
8492
8493 if (aarch64_move_imm (val_in, int_mode))
8494 return false;
8495
8496 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8497
8498 return aarch64_bitmask_imm (imm2, int_mode);
8499 }
8500
8501 /* Return true if val is an immediate that can be loaded into a
8502 register in a single instruction. */
8503 bool
8504 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
8505 {
8506 scalar_int_mode int_mode;
8507 if (!is_a <scalar_int_mode> (mode, &int_mode))
8508 return false;
8509
8510 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
8511 return true;
8512 return aarch64_bitmask_imm (val, int_mode);
8513 }
8514
8515 static bool
8516 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
8517 {
8518 rtx base, offset;
8519
8520 if (GET_CODE (x) == HIGH)
8521 return true;
8522
8523 /* There's no way to calculate VL-based values using relocations. */
8524 subrtx_iterator::array_type array;
8525 FOR_EACH_SUBRTX (iter, array, x, ALL)
8526 if (GET_CODE (*iter) == CONST_POLY_INT)
8527 return true;
8528
8529 split_const (x, &base, &offset);
8530 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
8531 {
8532 if (aarch64_classify_symbol (base, INTVAL (offset))
8533 != SYMBOL_FORCE_TO_MEM)
8534 return true;
8535 else
8536 /* Avoid generating a 64-bit relocation in ILP32; leave
8537 to aarch64_expand_mov_immediate to handle it properly. */
8538 return mode != ptr_mode;
8539 }
8540
8541 return aarch64_tls_referenced_p (x);
8542 }
8543
8544 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8545 The expansion for a table switch is quite expensive due to the number
8546 of instructions, the table lookup and the hard-to-predict indirect jump.
8547 When optimizing for speed at -O3, use the per-core tuning if set,
8548 otherwise use tables for more than 16 cases as a trade-off between size
8549 and performance. When optimizing for size, use the default setting. */
8550
8551 static unsigned int
8552 aarch64_case_values_threshold (void)
8553 {
8554 /* Use the specified limit for the number of cases before using jump
8555 tables at higher optimization levels. */
8556 if (optimize > 2
8557 && selected_cpu->tune->max_case_values != 0)
8558 return selected_cpu->tune->max_case_values;
8559 else
8560 return optimize_size ? default_case_values_threshold () : 17;
8561 }
8562
8563 /* Return true if register REGNO is a valid index register.
8564 STRICT_P is true if REG_OK_STRICT is in effect. */
8565
8566 bool
8567 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8568 {
8569 if (!HARD_REGISTER_NUM_P (regno))
8570 {
8571 if (!strict_p)
8572 return true;
8573
8574 if (!reg_renumber)
8575 return false;
8576
8577 regno = reg_renumber[regno];
8578 }
8579 return GP_REGNUM_P (regno);
8580 }
8581
8582 /* Return true if register REGNO is a valid base register.
8583 STRICT_P is true if REG_OK_STRICT is in effect. */
8584
8585 bool
8586 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8587 {
8588 if (!HARD_REGISTER_NUM_P (regno))
8589 {
8590 if (!strict_p)
8591 return true;
8592
8593 if (!reg_renumber)
8594 return false;
8595
8596 regno = reg_renumber[regno];
8597 }
8598
8599 /* The fake registers will be eliminated to either the stack or
8600 hard frame pointer, both of which are usually valid base registers.
8601 Reload deals with the cases where the eliminated form isn't valid. */
8602 return (GP_REGNUM_P (regno)
8603 || regno == SP_REGNUM
8604 || regno == FRAME_POINTER_REGNUM
8605 || regno == ARG_POINTER_REGNUM);
8606 }
8607
8608 /* Return true if X is a valid base register.
8609 STRICT_P is true if REG_OK_STRICT is in effect. */
8610
8611 static bool
8612 aarch64_base_register_rtx_p (rtx x, bool strict_p)
8613 {
8614 if (!strict_p
8615 && GET_CODE (x) == SUBREG
8616 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
8617 x = SUBREG_REG (x);
8618
8619 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8620 }
8621
8622 /* Return true if address offset is a valid index. If it is, fill in INFO
8623 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8624
8625 static bool
8626 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
8627 machine_mode mode, bool strict_p)
8628 {
8629 enum aarch64_address_type type;
8630 rtx index;
8631 int shift;
8632
8633 /* (reg:P) */
8634 if ((REG_P (x) || GET_CODE (x) == SUBREG)
8635 && GET_MODE (x) == Pmode)
8636 {
8637 type = ADDRESS_REG_REG;
8638 index = x;
8639 shift = 0;
8640 }
8641 /* (sign_extend:DI (reg:SI)) */
8642 else if ((GET_CODE (x) == SIGN_EXTEND
8643 || GET_CODE (x) == ZERO_EXTEND)
8644 && GET_MODE (x) == DImode
8645 && GET_MODE (XEXP (x, 0)) == SImode)
8646 {
8647 type = (GET_CODE (x) == SIGN_EXTEND)
8648 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8649 index = XEXP (x, 0);
8650 shift = 0;
8651 }
8652 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8653 else if (GET_CODE (x) == MULT
8654 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8655 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8656 && GET_MODE (XEXP (x, 0)) == DImode
8657 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8658 && CONST_INT_P (XEXP (x, 1)))
8659 {
8660 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8661 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8662 index = XEXP (XEXP (x, 0), 0);
8663 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8664 }
8665 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8666 else if (GET_CODE (x) == ASHIFT
8667 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8668 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8669 && GET_MODE (XEXP (x, 0)) == DImode
8670 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8671 && CONST_INT_P (XEXP (x, 1)))
8672 {
8673 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8674 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8675 index = XEXP (XEXP (x, 0), 0);
8676 shift = INTVAL (XEXP (x, 1));
8677 }
8678 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8679 else if ((GET_CODE (x) == SIGN_EXTRACT
8680 || GET_CODE (x) == ZERO_EXTRACT)
8681 && GET_MODE (x) == DImode
8682 && GET_CODE (XEXP (x, 0)) == MULT
8683 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8684 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8685 {
8686 type = (GET_CODE (x) == SIGN_EXTRACT)
8687 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8688 index = XEXP (XEXP (x, 0), 0);
8689 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8690 if (INTVAL (XEXP (x, 1)) != 32 + shift
8691 || INTVAL (XEXP (x, 2)) != 0)
8692 shift = -1;
8693 }
8694 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8695 (const_int 0xffffffff<<shift)) */
8696 else if (GET_CODE (x) == AND
8697 && GET_MODE (x) == DImode
8698 && GET_CODE (XEXP (x, 0)) == MULT
8699 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8700 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8701 && CONST_INT_P (XEXP (x, 1)))
8702 {
8703 type = ADDRESS_REG_UXTW;
8704 index = XEXP (XEXP (x, 0), 0);
8705 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8706 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8707 shift = -1;
8708 }
8709 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8710 else if ((GET_CODE (x) == SIGN_EXTRACT
8711 || GET_CODE (x) == ZERO_EXTRACT)
8712 && GET_MODE (x) == DImode
8713 && GET_CODE (XEXP (x, 0)) == ASHIFT
8714 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8715 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8716 {
8717 type = (GET_CODE (x) == SIGN_EXTRACT)
8718 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8719 index = XEXP (XEXP (x, 0), 0);
8720 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8721 if (INTVAL (XEXP (x, 1)) != 32 + shift
8722 || INTVAL (XEXP (x, 2)) != 0)
8723 shift = -1;
8724 }
8725 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8726 (const_int 0xffffffff<<shift)) */
8727 else if (GET_CODE (x) == AND
8728 && GET_MODE (x) == DImode
8729 && GET_CODE (XEXP (x, 0)) == ASHIFT
8730 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8731 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8732 && CONST_INT_P (XEXP (x, 1)))
8733 {
8734 type = ADDRESS_REG_UXTW;
8735 index = XEXP (XEXP (x, 0), 0);
8736 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8737 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8738 shift = -1;
8739 }
8740 /* (mult:P (reg:P) (const_int scale)) */
8741 else if (GET_CODE (x) == MULT
8742 && GET_MODE (x) == Pmode
8743 && GET_MODE (XEXP (x, 0)) == Pmode
8744 && CONST_INT_P (XEXP (x, 1)))
8745 {
8746 type = ADDRESS_REG_REG;
8747 index = XEXP (x, 0);
8748 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8749 }
8750 /* (ashift:P (reg:P) (const_int shift)) */
8751 else if (GET_CODE (x) == ASHIFT
8752 && GET_MODE (x) == Pmode
8753 && GET_MODE (XEXP (x, 0)) == Pmode
8754 && CONST_INT_P (XEXP (x, 1)))
8755 {
8756 type = ADDRESS_REG_REG;
8757 index = XEXP (x, 0);
8758 shift = INTVAL (XEXP (x, 1));
8759 }
8760 else
8761 return false;
8762
8763 if (!strict_p
8764 && GET_CODE (index) == SUBREG
8765 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
8766 index = SUBREG_REG (index);
8767
8768 if (aarch64_sve_data_mode_p (mode))
8769 {
8770 if (type != ADDRESS_REG_REG
8771 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
8772 return false;
8773 }
8774 else
8775 {
8776 if (shift != 0
8777 && !(IN_RANGE (shift, 1, 3)
8778 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
8779 return false;
8780 }
8781
8782 if (REG_P (index)
8783 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
8784 {
8785 info->type = type;
8786 info->offset = index;
8787 info->shift = shift;
8788 return true;
8789 }
8790
8791 return false;
8792 }
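/* Illustrative examples of the forms recognised above (Pmode == DImode):
   (ashift:DI (reg:DI x1) (const_int 3)) used with a DImode access gives
   ADDRESS_REG_REG with shift 3, i.e. [Xn, X1, LSL #3];
   (sign_extend:DI (reg:SI w1)) gives ADDRESS_REG_SXTW with shift 0,
   i.e. [Xn, W1, SXTW].  */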
8793
8794 /* Return true if MODE is one of the modes for which we
8795 support LDP/STP operations. */
8796
8797 static bool
8798 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
8799 {
8800 return mode == SImode || mode == DImode
8801 || mode == SFmode || mode == DFmode
8802 || (aarch64_vector_mode_supported_p (mode)
8803 && (known_eq (GET_MODE_SIZE (mode), 8)
8804 || (known_eq (GET_MODE_SIZE (mode), 16)
8805 && (aarch64_tune_params.extra_tuning_flags
8806 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
8807 }
8808
8809 /* Return true if REGNO is a virtual pointer register, or an eliminable
8810 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
8811 include stack_pointer or hard_frame_pointer. */
8812 static bool
8813 virt_or_elim_regno_p (unsigned regno)
8814 {
8815 return ((regno >= FIRST_VIRTUAL_REGISTER
8816 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
8817 || regno == FRAME_POINTER_REGNUM
8818 || regno == ARG_POINTER_REGNUM);
8819 }
8820
8821 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8822 If it is, fill in INFO appropriately. STRICT_P is true if
8823 REG_OK_STRICT is in effect. */
8824
8825 bool
8826 aarch64_classify_address (struct aarch64_address_info *info,
8827 rtx x, machine_mode mode, bool strict_p,
8828 aarch64_addr_query_type type)
8829 {
8830 enum rtx_code code = GET_CODE (x);
8831 rtx op0, op1;
8832 poly_int64 offset;
8833
8834 HOST_WIDE_INT const_size;
8835
8836 /* Whether a vector mode is partial doesn't affect address legitimacy.
8837 Partial vectors like VNx8QImode allow the same indexed addressing
8838 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8839 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8840 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8841 vec_flags &= ~VEC_PARTIAL;
8842
8843 /* On BE, we use load/store pair for all large int mode load/stores.
8844 TI/TFmode may also use a load/store pair. */
8845 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
8846 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
8847 || type == ADDR_QUERY_LDP_STP_N
8848 || mode == TImode
8849 || mode == TFmode
8850 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
8851
8852 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
8853 corresponds to the actual size of the memory being loaded/stored and
8854 the mode used for the addressing-mode check is half of that. */
8855 if (type == ADDR_QUERY_LDP_STP_N
8856 && known_eq (GET_MODE_SIZE (mode), 16))
8857 mode = DFmode;
8858
8859 bool allow_reg_index_p = (!load_store_pair_p
8860 && (known_lt (GET_MODE_SIZE (mode), 16)
8861 || vec_flags == VEC_ADVSIMD
8862 || vec_flags & VEC_SVE_DATA));
8863
8864 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8865 [Rn, #offset, MUL VL]. */
8866 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
8867 && (code != REG && code != PLUS))
8868 return false;
8869
8870 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8871 REG addressing. */
8872 if (advsimd_struct_p
8873 && !BYTES_BIG_ENDIAN
8874 && (code != POST_INC && code != REG))
8875 return false;
8876
8877 gcc_checking_assert (GET_MODE (x) == VOIDmode
8878 || SCALAR_INT_MODE_P (GET_MODE (x)));
8879
8880 switch (code)
8881 {
8882 case REG:
8883 case SUBREG:
8884 info->type = ADDRESS_REG_IMM;
8885 info->base = x;
8886 info->offset = const0_rtx;
8887 info->const_offset = 0;
8888 return aarch64_base_register_rtx_p (x, strict_p);
8889
8890 case PLUS:
8891 op0 = XEXP (x, 0);
8892 op1 = XEXP (x, 1);
8893
8894 if (! strict_p
8895 && REG_P (op0)
8896 && virt_or_elim_regno_p (REGNO (op0))
8897 && poly_int_rtx_p (op1, &offset))
8898 {
8899 info->type = ADDRESS_REG_IMM;
8900 info->base = op0;
8901 info->offset = op1;
8902 info->const_offset = offset;
8903
8904 return true;
8905 }
8906
8907 if (maybe_ne (GET_MODE_SIZE (mode), 0)
8908 && aarch64_base_register_rtx_p (op0, strict_p)
8909 && poly_int_rtx_p (op1, &offset))
8910 {
8911 info->type = ADDRESS_REG_IMM;
8912 info->base = op0;
8913 info->offset = op1;
8914 info->const_offset = offset;
8915
8916 /* TImode and TFmode values are allowed in both pairs of X
8917 registers and individual Q registers. The available
8918 address modes are:
8919 X,X: 7-bit signed scaled offset
8920 Q: 9-bit signed offset
8921 We conservatively require an offset representable in either mode.
8922 When performing the check for pairs of X registers i.e. LDP/STP
8923 pass down DImode since that is the natural size of the LDP/STP
8924 instruction memory accesses. */
8925 if (mode == TImode || mode == TFmode)
8926 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
8927 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8928 || offset_12bit_unsigned_scaled_p (mode, offset)));
8929
8930 /* A 7-bit offset check because OImode will emit an ldp/stp
8931 instruction (only big endian will get here).
8932 For ldp/stp instructions, the offset is scaled for the size of a
8933 single element of the pair. */
8934 if (mode == OImode)
8935 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
8936
8937 /* Three 9/12-bit offset checks because CImode will emit three
8938 ldr/str instructions (only big endian will get here). */
8939 if (mode == CImode)
8940 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8941 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
8942 offset + 32)
8943 || offset_12bit_unsigned_scaled_p (V16QImode,
8944 offset + 32)));
8945
8946 /* Two 7-bit offset checks because XImode will emit two ldp/stp
8947 instructions (only big endian will get here). */
8948 if (mode == XImode)
8949 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8950 && aarch64_offset_7bit_signed_scaled_p (TImode,
8951 offset + 32));
8952
8953 /* Make "m" use the LD1 offset range for SVE data modes, so
8954 that pre-RTL optimizers like ivopts will work to that
8955 instead of the wider LDR/STR range. */
8956 if (vec_flags == VEC_SVE_DATA)
8957 return (type == ADDR_QUERY_M
8958 ? offset_4bit_signed_scaled_p (mode, offset)
8959 : offset_9bit_signed_scaled_p (mode, offset));
8960
8961 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
8962 {
8963 poly_int64 end_offset = (offset
8964 + GET_MODE_SIZE (mode)
8965 - BYTES_PER_SVE_VECTOR);
8966 return (type == ADDR_QUERY_M
8967 ? offset_4bit_signed_scaled_p (mode, offset)
8968 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
8969 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
8970 end_offset)));
8971 }
8972
8973 if (vec_flags == VEC_SVE_PRED)
8974 return offset_9bit_signed_scaled_p (mode, offset);
8975
8976 if (load_store_pair_p)
8977 return ((known_eq (GET_MODE_SIZE (mode), 4)
8978 || known_eq (GET_MODE_SIZE (mode), 8)
8979 || known_eq (GET_MODE_SIZE (mode), 16))
8980 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8981 else
8982 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8983 || offset_12bit_unsigned_scaled_p (mode, offset));
8984 }
8985
8986 if (allow_reg_index_p)
8987 {
8988 /* Look for base + (scaled/extended) index register. */
8989 if (aarch64_base_register_rtx_p (op0, strict_p)
8990 && aarch64_classify_index (info, op1, mode, strict_p))
8991 {
8992 info->base = op0;
8993 return true;
8994 }
8995 if (aarch64_base_register_rtx_p (op1, strict_p)
8996 && aarch64_classify_index (info, op0, mode, strict_p))
8997 {
8998 info->base = op1;
8999 return true;
9000 }
9001 }
9002
9003 return false;
9004
9005 case POST_INC:
9006 case POST_DEC:
9007 case PRE_INC:
9008 case PRE_DEC:
9009 info->type = ADDRESS_REG_WB;
9010 info->base = XEXP (x, 0);
9011 info->offset = NULL_RTX;
9012 return aarch64_base_register_rtx_p (info->base, strict_p);
9013
9014 case POST_MODIFY:
9015 case PRE_MODIFY:
9016 info->type = ADDRESS_REG_WB;
9017 info->base = XEXP (x, 0);
9018 if (GET_CODE (XEXP (x, 1)) == PLUS
9019 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
9020 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
9021 && aarch64_base_register_rtx_p (info->base, strict_p))
9022 {
9023 info->offset = XEXP (XEXP (x, 1), 1);
9024 info->const_offset = offset;
9025
9026 /* TImode and TFmode values are allowed in both pairs of X
9027 registers and individual Q registers. The available
9028 address modes are:
9029 X,X: 7-bit signed scaled offset
9030 Q: 9-bit signed offset
9031 We conservatively require an offset representable in either mode.
9032 */
9033 if (mode == TImode || mode == TFmode)
9034 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
9035 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
9036
9037 if (load_store_pair_p)
9038 return ((known_eq (GET_MODE_SIZE (mode), 4)
9039 || known_eq (GET_MODE_SIZE (mode), 8)
9040 || known_eq (GET_MODE_SIZE (mode), 16))
9041 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9042 else
9043 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
9044 }
9045 return false;
9046
9047 case CONST:
9048 case SYMBOL_REF:
9049 case LABEL_REF:
9050 /* load literal: pc-relative constant pool entry. Only supported
9051 for SI mode or larger. */
9052 info->type = ADDRESS_SYMBOLIC;
9053
9054 if (!load_store_pair_p
9055 && GET_MODE_SIZE (mode).is_constant (&const_size)
9056 && const_size >= 4)
9057 {
9058 rtx sym, addend;
9059
9060 split_const (x, &sym, &addend);
9061 return ((GET_CODE (sym) == LABEL_REF
9062 || (GET_CODE (sym) == SYMBOL_REF
9063 && CONSTANT_POOL_ADDRESS_P (sym)
9064 && aarch64_pcrelative_literal_loads)));
9065 }
9066 return false;
9067
9068 case LO_SUM:
9069 info->type = ADDRESS_LO_SUM;
9070 info->base = XEXP (x, 0);
9071 info->offset = XEXP (x, 1);
9072 if (allow_reg_index_p
9073 && aarch64_base_register_rtx_p (info->base, strict_p))
9074 {
9075 rtx sym, offs;
9076 split_const (info->offset, &sym, &offs);
9077 if (GET_CODE (sym) == SYMBOL_REF
9078 && (aarch64_classify_symbol (sym, INTVAL (offs))
9079 == SYMBOL_SMALL_ABSOLUTE))
9080 {
9081 /* The symbol and offset must be aligned to the access size. */
9082 unsigned int align;
9083
9084 if (CONSTANT_POOL_ADDRESS_P (sym))
9085 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
9086 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
9087 {
9088 tree exp = SYMBOL_REF_DECL (sym);
9089 align = TYPE_ALIGN (TREE_TYPE (exp));
9090 align = aarch64_constant_alignment (exp, align);
9091 }
9092 else if (SYMBOL_REF_DECL (sym))
9093 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
9094 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
9095 && SYMBOL_REF_BLOCK (sym) != NULL)
9096 align = SYMBOL_REF_BLOCK (sym)->alignment;
9097 else
9098 align = BITS_PER_UNIT;
9099
9100 poly_int64 ref_size = GET_MODE_SIZE (mode);
9101 if (known_eq (ref_size, 0))
9102 ref_size = GET_MODE_SIZE (DImode);
9103
9104 return (multiple_p (INTVAL (offs), ref_size)
9105 && multiple_p (align / BITS_PER_UNIT, ref_size));
9106 }
9107 }
9108 return false;
9109
9110 default:
9111 return false;
9112 }
9113 }
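/* Illustrative examples (DImode access, Pmode == DImode):
   (plus:DI (reg:DI x0) (const_int 16)) classifies as ADDRESS_REG_IMM
   with const_offset 16 (a valid unsigned scaled offset), while
   (post_inc:DI (reg:DI x0)) classifies as ADDRESS_REG_WB.  */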
9114
9115 /* Return true if the address X is valid for a PRFM instruction.
9116 STRICT_P is true if we should do strict checking with
9117 aarch64_classify_address. */
9118
9119 bool
9120 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
9121 {
9122 struct aarch64_address_info addr;
9123
9124 /* PRFM accepts the same addresses as DImode... */
9125 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
9126 if (!res)
9127 return false;
9128
9129 /* ... except writeback forms. */
9130 return addr.type != ADDRESS_REG_WB;
9131 }
9132
9133 bool
9134 aarch64_symbolic_address_p (rtx x)
9135 {
9136 rtx offset;
9137
9138 split_const (x, &x, &offset);
9139 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
9140 }
9141
9142 /* Classify the base of symbolic expression X. */
9143
9144 enum aarch64_symbol_type
9145 aarch64_classify_symbolic_expression (rtx x)
9146 {
9147 rtx offset;
9148
9149 split_const (x, &x, &offset);
9150 return aarch64_classify_symbol (x, INTVAL (offset));
9151 }
9152
9153
9154 /* Return TRUE if X is a legitimate address for accessing memory in
9155 mode MODE. */
9156 static bool
9157 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
9158 {
9159 struct aarch64_address_info addr;
9160
9161 return aarch64_classify_address (&addr, x, mode, strict_p);
9162 }
9163
9164 /* Return TRUE if X is a legitimate address of type TYPE for accessing
9165 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
9166 bool
9167 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
9168 aarch64_addr_query_type type)
9169 {
9170 struct aarch64_address_info addr;
9171
9172 return aarch64_classify_address (&addr, x, mode, strict_p, type);
9173 }
9174
9175 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
9176
9177 static bool
9178 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
9179 poly_int64 orig_offset,
9180 machine_mode mode)
9181 {
9182 HOST_WIDE_INT size;
9183 if (GET_MODE_SIZE (mode).is_constant (&size))
9184 {
9185 HOST_WIDE_INT const_offset, second_offset;
9186
9187 /* A general SVE offset is A * VQ + B. Remove the A component from
9188 coefficient 0 in order to get the constant B. */
9189 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
9190
9191 /* Split an out-of-range address displacement into a base and
9192 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
9193 range otherwise to increase opportunities for sharing the base
9194 address of different sizes. Unaligned accesses use the signed
9195 9-bit range, TImode/TFmode use the intersection of signed
9196 scaled 7-bit and signed 9-bit offset. */
9197 if (mode == TImode || mode == TFmode)
9198 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
9199 else if ((const_offset & (size - 1)) != 0)
9200 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
9201 else
9202 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
9203
9204 if (second_offset == 0 || known_eq (orig_offset, second_offset))
9205 return false;
9206
9207 /* Split the offset into second_offset and the rest. */
9208 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9209 *offset2 = gen_int_mode (second_offset, Pmode);
9210 return true;
9211 }
9212 else
9213 {
9214 /* Get the mode we should use as the basis of the range. For structure
9215 modes this is the mode of one vector. */
9216 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9217 machine_mode step_mode
9218 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
9219
9220 /* Get the "mul vl" multiplier we'd like to use. */
9221 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
9222 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
9223 if (vec_flags & VEC_SVE_DATA)
9224 /* LDR supports a 9-bit range, but the move patterns for
9225 structure modes require all vectors to be in range of the
9226 same base. The simplest way of accommodating that while still
9227 promoting reuse of anchor points between different modes is
9228 to use an 8-bit range unconditionally. */
9229 vnum = ((vnum + 128) & 255) - 128;
9230 else
9231 /* Predicates are only handled singly, so we might as well use
9232 the full range. */
9233 vnum = ((vnum + 256) & 511) - 256;
9234 if (vnum == 0)
9235 return false;
9236
9237 /* Convert the "mul vl" multiplier into a byte offset. */
9238 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
9239 if (known_eq (second_offset, orig_offset))
9240 return false;
9241
9242 /* Split the offset into second_offset and the rest. */
9243 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9244 *offset2 = gen_int_mode (second_offset, Pmode);
9245 return true;
9246 }
9247 }
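/* Worked example (illustrative): for an SImode access with
   ORIG_OFFSET = 0x10010, the offset is aligned, so
   second_offset = 0x10010 & 0x3ffc = 0x10.  The split is then
   *OFFSET1 = 0x10000 (added to the base and shareable between accesses)
   and *OFFSET2 = 0x10 (a legal scaled immediate offset).  */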
9248
9249 /* Return the binary representation of floating point constant VALUE in INTVAL.
9250 If the value cannot be converted, return false without setting INTVAL.
9251 The conversion is done in the mode of VALUE. */
9252 bool
9253 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
9254 {
9255
9256 /* We make a general exception for 0. */
9257 if (aarch64_float_const_zero_rtx_p (value))
9258 {
9259 *intval = 0;
9260 return true;
9261 }
9262
9263 scalar_float_mode mode;
9264 if (GET_CODE (value) != CONST_DOUBLE
9265 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
9266 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
9267 /* Only support up to DF mode. */
9268 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
9269 return false;
9270
9271 unsigned HOST_WIDE_INT ival = 0;
9272
9273 long res[2];
9274 real_to_target (res,
9275 CONST_DOUBLE_REAL_VALUE (value),
9276 REAL_MODE_FORMAT (mode));
9277
9278 if (mode == DFmode)
9279 {
9280 int order = BYTES_BIG_ENDIAN ? 1 : 0;
9281 ival = zext_hwi (res[order], 32);
9282 ival |= (zext_hwi (res[1 - order], 32) << 32);
9283 }
9284 else
9285 ival = zext_hwi (res[0], 32);
9286
9287 *intval = ival;
9288 return true;
9289 }
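/* Illustrative example: for the DFmode constant 1.0 this stores the IEEE
   bit pattern 0x3ff0000000000000 in *INTVAL; for the SFmode constant 1.0
   it stores 0x3f800000.  */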
9290
9291 /* Return TRUE if rtx X is an immediate constant that can be moved using a
9292 single MOV(+MOVK) followed by an FMOV. */
9293 bool
9294 aarch64_float_const_rtx_p (rtx x)
9295 {
9296 machine_mode mode = GET_MODE (x);
9297 if (mode == VOIDmode)
9298 return false;
9299
9300 /* Determine whether it's cheaper to write float constants as
9301 mov/movk pairs rather than ldr/adrp pairs. */
9302 unsigned HOST_WIDE_INT ival;
9303
9304 if (GET_CODE (x) == CONST_DOUBLE
9305 && SCALAR_FLOAT_MODE_P (mode)
9306 && aarch64_reinterpret_float_as_int (x, &ival))
9307 {
9308 scalar_int_mode imode = (mode == HFmode
9309 ? SImode
9310 : int_mode_for_mode (mode).require ());
9311 int num_instr = aarch64_internal_mov_immediate
9312 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9313 return num_instr < 3;
9314 }
9315
9316 return false;
9317 }
9318
9319 /* Return TRUE if rtx X is the immediate constant 0.0. */
9320 bool
9321 aarch64_float_const_zero_rtx_p (rtx x)
9322 {
9323 if (GET_MODE (x) == VOIDmode)
9324 return false;
9325
9326 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
9327 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
9328 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
9329 }
9330
9331 /* Return TRUE if rtx X is an immediate constant that fits in a single
9332 MOVI immediate operation. */
9333 bool
9334 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
9335 {
9336 if (!TARGET_SIMD)
9337 return false;
9338
9339 machine_mode vmode;
9340 scalar_int_mode imode;
9341 unsigned HOST_WIDE_INT ival;
9342
9343 if (GET_CODE (x) == CONST_DOUBLE
9344 && SCALAR_FLOAT_MODE_P (mode))
9345 {
9346 if (!aarch64_reinterpret_float_as_int (x, &ival))
9347 return false;
9348
9349 /* We make a general exception for 0. */
9350 if (aarch64_float_const_zero_rtx_p (x))
9351 return true;
9352
9353 imode = int_mode_for_mode (mode).require ();
9354 }
9355 else if (GET_CODE (x) == CONST_INT
9356 && is_a <scalar_int_mode> (mode, &imode))
9357 ival = INTVAL (x);
9358 else
9359 return false;
9360
9361 /* Use a 64-bit mode for everything except DI/DF mode, where we use
9362 a 128-bit vector mode. */
9363 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
9364
9365 vmode = aarch64_simd_container_mode (imode, width);
9366 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
9367
9368 return aarch64_simd_valid_immediate (v_op, NULL);
9369 }
9370
9371
9372 /* Return the fixed registers used for condition codes. */
9373
9374 static bool
9375 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
9376 {
9377 *p1 = CC_REGNUM;
9378 *p2 = INVALID_REGNUM;
9379 return true;
9380 }
9381
9382 /* This function is used by the call expanders of the machine description.
9383 RESULT is the register in which the result is returned. It's NULL for
9384 "call" and "sibcall".
9385 MEM is the location of the function call.
9386 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
9387 SIBCALL indicates whether this function call is a normal call or a sibling
9388 call; a different pattern is generated accordingly. */
9389
9390 void
9391 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
9392 {
9393 rtx call, callee, tmp;
9394 rtvec vec;
9395 machine_mode mode;
9396
9397 gcc_assert (MEM_P (mem));
9398 callee = XEXP (mem, 0);
9399 mode = GET_MODE (callee);
9400 gcc_assert (mode == Pmode);
9401
9402 /* Decide if we should generate indirect calls by loading the
9403 address of the callee into a register before performing
9404 the branch-and-link. */
9405 if (SYMBOL_REF_P (callee)
9406 ? (aarch64_is_long_call_p (callee)
9407 || aarch64_is_noplt_call_p (callee))
9408 : !REG_P (callee))
9409 XEXP (mem, 0) = force_reg (mode, callee);
9410
9411 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
9412
9413 if (result != NULL_RTX)
9414 call = gen_rtx_SET (result, call);
9415
9416 if (sibcall)
9417 tmp = ret_rtx;
9418 else
9419 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
9420
9421 gcc_assert (CONST_INT_P (callee_abi));
9422 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
9423 UNSPEC_CALLEE_ABI);
9424
9425 vec = gen_rtvec (3, call, callee_abi, tmp);
9426 call = gen_rtx_PARALLEL (VOIDmode, vec);
9427
9428 aarch64_emit_call_insn (call);
9429 }
9430
9431 /* Emit call insn with PAT and do aarch64-specific handling. */
9432
9433 void
9434 aarch64_emit_call_insn (rtx pat)
9435 {
9436 rtx insn = emit_call_insn (pat);
9437
9438 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
9439 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
9440 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
9441 }
9442
9443 machine_mode
9444 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
9445 {
9446 machine_mode mode_x = GET_MODE (x);
9447 rtx_code code_x = GET_CODE (x);
9448
9449 /* All floating point compares return CCFP if it is an equality
9450 comparison, and CCFPE otherwise. */
9451 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
9452 {
9453 switch (code)
9454 {
9455 case EQ:
9456 case NE:
9457 case UNORDERED:
9458 case ORDERED:
9459 case UNLT:
9460 case UNLE:
9461 case UNGT:
9462 case UNGE:
9463 case UNEQ:
9464 return CCFPmode;
9465
9466 case LT:
9467 case LE:
9468 case GT:
9469 case GE:
9470 case LTGT:
9471 return CCFPEmode;
9472
9473 default:
9474 gcc_unreachable ();
9475 }
9476 }
9477
9478 /* Equality comparisons of short modes against zero can be performed
9479 using the TST instruction with the appropriate bitmask. */
9480 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
9481 && (code == EQ || code == NE)
9482 && (mode_x == HImode || mode_x == QImode))
9483 return CC_NZmode;
9484
9485 /* Similarly, comparisons of zero_extends from shorter modes can
9486 be performed using an ANDS with an immediate mask. */
9487 if (y == const0_rtx && code_x == ZERO_EXTEND
9488 && (mode_x == SImode || mode_x == DImode)
9489 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
9490 && (code == EQ || code == NE))
9491 return CC_NZmode;
9492
9493 if ((mode_x == SImode || mode_x == DImode)
9494 && y == const0_rtx
9495 && (code == EQ || code == NE || code == LT || code == GE)
9496 && (code_x == PLUS || code_x == MINUS || code_x == AND
9497 || code_x == NEG
9498 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
9499 && CONST_INT_P (XEXP (x, 2)))))
9500 return CC_NZmode;
9501
9502 /* A compare with a shifted operand. Because of canonicalization,
9503 the comparison will have to be swapped when we emit the assembly
9504 code. */
9505 if ((mode_x == SImode || mode_x == DImode)
9506 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
9507 && (code_x == ASHIFT || code_x == ASHIFTRT
9508 || code_x == LSHIFTRT
9509 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
9510 return CC_SWPmode;
9511
9512 /* Similarly for a negated operand, but we can only do this for
9513 equalities. */
9514 if ((mode_x == SImode || mode_x == DImode)
9515 && (REG_P (y) || GET_CODE (y) == SUBREG)
9516 && (code == EQ || code == NE)
9517 && code_x == NEG)
9518 return CC_Zmode;
9519
9520 /* A test for unsigned overflow from an addition. */
9521 if ((mode_x == DImode || mode_x == TImode)
9522 && (code == LTU || code == GEU)
9523 && code_x == PLUS
9524 && rtx_equal_p (XEXP (x, 0), y))
9525 return CC_Cmode;
9526
9527 /* A test for unsigned overflow from an add with carry. */
9528 if ((mode_x == DImode || mode_x == TImode)
9529 && (code == LTU || code == GEU)
9530 && code_x == PLUS
9531 && CONST_SCALAR_INT_P (y)
9532 && (rtx_mode_t (y, mode_x)
9533 == (wi::shwi (1, mode_x)
9534 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9535 return CC_ADCmode;
9536
9537 /* A test for signed overflow. */
9538 if ((mode_x == DImode || mode_x == TImode)
9539 && code == NE
9540 && code_x == PLUS
9541 && GET_CODE (y) == SIGN_EXTEND)
9542 return CC_Vmode;
9543
9544 /* For everything else, return CCmode. */
9545 return CCmode;
9546 }
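/* Illustrative examples: comparing (plus:DI x y) against zero for EQ
   selects CC_NZmode (the flags come from an ADDS); comparing a shifted
   operand such as (ashift:DI x n) against a register selects CC_SWPmode;
   an ordered DFmode comparison such as LT selects CCFPEmode, while an
   unordered one such as UNLT selects CCFPmode.  */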
9547
9548 static int
9549 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
9550
9551 int
9552 aarch64_get_condition_code (rtx x)
9553 {
9554 machine_mode mode = GET_MODE (XEXP (x, 0));
9555 enum rtx_code comp_code = GET_CODE (x);
9556
9557 if (GET_MODE_CLASS (mode) != MODE_CC)
9558 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
9559 return aarch64_get_condition_code_1 (mode, comp_code);
9560 }
9561
9562 static int
9563 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
9564 {
9565 switch (mode)
9566 {
9567 case E_CCFPmode:
9568 case E_CCFPEmode:
9569 switch (comp_code)
9570 {
9571 case GE: return AARCH64_GE;
9572 case GT: return AARCH64_GT;
9573 case LE: return AARCH64_LS;
9574 case LT: return AARCH64_MI;
9575 case NE: return AARCH64_NE;
9576 case EQ: return AARCH64_EQ;
9577 case ORDERED: return AARCH64_VC;
9578 case UNORDERED: return AARCH64_VS;
9579 case UNLT: return AARCH64_LT;
9580 case UNLE: return AARCH64_LE;
9581 case UNGT: return AARCH64_HI;
9582 case UNGE: return AARCH64_PL;
9583 default: return -1;
9584 }
9585 break;
9586
9587 case E_CCmode:
9588 switch (comp_code)
9589 {
9590 case NE: return AARCH64_NE;
9591 case EQ: return AARCH64_EQ;
9592 case GE: return AARCH64_GE;
9593 case GT: return AARCH64_GT;
9594 case LE: return AARCH64_LE;
9595 case LT: return AARCH64_LT;
9596 case GEU: return AARCH64_CS;
9597 case GTU: return AARCH64_HI;
9598 case LEU: return AARCH64_LS;
9599 case LTU: return AARCH64_CC;
9600 default: return -1;
9601 }
9602 break;
9603
9604 case E_CC_SWPmode:
9605 switch (comp_code)
9606 {
9607 case NE: return AARCH64_NE;
9608 case EQ: return AARCH64_EQ;
9609 case GE: return AARCH64_LE;
9610 case GT: return AARCH64_LT;
9611 case LE: return AARCH64_GE;
9612 case LT: return AARCH64_GT;
9613 case GEU: return AARCH64_LS;
9614 case GTU: return AARCH64_CC;
9615 case LEU: return AARCH64_CS;
9616 case LTU: return AARCH64_HI;
9617 default: return -1;
9618 }
9619 break;
9620
9621 case E_CC_NZCmode:
9622 switch (comp_code)
9623 {
9624 case NE: return AARCH64_NE; /* = any */
9625 case EQ: return AARCH64_EQ; /* = none */
9626 case GE: return AARCH64_PL; /* = nfrst */
9627 case LT: return AARCH64_MI; /* = first */
9628 case GEU: return AARCH64_CS; /* = nlast */
9629 case GTU: return AARCH64_HI; /* = pmore */
9630 case LEU: return AARCH64_LS; /* = plast */
9631 case LTU: return AARCH64_CC; /* = last */
9632 default: return -1;
9633 }
9634 break;
9635
9636 case E_CC_NZmode:
9637 switch (comp_code)
9638 {
9639 case NE: return AARCH64_NE;
9640 case EQ: return AARCH64_EQ;
9641 case GE: return AARCH64_PL;
9642 case LT: return AARCH64_MI;
9643 default: return -1;
9644 }
9645 break;
9646
9647 case E_CC_Zmode:
9648 switch (comp_code)
9649 {
9650 case NE: return AARCH64_NE;
9651 case EQ: return AARCH64_EQ;
9652 default: return -1;
9653 }
9654 break;
9655
9656 case E_CC_Cmode:
9657 switch (comp_code)
9658 {
9659 case LTU: return AARCH64_CS;
9660 case GEU: return AARCH64_CC;
9661 default: return -1;
9662 }
9663 break;
9664
9665 case E_CC_ADCmode:
9666 switch (comp_code)
9667 {
9668 case GEU: return AARCH64_CS;
9669 case LTU: return AARCH64_CC;
9670 default: return -1;
9671 }
9672 break;
9673
9674 case E_CC_Vmode:
9675 switch (comp_code)
9676 {
9677 case NE: return AARCH64_VS;
9678 case EQ: return AARCH64_VC;
9679 default: return -1;
9680 }
9681 break;
9682
9683 default:
9684 return -1;
9685 }
9686
9687 return -1;
9688 }
9689
9690 bool
9691 aarch64_const_vec_all_same_in_range_p (rtx x,
9692 HOST_WIDE_INT minval,
9693 HOST_WIDE_INT maxval)
9694 {
9695 rtx elt;
9696 return (const_vec_duplicate_p (x, &elt)
9697 && CONST_INT_P (elt)
9698 && IN_RANGE (INTVAL (elt), minval, maxval));
9699 }
9700
9701 bool
9702 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
9703 {
9704 return aarch64_const_vec_all_same_in_range_p (x, val, val);
9705 }
9706
9707 /* Return true if VEC is a constant in which every element is in the range
9708 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9709
9710 static bool
9711 aarch64_const_vec_all_in_range_p (rtx vec,
9712 HOST_WIDE_INT minval,
9713 HOST_WIDE_INT maxval)
9714 {
9715 if (GET_CODE (vec) != CONST_VECTOR
9716 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
9717 return false;
9718
9719 int nunits;
9720 if (!CONST_VECTOR_STEPPED_P (vec))
9721 nunits = const_vector_encoded_nelts (vec);
9722 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
9723 return false;
9724
9725 for (int i = 0; i < nunits; i++)
9726 {
9727 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
9728 if (!CONST_INT_P (vec_elem)
9729 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
9730 return false;
9731 }
9732 return true;
9733 }
9734
9735 /* N Z C V. */
9736 #define AARCH64_CC_V 1
9737 #define AARCH64_CC_C (1 << 1)
9738 #define AARCH64_CC_Z (1 << 2)
9739 #define AARCH64_CC_N (1 << 3)
9740
9741 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
9742 static const int aarch64_nzcv_codes[] =
9743 {
9744 0, /* EQ, Z == 1. */
9745 AARCH64_CC_Z, /* NE, Z == 0. */
9746 0, /* CS, C == 1. */
9747 AARCH64_CC_C, /* CC, C == 0. */
9748 0, /* MI, N == 1. */
9749 AARCH64_CC_N, /* PL, N == 0. */
9750 0, /* VS, V == 1. */
9751 AARCH64_CC_V, /* VC, V == 0. */
9752 0, /* HI, C == 1 && Z == 0. */
9753 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
9754 AARCH64_CC_V, /* GE, N == V. */
9755 0, /* LT, N != V. */
9756 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
9757 0, /* LE, !(Z == 0 && N == V). */
9758 0, /* AL, Any. */
9759 0 /* NV, Any. */
9760 };
9761
9762 /* Print floating-point vector immediate operand X to F, negating it
9763 first if NEGATE is true. Return true on success, false if it isn't
9764 a constant we can handle. */
9765
9766 static bool
9767 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
9768 {
9769 rtx elt;
9770
9771 if (!const_vec_duplicate_p (x, &elt))
9772 return false;
9773
9774 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
9775 if (negate)
9776 r = real_value_negate (&r);
9777
9778 /* Handle the SVE single-bit immediates specially, since they have a
9779 fixed form in the assembly syntax. */
9780 if (real_equal (&r, &dconst0))
9781 asm_fprintf (f, "0.0");
9782 else if (real_equal (&r, &dconst2))
9783 asm_fprintf (f, "2.0");
9784 else if (real_equal (&r, &dconst1))
9785 asm_fprintf (f, "1.0");
9786 else if (real_equal (&r, &dconsthalf))
9787 asm_fprintf (f, "0.5");
9788 else
9789 {
9790 const int buf_size = 20;
9791 char float_buf[buf_size] = {'\0'};
9792 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
9793 1, GET_MODE (elt));
9794 asm_fprintf (f, "%s", float_buf);
9795 }
9796
9797 return true;
9798 }
9799
9800 /* Return the equivalent letter for size. */
9801 static char
9802 sizetochar (int size)
9803 {
9804 switch (size)
9805 {
9806 case 64: return 'd';
9807 case 32: return 's';
9808 case 16: return 'h';
9809 case 8 : return 'b';
9810 default: gcc_unreachable ();
9811 }
9812 }
9813
9814 /* Print operand X to file F in a target specific manner according to CODE.
9815 The acceptable formatting commands given by CODE are:
9816 'c': An integer or symbol address without a preceding #
9817 sign.
9818 'C': Take the duplicated element in a vector constant
9819 and print it in hex.
9820 'D': Take the duplicated element in a vector constant
9821 and print it as an unsigned integer, in decimal.
9822 'e': Print the sign/zero-extend size as a character 8->b,
9823 16->h, 32->w. Can also be used for masks:
9824 0xff->b, 0xffff->h, 0xffffffff->w.
9825 'I': If the operand is a duplicated vector constant,
9826 replace it with the duplicated scalar. If the
9827 operand is then a floating-point constant, replace
9828 it with the integer bit representation. Print the
9829 transformed constant as a signed decimal number.
9830 'p': Prints N such that 2^N == X (X must be a power of 2 and
9831 a const_int).
9832 'P': Print the number of non-zero bits in X (a const_int).
9833 'H': Print the higher numbered register of a pair (TImode)
9834 of regs.
9835 'm': Print a condition (eq, ne, etc).
9836 'M': Same as 'm', but invert condition.
9837 'N': Take the duplicated element in a vector constant
9838 and print the negative of it in decimal.
9839 'b/h/s/d/q': Print a scalar FP/SIMD register name.
9840 'S/T/U/V': Print a FP/SIMD register name for a register list.
9841 The register printed is the FP/SIMD register name
9842 of X + 0/1/2/3 for S/T/U/V.
9843 'R': Print a scalar Integer/FP/SIMD register name + 1.
9844 'X': Print bottom 16 bits of integer constant in hex.
9845 'w/x': Print a general register name or the zero register
9846 (32-bit or 64-bit).
9847 '0': Print a normal operand, if it's a general register,
9848 then we assume DImode.
9849 'k': Print NZCV for conditional compare instructions.
9850 'A': Output address constant representing the first
9851 argument of X, specifying a relocation offset
9852 if appropriate.
9853 'L': Output constant address specified by X
9854 with a relocation offset if appropriate.
9855 'G': Prints address of X, specifying a PC relative
9856 relocation mode if appropriate.
9857 'y': Output address of LDP or STP - this is used for
9858 some LDP/STPs which don't use a PARALLEL in their
9859 pattern (so the mode needs to be adjusted).
9860 'z': Output address of a typical LDP or STP. */
9861
9862 static void
9863 aarch64_print_operand (FILE *f, rtx x, int code)
9864 {
9865 rtx elt;
9866 switch (code)
9867 {
9868 case 'c':
9869 switch (GET_CODE (x))
9870 {
9871 case CONST_INT:
9872 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
9873 break;
9874
9875 case SYMBOL_REF:
9876 output_addr_const (f, x);
9877 break;
9878
9879 case CONST:
9880 if (GET_CODE (XEXP (x, 0)) == PLUS
9881 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
9882 {
9883 output_addr_const (f, x);
9884 break;
9885 }
9886 /* Fall through. */
9887
9888 default:
9889 output_operand_lossage ("unsupported operand for code '%c'", code);
9890 }
9891 break;
9892
9893 case 'e':
9894 {
9895 x = unwrap_const_vec_duplicate (x);
9896 if (!CONST_INT_P (x))
9897 {
9898 output_operand_lossage ("invalid operand for '%%%c'", code);
9899 return;
9900 }
9901
9902 HOST_WIDE_INT val = INTVAL (x);
9903 if ((val & ~7) == 8 || val == 0xff)
9904 fputc ('b', f);
9905 else if ((val & ~7) == 16 || val == 0xffff)
9906 fputc ('h', f);
9907 else if ((val & ~7) == 32 || val == 0xffffffff)
9908 fputc ('w', f);
9909 else
9910 {
9911 output_operand_lossage ("invalid operand for '%%%c'", code);
9912 return;
9913 }
9914 }
9915 break;
9916
9917 case 'p':
9918 {
9919 int n;
9920
9921 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
9922 {
9923 output_operand_lossage ("invalid operand for '%%%c'", code);
9924 return;
9925 }
9926
9927 asm_fprintf (f, "%d", n);
9928 }
9929 break;
9930
9931 case 'P':
9932 if (!CONST_INT_P (x))
9933 {
9934 output_operand_lossage ("invalid operand for '%%%c'", code);
9935 return;
9936 }
9937
9938 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
9939 break;
9940
9941 case 'H':
9942 if (x == const0_rtx)
9943 {
9944 asm_fprintf (f, "xzr");
9945 break;
9946 }
9947
9948 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
9949 {
9950 output_operand_lossage ("invalid operand for '%%%c'", code);
9951 return;
9952 }
9953
9954 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
9955 break;
9956
9957 case 'I':
9958 {
9959 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
9960 if (CONST_INT_P (x))
9961 asm_fprintf (f, "%wd", INTVAL (x));
9962 else
9963 {
9964 output_operand_lossage ("invalid operand for '%%%c'", code);
9965 return;
9966 }
9967 break;
9968 }
9969
9970 case 'M':
9971 case 'm':
9972 {
9973 int cond_code;
9974 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
9975 if (x == const_true_rtx)
9976 {
9977 if (code == 'M')
9978 fputs ("nv", f);
9979 return;
9980 }
9981
9982 if (!COMPARISON_P (x))
9983 {
9984 output_operand_lossage ("invalid operand for '%%%c'", code);
9985 return;
9986 }
9987
9988 cond_code = aarch64_get_condition_code (x);
9989 gcc_assert (cond_code >= 0);
9990 if (code == 'M')
9991 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
9992 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
9993 fputs (aarch64_sve_condition_codes[cond_code], f);
9994 else
9995 fputs (aarch64_condition_codes[cond_code], f);
9996 }
9997 break;
9998
9999 case 'N':
10000 if (!const_vec_duplicate_p (x, &elt))
10001 {
10002 output_operand_lossage ("invalid vector constant");
10003 return;
10004 }
10005
10006 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10007 asm_fprintf (f, "%wd", -INTVAL (elt));
10008 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10009 && aarch64_print_vector_float_operand (f, x, true))
10010 ;
10011 else
10012 {
10013 output_operand_lossage ("invalid vector constant");
10014 return;
10015 }
10016 break;
10017
10018 case 'b':
10019 case 'h':
10020 case 's':
10021 case 'd':
10022 case 'q':
10023 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10024 {
10025 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10026 return;
10027 }
10028 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
10029 break;
10030
10031 case 'S':
10032 case 'T':
10033 case 'U':
10034 case 'V':
10035 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10036 {
10037 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10038 return;
10039 }
10040 asm_fprintf (f, "%c%d",
10041 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
10042 REGNO (x) - V0_REGNUM + (code - 'S'));
10043 break;
10044
10045 case 'R':
10046 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
10047 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
10048 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10049 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
10050 else
10051 output_operand_lossage ("incompatible register operand for '%%%c'",
10052 code);
10053 break;
10054
10055 case 'X':
10056 if (!CONST_INT_P (x))
10057 {
10058 output_operand_lossage ("invalid operand for '%%%c'", code);
10059 return;
10060 }
10061 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
10062 break;
10063
10064 case 'C':
10065 {
10066 /* Print a replicated constant in hex. */
10067 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10068 {
10069 output_operand_lossage ("invalid operand for '%%%c'", code);
10070 return;
10071 }
10072 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10073 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10074 }
10075 break;
10076
10077 case 'D':
10078 {
10079 /* Print a replicated constant in decimal, treating it as
10080 unsigned. */
10081 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10082 {
10083 output_operand_lossage ("invalid operand for '%%%c'", code);
10084 return;
10085 }
10086 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10087 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10088 }
10089 break;
10090
10091 case 'w':
10092 case 'x':
10093 if (x == const0_rtx
10094 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
10095 {
10096 asm_fprintf (f, "%czr", code);
10097 break;
10098 }
10099
10100 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10101 {
10102 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
10103 break;
10104 }
10105
10106 if (REG_P (x) && REGNO (x) == SP_REGNUM)
10107 {
10108 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
10109 break;
10110 }
10111
10112 /* Fall through */
10113
10114 case 0:
10115 if (x == NULL)
10116 {
10117 output_operand_lossage ("missing operand");
10118 return;
10119 }
10120
10121 switch (GET_CODE (x))
10122 {
10123 case REG:
10124 if (aarch64_sve_data_mode_p (GET_MODE (x)))
10125 {
10126 if (REG_NREGS (x) == 1)
10127 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
10128 else
10129 {
10130 char suffix
10131 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
10132 asm_fprintf (f, "{z%d.%c - z%d.%c}",
10133 REGNO (x) - V0_REGNUM, suffix,
10134 END_REGNO (x) - V0_REGNUM - 1, suffix);
10135 }
10136 }
10137 else
10138 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
10139 break;
10140
10141 case MEM:
10142 output_address (GET_MODE (x), XEXP (x, 0));
10143 break;
10144
10145 case LABEL_REF:
10146 case SYMBOL_REF:
10147 output_addr_const (asm_out_file, x);
10148 break;
10149
10150 case CONST_INT:
10151 asm_fprintf (f, "%wd", INTVAL (x));
10152 break;
10153
10154 case CONST:
10155 if (!VECTOR_MODE_P (GET_MODE (x)))
10156 {
10157 output_addr_const (asm_out_file, x);
10158 break;
10159 }
10160 /* fall through */
10161
10162 case CONST_VECTOR:
10163 if (!const_vec_duplicate_p (x, &elt))
10164 {
10165 output_operand_lossage ("invalid vector constant");
10166 return;
10167 }
10168
10169 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10170 asm_fprintf (f, "%wd", INTVAL (elt));
10171 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10172 && aarch64_print_vector_float_operand (f, x, false))
10173 ;
10174 else
10175 {
10176 output_operand_lossage ("invalid vector constant");
10177 return;
10178 }
10179 break;
10180
10181 case CONST_DOUBLE:
10182 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
10183 be getting CONST_DOUBLEs holding integers. */
10184 gcc_assert (GET_MODE (x) != VOIDmode);
10185 if (aarch64_float_const_zero_rtx_p (x))
10186 {
10187 fputc ('0', f);
10188 break;
10189 }
10190 else if (aarch64_float_const_representable_p (x))
10191 {
10192 #define buf_size 20
10193 char float_buf[buf_size] = {'\0'};
10194 real_to_decimal_for_mode (float_buf,
10195 CONST_DOUBLE_REAL_VALUE (x),
10196 buf_size, buf_size,
10197 1, GET_MODE (x));
10198 asm_fprintf (asm_out_file, "%s", float_buf);
10199 break;
10200 #undef buf_size
10201 }
10202 output_operand_lossage ("invalid constant");
10203 return;
10204 default:
10205 output_operand_lossage ("invalid operand");
10206 return;
10207 }
10208 break;
10209
10210 case 'A':
10211 if (GET_CODE (x) == HIGH)
10212 x = XEXP (x, 0);
10213
10214 switch (aarch64_classify_symbolic_expression (x))
10215 {
10216 case SYMBOL_SMALL_GOT_4G:
10217 asm_fprintf (asm_out_file, ":got:");
10218 break;
10219
10220 case SYMBOL_SMALL_TLSGD:
10221 asm_fprintf (asm_out_file, ":tlsgd:");
10222 break;
10223
10224 case SYMBOL_SMALL_TLSDESC:
10225 asm_fprintf (asm_out_file, ":tlsdesc:");
10226 break;
10227
10228 case SYMBOL_SMALL_TLSIE:
10229 asm_fprintf (asm_out_file, ":gottprel:");
10230 break;
10231
10232 case SYMBOL_TLSLE24:
10233 asm_fprintf (asm_out_file, ":tprel:");
10234 break;
10235
10236 case SYMBOL_TINY_GOT:
10237 gcc_unreachable ();
10238 break;
10239
10240 default:
10241 break;
10242 }
10243 output_addr_const (asm_out_file, x);
10244 break;
10245
10246 case 'L':
10247 switch (aarch64_classify_symbolic_expression (x))
10248 {
10249 case SYMBOL_SMALL_GOT_4G:
10250 asm_fprintf (asm_out_file, ":lo12:");
10251 break;
10252
10253 case SYMBOL_SMALL_TLSGD:
10254 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
10255 break;
10256
10257 case SYMBOL_SMALL_TLSDESC:
10258 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
10259 break;
10260
10261 case SYMBOL_SMALL_TLSIE:
10262 asm_fprintf (asm_out_file, ":gottprel_lo12:");
10263 break;
10264
10265 case SYMBOL_TLSLE12:
10266 asm_fprintf (asm_out_file, ":tprel_lo12:");
10267 break;
10268
10269 case SYMBOL_TLSLE24:
10270 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
10271 break;
10272
10273 case SYMBOL_TINY_GOT:
10274 asm_fprintf (asm_out_file, ":got:");
10275 break;
10276
10277 case SYMBOL_TINY_TLSIE:
10278 asm_fprintf (asm_out_file, ":gottprel:");
10279 break;
10280
10281 default:
10282 break;
10283 }
10284 output_addr_const (asm_out_file, x);
10285 break;
10286
10287 case 'G':
10288 switch (aarch64_classify_symbolic_expression (x))
10289 {
10290 case SYMBOL_TLSLE24:
10291 asm_fprintf (asm_out_file, ":tprel_hi12:");
10292 break;
10293 default:
10294 break;
10295 }
10296 output_addr_const (asm_out_file, x);
10297 break;
10298
10299 case 'k':
10300 {
10301 HOST_WIDE_INT cond_code;
10302
10303 if (!CONST_INT_P (x))
10304 {
10305 output_operand_lossage ("invalid operand for '%%%c'", code);
10306 return;
10307 }
10308
10309 cond_code = INTVAL (x);
10310 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
10311 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
10312 }
10313 break;
10314
10315 case 'y':
10316 case 'z':
10317 {
10318 machine_mode mode = GET_MODE (x);
10319
10320 if (GET_CODE (x) != MEM
10321 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
10322 {
10323 output_operand_lossage ("invalid operand for '%%%c'", code);
10324 return;
10325 }
10326
10327 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
10328 code == 'y'
10329 ? ADDR_QUERY_LDP_STP_N
10330 : ADDR_QUERY_LDP_STP))
10331 output_operand_lossage ("invalid operand prefix '%%%c'", code);
10332 }
10333 break;
10334
10335 default:
10336 output_operand_lossage ("invalid operand prefix '%%%c'", code);
10337 return;
10338 }
10339 }
10340
10341 /* Print address 'x' of a memory access with mode 'mode'.
10342 'type' gives the context required by aarch64_classify_address: ADDR_QUERY_M
10343 for a normal memory access, or an ADDR_QUERY_LDP_STP variant for LDP/STP. */
10344 static bool
10345 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
10346 aarch64_addr_query_type type)
10347 {
10348 struct aarch64_address_info addr;
10349 unsigned int size, vec_flags;
10350
10351 /* Check all addresses are Pmode - including ILP32. */
10352 if (GET_MODE (x) != Pmode
10353 && (!CONST_INT_P (x)
10354 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
10355 {
10356 output_operand_lossage ("invalid address mode");
10357 return false;
10358 }
10359
10360 if (aarch64_classify_address (&addr, x, mode, true, type))
10361 switch (addr.type)
10362 {
10363 case ADDRESS_REG_IMM:
10364 if (known_eq (addr.const_offset, 0))
10365 {
10366 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
10367 return true;
10368 }
10369
10370 vec_flags = aarch64_classify_vector_mode (mode);
10371 if (vec_flags & VEC_ANY_SVE)
10372 {
10373 HOST_WIDE_INT vnum
10374 = exact_div (addr.const_offset,
10375 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
10376 asm_fprintf (f, "[%s, #%wd, mul vl]",
10377 reg_names[REGNO (addr.base)], vnum);
10378 return true;
10379 }
10380
10381 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
10382 INTVAL (addr.offset));
10383 return true;
10384
10385 case ADDRESS_REG_REG:
10386 if (addr.shift == 0)
10387 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
10388 reg_names [REGNO (addr.offset)]);
10389 else
10390 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
10391 reg_names [REGNO (addr.offset)], addr.shift);
10392 return true;
10393
10394 case ADDRESS_REG_UXTW:
10395 if (addr.shift == 0)
10396 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
10397 REGNO (addr.offset) - R0_REGNUM);
10398 else
10399 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
10400 REGNO (addr.offset) - R0_REGNUM, addr.shift);
10401 return true;
10402
10403 case ADDRESS_REG_SXTW:
10404 if (addr.shift == 0)
10405 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
10406 REGNO (addr.offset) - R0_REGNUM);
10407 else
10408 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
10409 REGNO (addr.offset) - R0_REGNUM, addr.shift);
10410 return true;
10411
10412 case ADDRESS_REG_WB:
10413 /* Writeback is only supported for fixed-width modes. */
10414 size = GET_MODE_SIZE (mode).to_constant ();
10415 switch (GET_CODE (x))
10416 {
10417 case PRE_INC:
10418 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
10419 return true;
10420 case POST_INC:
10421 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
10422 return true;
10423 case PRE_DEC:
10424 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
10425 return true;
10426 case POST_DEC:
10427 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
10428 return true;
10429 case PRE_MODIFY:
10430 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
10431 INTVAL (addr.offset));
10432 return true;
10433 case POST_MODIFY:
10434 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
10435 INTVAL (addr.offset));
10436 return true;
10437 default:
10438 break;
10439 }
10440 break;
10441
10442 case ADDRESS_LO_SUM:
10443 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
10444 output_addr_const (f, addr.offset);
10445 asm_fprintf (f, "]");
10446 return true;
10447
10448 case ADDRESS_SYMBOLIC:
10449 output_addr_const (f, x);
10450 return true;
10451 }
10452
10453 return false;
10454 }
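/* For illustration, the address syntaxes printed above are, by address kind
   (register names and offsets are only examples):
     ADDRESS_REG_IMM      [x0]            [x0, 16]         [x0, #2, mul vl] (SVE)
     ADDRESS_REG_REG      [x0, x1]        [x0, x1, lsl 3]
     ADDRESS_REG_UXTW     [x0, w1, uxtw]  [x0, w1, uxtw 2]
     ADDRESS_REG_SXTW     [x0, w1, sxtw]  [x0, w1, sxtw 2]
     ADDRESS_REG_WB       [x0, 16]!       [x0], 16         (and -16 variants)
     ADDRESS_LO_SUM       [x0, #:lo12:sym]
     ADDRESS_SYMBOLIC     sym  */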
10455
10456 /* Print address 'x' of a memory access with mode 'mode'. */
10457 static void
10458 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
10459 {
10460 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
10461 output_addr_const (f, x);
10462 }
10463
10464 bool
10465 aarch64_label_mentioned_p (rtx x)
10466 {
10467 const char *fmt;
10468 int i;
10469
10470 if (GET_CODE (x) == LABEL_REF)
10471 return true;
10472
10473 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
10474 referencing instruction, but they are constant offsets, not
10475 symbols. */
10476 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10477 return false;
10478
10479 fmt = GET_RTX_FORMAT (GET_CODE (x));
10480 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
10481 {
10482 if (fmt[i] == 'E')
10483 {
10484 int j;
10485
10486 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
10487 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
10488 return 1;
10489 }
10490 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
10491 return 1;
10492 }
10493
10494 return 0;
10495 }
10496
10497 /* Implement REGNO_REG_CLASS. */
10498
10499 enum reg_class
10500 aarch64_regno_regclass (unsigned regno)
10501 {
10502 if (GP_REGNUM_P (regno))
10503 return GENERAL_REGS;
10504
10505 if (regno == SP_REGNUM)
10506 return STACK_REG;
10507
10508 if (regno == FRAME_POINTER_REGNUM
10509 || regno == ARG_POINTER_REGNUM)
10510 return POINTER_REGS;
10511
10512 if (FP_REGNUM_P (regno))
10513 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10514 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
10515
10516 if (PR_REGNUM_P (regno))
10517 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10518
10519 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10520 return FFR_REGS;
10521
10522 return NO_REGS;
10523 }
10524
10525 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10526 If OFFSET is out of range, return an offset of an anchor point
10527 that is in range. Return 0 otherwise. */
10528
10529 static HOST_WIDE_INT
10530 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10531 machine_mode mode)
10532 {
10533 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10534 if (size > 16)
10535 return (offset + 0x400) & ~0x7f0;
10536
10537 /* For offsets that aren't a multiple of the access size, the limit is
10538 -256...255. */
10539 if (offset & (size - 1))
10540 {
10541 /* BLKmode typically uses LDP of X-registers. */
10542 if (mode == BLKmode)
10543 return (offset + 512) & ~0x3ff;
10544 return (offset + 0x100) & ~0x1ff;
10545 }
10546
10547 /* Small negative offsets are supported. */
10548 if (IN_RANGE (offset, -256, 0))
10549 return 0;
10550
10551 if (mode == TImode || mode == TFmode)
10552 return (offset + 0x100) & ~0x1ff;
10553
10554 /* Otherwise, use the unsigned 12-bit offset scaled by the access size. */
10555 return offset & (~0xfff * size);
10556 }
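/* Worked examples of the anchoring rules above (illustrative values):
   aarch64_anchor_offset (0x12340, 8, DImode) returns 0x10000: the offset is
   a multiple of the access size but out of range, so it is rounded down to
   a multiple of 0x1000 * 8, leaving 0x2340, which fits the scaled unsigned
   12-bit LDR/STR offset.  aarch64_anchor_offset (-16, 8, DImode) returns 0,
   since small negative offsets are supported directly. */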
10557
10558 static rtx
10559 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
10560 {
10561 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10562 where mask is selected by alignment and size of the offset.
10563 We try to pick as large a range for the offset as possible to
10564 maximize the chance of a CSE. However, for aligned addresses
10565 we limit the range to 4k so that structures with different sized
10566 elements are likely to use the same base. We need to be careful
10567 not to split a CONST for some forms of address expression, otherwise
10568 it will generate sub-optimal code. */
10569
10570 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10571 {
10572 rtx base = XEXP (x, 0);
10573 rtx offset_rtx = XEXP (x, 1);
10574 HOST_WIDE_INT offset = INTVAL (offset_rtx);
10575
10576 if (GET_CODE (base) == PLUS)
10577 {
10578 rtx op0 = XEXP (base, 0);
10579 rtx op1 = XEXP (base, 1);
10580
10581 /* Force any scaling into a temp for CSE. */
10582 op0 = force_reg (Pmode, op0);
10583 op1 = force_reg (Pmode, op1);
10584
10585 /* Let the pointer register be in op0. */
10586 if (REG_POINTER (op1))
10587 std::swap (op0, op1);
10588
10589 /* If the pointer is virtual or frame related, then we know that
10590 virtual register instantiation or register elimination is going
10591 to apply a second constant. We want the two constants folded
10592 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10593 if (virt_or_elim_regno_p (REGNO (op0)))
10594 {
10595 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10596 NULL_RTX, true, OPTAB_DIRECT);
10597 return gen_rtx_PLUS (Pmode, base, op1);
10598 }
10599
10600 /* Otherwise, in order to encourage CSE (and thence loop strength
10601 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
10602 base = expand_binop (Pmode, add_optab, op0, op1,
10603 NULL_RTX, true, OPTAB_DIRECT);
10604 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
10605 }
10606
10607 HOST_WIDE_INT size;
10608 if (GET_MODE_SIZE (mode).is_constant (&size))
10609 {
10610 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10611 mode);
10612 if (base_offset != 0)
10613 {
10614 base = plus_constant (Pmode, base, base_offset);
10615 base = force_operand (base, NULL_RTX);
10616 return plus_constant (Pmode, base, offset - base_offset);
10617 }
10618 }
10619 }
10620
10621 return x;
10622 }
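/* For example (illustrative values), legitimizing X + 0x4004 for a DImode
   access: the offset is not 8-byte aligned, so aarch64_anchor_offset returns
   (0x4004 + 0x100) & ~0x1ff == 0x4000 and the address is rewritten as
   TMP + 4 with TMP = X + 0x4000, allowing the large part of the constant to
   be CSEd between neighbouring accesses. */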
10623
10624 static reg_class_t
10625 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10626 reg_class_t rclass,
10627 machine_mode mode,
10628 secondary_reload_info *sri)
10629 {
10630 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10631 LDR and STR. See the comment at the head of aarch64-sve.md for
10632 more details about the big-endian handling. */
10633 if (reg_class_subset_p (rclass, FP_REGS)
10634 && !((REG_P (x) && HARD_REGISTER_P (x))
10635 || aarch64_simd_valid_immediate (x, NULL))
10636 && mode != VNx16QImode)
10637 {
10638 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10639 if ((vec_flags & VEC_SVE_DATA)
10640 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10641 {
10642 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10643 return NO_REGS;
10644 }
10645 }
10646
10647 /* If we have to disable direct literal pool loads and stores because the
10648 function is too big, then we need a scratch register. */
10649 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
10650 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10651 || targetm.vector_mode_supported_p (GET_MODE (x)))
10652 && !aarch64_pcrelative_literal_loads)
10653 {
10654 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
10655 return NO_REGS;
10656 }
10657
10658 /* Without the TARGET_SIMD instructions we cannot move a Q register
10659 to a Q register directly. We need a scratch. */
10660 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10661 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10662 && reg_class_subset_p (rclass, FP_REGS))
10663 {
10664 sri->icode = code_for_aarch64_reload_mov (mode);
10665 return NO_REGS;
10666 }
10667
10668 /* A TFmode or TImode memory access should be handled via FP_REGS
10669 because AArch64 has richer addressing modes for LDR/STR instructions
10670 than LDP/STP instructions. */
10671 if (TARGET_FLOAT && rclass == GENERAL_REGS
10672 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
10673 return FP_REGS;
10674
10675 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
10676 return GENERAL_REGS;
10677
10678 return NO_REGS;
10679 }
10680
10681 static bool
10682 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10683 {
10684 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10685
10686 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10687 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10688 if (frame_pointer_needed)
10689 return to == HARD_FRAME_POINTER_REGNUM;
10690 return true;
10691 }
10692
10693 poly_int64
10694 aarch64_initial_elimination_offset (unsigned from, unsigned to)
10695 {
10696 if (to == HARD_FRAME_POINTER_REGNUM)
10697 {
10698 if (from == ARG_POINTER_REGNUM)
10699 return cfun->machine->frame.hard_fp_offset;
10700
10701 if (from == FRAME_POINTER_REGNUM)
10702 return cfun->machine->frame.hard_fp_offset
10703 - cfun->machine->frame.locals_offset;
10704 }
10705
10706 if (to == STACK_POINTER_REGNUM)
10707 {
10708 if (from == FRAME_POINTER_REGNUM)
10709 return cfun->machine->frame.frame_size
10710 - cfun->machine->frame.locals_offset;
10711 }
10712
10713 return cfun->machine->frame.frame_size;
10714 }
10715
10716 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
10717 previous frame. */
10718
10719 rtx
10720 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
10721 {
10722 if (count != 0)
10723 return const0_rtx;
10724 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
10725 }
10726
10727
10728 static void
10729 aarch64_asm_trampoline_template (FILE *f)
10730 {
10731 int offset1 = 16;
10732 int offset2 = 20;
10733
10734 if (aarch64_bti_enabled ())
10735 {
10736 asm_fprintf (f, "\thint\t34 // bti c\n");
10737 offset1 -= 4;
10738 offset2 -= 4;
10739 }
10740
10741 if (TARGET_ILP32)
10742 {
10743 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
10744 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
10745 offset1);
10746 }
10747 else
10748 {
10749 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
10750 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
10751 offset2);
10752 }
10753 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
10754
10755 /* The trampoline needs an extra padding instruction. If BTI is
10756 enabled, the padding instruction is replaced by the BTI instruction
10757 at the beginning. */
10758 if (!aarch64_bti_enabled ())
10759 assemble_aligned_integer (4, const0_rtx);
10760
10761 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10762 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10763 }
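/* For the LP64 ABI without BTI, and assuming the port's usual register
   assignments (IP1_REGNUM is x17 and STATIC_CHAIN_REGNUM is x18), the
   template emitted above has the form:

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// padding before the trailing data
	.xword	0		// overwritten with the function address
	.xword	0		// overwritten with the static chain

   aarch64_trampoline_init below fills in the two trailing pointers. */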
10764
10765 static void
10766 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
10767 {
10768 rtx fnaddr, mem, a_tramp;
10769 const int tramp_code_sz = 16;
10770
10771 /* Don't need to copy the trailing D-words, we fill those in below. */
10772 emit_block_move (m_tramp, assemble_trampoline_template (),
10773 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
10774 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
10775 fnaddr = XEXP (DECL_RTL (fndecl), 0);
10776 if (GET_MODE (fnaddr) != ptr_mode)
10777 fnaddr = convert_memory_address (ptr_mode, fnaddr);
10778 emit_move_insn (mem, fnaddr);
10779
10780 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
10781 emit_move_insn (mem, chain_value);
10782
10783 /* XXX We should really define a "clear_cache" pattern and use
10784 gen_clear_cache(). */
10785 a_tramp = XEXP (m_tramp, 0);
10786 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
10787 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
10788 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
10789 ptr_mode);
10790 }
10791
10792 static unsigned char
10793 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
10794 {
10795 /* ??? Logically we should only need to provide a value when
10796 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10797 can hold MODE, but at the moment we need to handle all modes.
10798 Just ignore any runtime parts for registers that can't store them. */
10799 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
10800 unsigned int nregs, vec_flags;
10801 switch (regclass)
10802 {
10803 case TAILCALL_ADDR_REGS:
10804 case POINTER_REGS:
10805 case GENERAL_REGS:
10806 case ALL_REGS:
10807 case POINTER_AND_FP_REGS:
10808 case FP_REGS:
10809 case FP_LO_REGS:
10810 case FP_LO8_REGS:
10811 vec_flags = aarch64_classify_vector_mode (mode);
10812 if ((vec_flags & VEC_SVE_DATA)
10813 && constant_multiple_p (GET_MODE_SIZE (mode),
10814 aarch64_vl_bytes (mode, vec_flags), &nregs))
10815 return nregs;
10816 return (vec_flags & VEC_ADVSIMD
10817 ? CEIL (lowest_size, UNITS_PER_VREG)
10818 : CEIL (lowest_size, UNITS_PER_WORD));
10819 case STACK_REG:
10820 case PR_REGS:
10821 case PR_LO_REGS:
10822 case PR_HI_REGS:
10823 case FFR_REGS:
10824 case PR_AND_FFR_REGS:
10825 return 1;
10826
10827 case NO_REGS:
10828 return 0;
10829
10830 default:
10831 break;
10832 }
10833 gcc_unreachable ();
10834 }
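/* For example, aarch64_class_max_nregs (FP_REGS, V4SImode) is 1, since one
   Advanced SIMD register holds 16 bytes, while
   aarch64_class_max_nregs (GENERAL_REGS, TImode) is 2, since a 16-byte
   value needs a pair of X registers. */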
10835
10836 static reg_class_t
10837 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
10838 {
10839 if (regclass == POINTER_REGS)
10840 return GENERAL_REGS;
10841
10842 if (regclass == STACK_REG)
10843 {
10844 if (REG_P(x)
10845 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
10846 return regclass;
10847
10848 return NO_REGS;
10849 }
10850
10851 /* Register elimination can result in a request for
10852 SP+constant->FP_REGS. We cannot support such operations, which
10853 use SP as the source and an FP_REG as the destination, so reject
10854 them outright. */
10855 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
10856 {
10857 rtx lhs = XEXP (x, 0);
10858
10859 /* Look through a possible SUBREG introduced by ILP32. */
10860 if (GET_CODE (lhs) == SUBREG)
10861 lhs = SUBREG_REG (lhs);
10862
10863 gcc_assert (REG_P (lhs));
10864 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
10865 POINTER_REGS));
10866 return NO_REGS;
10867 }
10868
10869 return regclass;
10870 }
10871
10872 void
10873 aarch64_asm_output_labelref (FILE* f, const char *name)
10874 {
10875 asm_fprintf (f, "%U%s", name);
10876 }
10877
10878 static void
10879 aarch64_elf_asm_constructor (rtx symbol, int priority)
10880 {
10881 if (priority == DEFAULT_INIT_PRIORITY)
10882 default_ctor_section_asm_out_constructor (symbol, priority);
10883 else
10884 {
10885 section *s;
10886 /* While the priority is known to be in the range [0, 65535], so that
10887 18 bytes would be enough, the compiler might not know that. To avoid
10888 a -Wformat-truncation false positive, use a larger size. */
10889 char buf[23];
10890 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
10891 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10892 switch_to_section (s);
10893 assemble_align (POINTER_SIZE);
10894 assemble_aligned_integer (POINTER_BYTES, symbol);
10895 }
10896 }
10897
10898 static void
10899 aarch64_elf_asm_destructor (rtx symbol, int priority)
10900 {
10901 if (priority == DEFAULT_INIT_PRIORITY)
10902 default_dtor_section_asm_out_destructor (symbol, priority);
10903 else
10904 {
10905 section *s;
10906 /* While the priority is known to be in the range [0, 65535], so that
10907 18 bytes would be enough, the compiler might not know that. To avoid
10908 a -Wformat-truncation false positive, use a larger size. */
10909 char buf[23];
10910 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
10911 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10912 switch_to_section (s);
10913 assemble_align (POINTER_SIZE);
10914 assemble_aligned_integer (POINTER_BYTES, symbol);
10915 }
10916 }
10917
10918 const char*
10919 aarch64_output_casesi (rtx *operands)
10920 {
10921 char buf[100];
10922 char label[100];
10923 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
10924 int index;
10925 static const char *const patterns[4][2] =
10926 {
10927 {
10928 "ldrb\t%w3, [%0,%w1,uxtw]",
10929 "add\t%3, %4, %w3, sxtb #2"
10930 },
10931 {
10932 "ldrh\t%w3, [%0,%w1,uxtw #1]",
10933 "add\t%3, %4, %w3, sxth #2"
10934 },
10935 {
10936 "ldr\t%w3, [%0,%w1,uxtw #2]",
10937 "add\t%3, %4, %w3, sxtw #2"
10938 },
10939 /* We assume that DImode is only generated when not optimizing and
10940 that we don't really need 64-bit address offsets. That would
10941 imply an object file with 8GB of code in a single function! */
10942 {
10943 "ldr\t%w3, [%0,%w1,uxtw #2]",
10944 "add\t%3, %4, %w3, sxtw #2"
10945 }
10946 };
10947
10948 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
10949
10950 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
10951 index = exact_log2 (GET_MODE_SIZE (mode));
10952
10953 gcc_assert (index >= 0 && index <= 3);
10954
10955 /* Table size reduction still needs to be implemented, by changing the code below. */
10956 output_asm_insn (patterns[index][0], operands);
10957 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
10958 snprintf (buf, sizeof (buf),
10959 "adr\t%%4, %s", targetm.strip_name_encoding (label));
10960 output_asm_insn (buf, operands);
10961 output_asm_insn (patterns[index][1], operands);
10962 output_asm_insn ("br\t%3", operands);
10963 assemble_label (asm_out_file, label);
10964 return "";
10965 }
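/* For example, for a HImode dispatch table the code above emits a sequence
   of the form (register numbers and label name illustrative):

	ldrh	w3, [x0, w1, uxtw #1]	// load the table entry
	adr	x4, .Lrtx7		// base label for the table offsets
	add	x3, x4, w3, sxth #2	// scale the entry and add the base
	br	x3
   .Lrtx7:  */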
10966
10967
10968 /* Return size in bits of an arithmetic operand which is shifted/scaled and
10969 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
10970 operator. */
10971
10972 int
10973 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
10974 {
10975 if (shift >= 0 && shift <= 3)
10976 {
10977 int size;
10978 for (size = 8; size <= 32; size *= 2)
10979 {
10980 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
10981 if (mask == bits << shift)
10982 return size;
10983 }
10984 }
10985 return 0;
10986 }
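/* Worked examples: aarch64_uxt_size (2, 0x3fc) returns 8, since
   0x3fc == 0xff << 2 and the operand can therefore be written as a UXTB
   with LSL #2; aarch64_uxt_size (0, 0xffff) returns 16 (UXTH); and
   aarch64_uxt_size (1, 0xff) returns 0 because 0xff is not 0xff << 1. */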
10987
10988 /* Constant pools are per-function only when PC-relative literal loads
10989 are enabled or we are using the large memory
10990 model. */
10991
10992 static inline bool
10993 aarch64_can_use_per_function_literal_pools_p (void)
10994 {
10995 return (aarch64_pcrelative_literal_loads
10996 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
10997 }
10998
10999 static bool
11000 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
11001 {
11002 /* We can't use blocks for constants when we're using a per-function
11003 constant pool. */
11004 return !aarch64_can_use_per_function_literal_pools_p ();
11005 }
11006
11007 /* Select appropriate section for constants depending
11008 on where we place literal pools. */
11009
11010 static section *
11011 aarch64_select_rtx_section (machine_mode mode,
11012 rtx x,
11013 unsigned HOST_WIDE_INT align)
11014 {
11015 if (aarch64_can_use_per_function_literal_pools_p ())
11016 return function_section (current_function_decl);
11017
11018 return default_elf_select_rtx_section (mode, x, align);
11019 }
11020
11021 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
11022 void
11023 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
11024 HOST_WIDE_INT offset)
11025 {
11026 /* When using per-function literal pools, we must ensure that any code
11027 section is aligned to the minimal instruction length, lest we get
11028 errors from the assembler re "unaligned instructions". */
11029 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
11030 ASM_OUTPUT_ALIGN (f, 2);
11031 }
11032
11033 /* Costs. */
11034
11035 /* Helper function for rtx cost calculation. Strip a shift expression
11036 from X. Returns the inner operand if successful, or the original
11037 expression on failure. */
11038 static rtx
11039 aarch64_strip_shift (rtx x)
11040 {
11041 rtx op = x;
11042
11043 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
11044 we can convert both to ROR during final output. */
11045 if ((GET_CODE (op) == ASHIFT
11046 || GET_CODE (op) == ASHIFTRT
11047 || GET_CODE (op) == LSHIFTRT
11048 || GET_CODE (op) == ROTATERT
11049 || GET_CODE (op) == ROTATE)
11050 && CONST_INT_P (XEXP (op, 1)))
11051 return XEXP (op, 0);
11052
11053 if (GET_CODE (op) == MULT
11054 && CONST_INT_P (XEXP (op, 1))
11055 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
11056 return XEXP (op, 0);
11057
11058 return x;
11059 }
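/* For example, aarch64_strip_shift applied to (ashift (reg X) (const_int 3))
   or to (mult (reg X) (const_int 8)) returns the inner register X; anything
   that is not a constant shift, rotate or power-of-two multiply is returned
   unchanged. */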
11060
11061 /* Helper function for rtx cost calculation. Strip an extend
11062 expression from X. Returns the inner operand if successful, or the
11063 original expression on failure. We deal with a number of possible
11064 canonicalization variations here. If STRIP_SHIFT is true, then
11065 we can strip off a shift also. */
11066 static rtx
11067 aarch64_strip_extend (rtx x, bool strip_shift)
11068 {
11069 scalar_int_mode mode;
11070 rtx op = x;
11071
11072 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
11073 return op;
11074
11075 /* Zero and sign extraction of a widened value. */
11076 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
11077 && XEXP (op, 2) == const0_rtx
11078 && GET_CODE (XEXP (op, 0)) == MULT
11079 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
11080 XEXP (op, 1)))
11081 return XEXP (XEXP (op, 0), 0);
11082
11083 /* It can also be represented (for zero-extend) as an AND with an
11084 immediate. */
11085 if (GET_CODE (op) == AND
11086 && GET_CODE (XEXP (op, 0)) == MULT
11087 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
11088 && CONST_INT_P (XEXP (op, 1))
11089 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
11090 INTVAL (XEXP (op, 1))) != 0)
11091 return XEXP (XEXP (op, 0), 0);
11092
11093 /* Now handle extended register, as this may also have an optional
11094 left shift by 1..4. */
11095 if (strip_shift
11096 && GET_CODE (op) == ASHIFT
11097 && CONST_INT_P (XEXP (op, 1))
11098 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
11099 op = XEXP (op, 0);
11100
11101 if (GET_CODE (op) == ZERO_EXTEND
11102 || GET_CODE (op) == SIGN_EXTEND)
11103 op = XEXP (op, 0);
11104
11105 if (op != x)
11106 return op;
11107
11108 return x;
11109 }
11110
11111 /* Return true iff CODE is a shift supported in combination
11112 with arithmetic instructions. */
11113
11114 static bool
11115 aarch64_shift_p (enum rtx_code code)
11116 {
11117 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
11118 }
11119
11120
11121 /* Return true iff X is a cheap shift without a sign extend. */
11122
11123 static bool
11124 aarch64_cheap_mult_shift_p (rtx x)
11125 {
11126 rtx op0, op1;
11127
11128 op0 = XEXP (x, 0);
11129 op1 = XEXP (x, 1);
11130
11131 if (!(aarch64_tune_params.extra_tuning_flags
11132 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
11133 return false;
11134
11135 if (GET_CODE (op0) == SIGN_EXTEND)
11136 return false;
11137
11138 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
11139 && UINTVAL (op1) <= 4)
11140 return true;
11141
11142 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
11143 return false;
11144
11145 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
11146
11147 if (l2 > 0 && l2 <= 4)
11148 return true;
11149
11150 return false;
11151 }
11152
11153 /* Helper function for rtx cost calculation. Calculate the cost of
11154 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
11155 Return the calculated cost of the expression, recursing manually in to
11156 operands where needed. */
11157
11158 static int
11159 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
11160 {
11161 rtx op0, op1;
11162 const struct cpu_cost_table *extra_cost
11163 = aarch64_tune_params.insn_extra_cost;
11164 int cost = 0;
11165 bool compound_p = (outer == PLUS || outer == MINUS);
11166 machine_mode mode = GET_MODE (x);
11167
11168 gcc_checking_assert (code == MULT);
11169
11170 op0 = XEXP (x, 0);
11171 op1 = XEXP (x, 1);
11172
11173 if (VECTOR_MODE_P (mode))
11174 mode = GET_MODE_INNER (mode);
11175
11176 /* Integer multiply/fma. */
11177 if (GET_MODE_CLASS (mode) == MODE_INT)
11178 {
11179 /* The multiply will be canonicalized as a shift, cost it as such. */
11180 if (aarch64_shift_p (GET_CODE (x))
11181 || (CONST_INT_P (op1)
11182 && exact_log2 (INTVAL (op1)) > 0))
11183 {
11184 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
11185 || GET_CODE (op0) == SIGN_EXTEND;
11186 if (speed)
11187 {
11188 if (compound_p)
11189 {
11190 /* If the shift is considered cheap,
11191 then don't add any cost. */
11192 if (aarch64_cheap_mult_shift_p (x))
11193 ;
11194 else if (REG_P (op1))
11195 /* ARITH + shift-by-register. */
11196 cost += extra_cost->alu.arith_shift_reg;
11197 else if (is_extend)
11198 /* ARITH + extended register. We don't have a cost field
11199 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
11200 cost += extra_cost->alu.extend_arith;
11201 else
11202 /* ARITH + shift-by-immediate. */
11203 cost += extra_cost->alu.arith_shift;
11204 }
11205 else
11206 /* LSL (immediate). */
11207 cost += extra_cost->alu.shift;
11208
11209 }
11210 /* Strip extends as we will have costed them in the case above. */
11211 if (is_extend)
11212 op0 = aarch64_strip_extend (op0, true);
11213
11214 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
11215
11216 return cost;
11217 }
11218
11219 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
11220 compound, and let the cases below handle it. After all, MNEG is a
11221 special-case alias of MSUB. */
11222 if (GET_CODE (op0) == NEG)
11223 {
11224 op0 = XEXP (op0, 0);
11225 compound_p = true;
11226 }
11227
11228 /* Integer multiplies or FMAs have zero/sign extending variants. */
11229 if ((GET_CODE (op0) == ZERO_EXTEND
11230 && GET_CODE (op1) == ZERO_EXTEND)
11231 || (GET_CODE (op0) == SIGN_EXTEND
11232 && GET_CODE (op1) == SIGN_EXTEND))
11233 {
11234 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
11235 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
11236
11237 if (speed)
11238 {
11239 if (compound_p)
11240 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
11241 cost += extra_cost->mult[0].extend_add;
11242 else
11243 /* MUL/SMULL/UMULL. */
11244 cost += extra_cost->mult[0].extend;
11245 }
11246
11247 return cost;
11248 }
11249
11250 /* This is either an integer multiply or a MADD. In both cases
11251 we want to recurse and cost the operands. */
11252 cost += rtx_cost (op0, mode, MULT, 0, speed);
11253 cost += rtx_cost (op1, mode, MULT, 1, speed);
11254
11255 if (speed)
11256 {
11257 if (compound_p)
11258 /* MADD/MSUB. */
11259 cost += extra_cost->mult[mode == DImode].add;
11260 else
11261 /* MUL. */
11262 cost += extra_cost->mult[mode == DImode].simple;
11263 }
11264
11265 return cost;
11266 }
11267 else
11268 {
11269 if (speed)
11270 {
11271 /* Floating-point FMA/FMUL can also support negations of the
11272 operands, unless the rounding mode is upward or downward, in
11273 which case FNMUL is different from FMUL with operand negation. */
11274 bool neg0 = GET_CODE (op0) == NEG;
11275 bool neg1 = GET_CODE (op1) == NEG;
11276 if (compound_p || !flag_rounding_math || (neg0 && neg1))
11277 {
11278 if (neg0)
11279 op0 = XEXP (op0, 0);
11280 if (neg1)
11281 op1 = XEXP (op1, 0);
11282 }
11283
11284 if (compound_p)
11285 /* FMADD/FNMADD/FNMSUB/FMSUB. */
11286 cost += extra_cost->fp[mode == DFmode].fma;
11287 else
11288 /* FMUL/FNMUL. */
11289 cost += extra_cost->fp[mode == DFmode].mult;
11290 }
11291
11292 cost += rtx_cost (op0, mode, MULT, 0, speed);
11293 cost += rtx_cost (op1, mode, MULT, 1, speed);
11294 return cost;
11295 }
11296 }
11297
11298 static int
11299 aarch64_address_cost (rtx x,
11300 machine_mode mode,
11301 addr_space_t as ATTRIBUTE_UNUSED,
11302 bool speed)
11303 {
11304 enum rtx_code c = GET_CODE (x);
11305 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
11306 struct aarch64_address_info info;
11307 int cost = 0;
11308 info.shift = 0;
11309
11310 if (!aarch64_classify_address (&info, x, mode, false))
11311 {
11312 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
11313 {
11314 /* This is a CONST or SYMBOL ref which will be split
11315 in a different way depending on the code model in use.
11316 Cost it through the generic infrastructure. */
11317 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
11318 /* Divide through by the cost of one instruction to
11319 bring it to the same units as the address costs. */
11320 cost_symbol_ref /= COSTS_N_INSNS (1);
11321 /* The cost is then the cost of preparing the address,
11322 followed by an immediate (possibly 0) offset. */
11323 return cost_symbol_ref + addr_cost->imm_offset;
11324 }
11325 else
11326 {
11327 /* This is most likely a jump table from a case
11328 statement. */
11329 return addr_cost->register_offset;
11330 }
11331 }
11332
11333 switch (info.type)
11334 {
11335 case ADDRESS_LO_SUM:
11336 case ADDRESS_SYMBOLIC:
11337 case ADDRESS_REG_IMM:
11338 cost += addr_cost->imm_offset;
11339 break;
11340
11341 case ADDRESS_REG_WB:
11342 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
11343 cost += addr_cost->pre_modify;
11344 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
11345 cost += addr_cost->post_modify;
11346 else
11347 gcc_unreachable ();
11348
11349 break;
11350
11351 case ADDRESS_REG_REG:
11352 cost += addr_cost->register_offset;
11353 break;
11354
11355 case ADDRESS_REG_SXTW:
11356 cost += addr_cost->register_sextend;
11357 break;
11358
11359 case ADDRESS_REG_UXTW:
11360 cost += addr_cost->register_zextend;
11361 break;
11362
11363 default:
11364 gcc_unreachable ();
11365 }
11366
11367
11368 if (info.shift > 0)
11369 {
11370 /* For the sake of calculating the cost of the shifted register
11371 component, we can treat same sized modes in the same way. */
11372 if (known_eq (GET_MODE_BITSIZE (mode), 16))
11373 cost += addr_cost->addr_scale_costs.hi;
11374 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
11375 cost += addr_cost->addr_scale_costs.si;
11376 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
11377 cost += addr_cost->addr_scale_costs.di;
11378 else
11379 /* We can't tell, or this is a 128-bit vector. */
11380 cost += addr_cost->addr_scale_costs.ti;
11381 }
11382
11383 return cost;
11384 }
11385
11386 /* Return the cost of a branch. If SPEED_P is true then the compiler is
11387 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
11388 to be taken. */
11389
11390 int
11391 aarch64_branch_cost (bool speed_p, bool predictable_p)
11392 {
11393 /* When optimizing for speed, use the cost of unpredictable branches. */
11394 const struct cpu_branch_cost *branch_costs =
11395 aarch64_tune_params.branch_costs;
11396
11397 if (!speed_p || predictable_p)
11398 return branch_costs->predictable;
11399 else
11400 return branch_costs->unpredictable;
11401 }
11402
11403 /* Return true if the RTX X in mode MODE is a zero or sign extract
11404 usable in an ADD or SUB (extended register) instruction. */
11405 static bool
11406 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
11407 {
11408 /* Catch add with a sign extract.
11409 This is add_<optab><mode>_multp2. */
11410 if (GET_CODE (x) == SIGN_EXTRACT
11411 || GET_CODE (x) == ZERO_EXTRACT)
11412 {
11413 rtx op0 = XEXP (x, 0);
11414 rtx op1 = XEXP (x, 1);
11415 rtx op2 = XEXP (x, 2);
11416
11417 if (GET_CODE (op0) == MULT
11418 && CONST_INT_P (op1)
11419 && op2 == const0_rtx
11420 && CONST_INT_P (XEXP (op0, 1))
11421 && aarch64_is_extend_from_extract (mode,
11422 XEXP (op0, 1),
11423 op1))
11424 {
11425 return true;
11426 }
11427 }
11428 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
11429 No shift. */
11430 else if (GET_CODE (x) == SIGN_EXTEND
11431 || GET_CODE (x) == ZERO_EXTEND)
11432 return REG_P (XEXP (x, 0));
11433
11434 return false;
11435 }
11436
11437 static bool
11438 aarch64_frint_unspec_p (unsigned int u)
11439 {
11440 switch (u)
11441 {
11442 case UNSPEC_FRINTZ:
11443 case UNSPEC_FRINTP:
11444 case UNSPEC_FRINTM:
11445 case UNSPEC_FRINTA:
11446 case UNSPEC_FRINTN:
11447 case UNSPEC_FRINTX:
11448 case UNSPEC_FRINTI:
11449 return true;
11450
11451 default:
11452 return false;
11453 }
11454 }
11455
11456 /* Return true iff X is an rtx that will match an extr instruction
11457 i.e. as described in the *extr<mode>5_insn family of patterns.
11458 OP0 and OP1 will be set to the operands of the shifts involved
11459 on success and will be NULL_RTX otherwise. */
11460
11461 static bool
11462 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
11463 {
11464 rtx op0, op1;
11465 scalar_int_mode mode;
11466 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
11467 return false;
11468
11469 *res_op0 = NULL_RTX;
11470 *res_op1 = NULL_RTX;
11471
11472 if (GET_CODE (x) != IOR)
11473 return false;
11474
11475 op0 = XEXP (x, 0);
11476 op1 = XEXP (x, 1);
11477
11478 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
11479 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
11480 {
11481 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
11482 if (GET_CODE (op1) == ASHIFT)
11483 std::swap (op0, op1);
11484
11485 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
11486 return false;
11487
11488 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
11489 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
11490
11491 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
11492 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
11493 {
11494 *res_op0 = XEXP (op0, 0);
11495 *res_op1 = XEXP (op1, 0);
11496 return true;
11497 }
11498 }
11499
11500 return false;
11501 }
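/* For example, in DImode the expression
   (ior (ashift X (const_int 48)) (lshiftrt Y (const_int 16))) matches: the
   two shift amounts sum to 64, so *RES_OP0 is set to X, *RES_OP1 to Y, and
   the whole expression corresponds to a single "extr Xd, Xn, Xm, #16". */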
11502
11503 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11504 storing it in *COST. Result is true if the total cost of the operation
11505 has now been calculated. */
11506 static bool
11507 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11508 {
11509 rtx inner;
11510 rtx comparator;
11511 enum rtx_code cmpcode;
11512 const struct cpu_cost_table *extra_cost
11513 = aarch64_tune_params.insn_extra_cost;
11514
11515 if (COMPARISON_P (op0))
11516 {
11517 inner = XEXP (op0, 0);
11518 comparator = XEXP (op0, 1);
11519 cmpcode = GET_CODE (op0);
11520 }
11521 else
11522 {
11523 inner = op0;
11524 comparator = const0_rtx;
11525 cmpcode = NE;
11526 }
11527
11528 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11529 {
11530 /* Conditional branch. */
11531 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11532 return true;
11533 else
11534 {
11535 if (cmpcode == NE || cmpcode == EQ)
11536 {
11537 if (comparator == const0_rtx)
11538 {
11539 /* TBZ/TBNZ/CBZ/CBNZ. */
11540 if (GET_CODE (inner) == ZERO_EXTRACT)
11541 /* TBZ/TBNZ. */
11542 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11543 ZERO_EXTRACT, 0, speed);
11544 else
11545 /* CBZ/CBNZ. */
11546 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
11547
11548 return true;
11549 }
11550 if (register_operand (inner, VOIDmode)
11551 && aarch64_imm24 (comparator, VOIDmode))
11552 {
11553 /* SUB and SUBS. */
11554 *cost += COSTS_N_INSNS (2);
11555 if (speed)
11556 *cost += extra_cost->alu.arith * 2;
11557 return true;
11558 }
11559 }
11560 else if (cmpcode == LT || cmpcode == GE)
11561 {
11562 /* TBZ/TBNZ. */
11563 if (comparator == const0_rtx)
11564 return true;
11565 }
11566 }
11567 }
11568 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11569 {
11570 /* CCMP. */
11571 if (GET_CODE (op1) == COMPARE)
11572 {
11573 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11574 if (XEXP (op1, 1) == const0_rtx)
11575 *cost += 1;
11576 if (speed)
11577 {
11578 machine_mode mode = GET_MODE (XEXP (op1, 0));
11579 const struct cpu_cost_table *extra_cost
11580 = aarch64_tune_params.insn_extra_cost;
11581
11582 if (GET_MODE_CLASS (mode) == MODE_INT)
11583 *cost += extra_cost->alu.arith;
11584 else
11585 *cost += extra_cost->fp[mode == DFmode].compare;
11586 }
11587 return true;
11588 }
11589
11590 /* It's a conditional operation based on the status flags,
11591 so it must be some flavor of CSEL. */
11592
11593 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11594 if (GET_CODE (op1) == NEG
11595 || GET_CODE (op1) == NOT
11596 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11597 op1 = XEXP (op1, 0);
11598 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11599 {
11600 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11601 op1 = XEXP (op1, 0);
11602 op2 = XEXP (op2, 0);
11603 }
11604
11605 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11606 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
11607 return true;
11608 }
11609
11610 /* We don't know what this is, cost all operands. */
11611 return false;
11612 }
11613
11614 /* Check whether X is a bitfield operation of the form shift + extend that
11615 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11616 operand to which the bitfield operation is applied. Otherwise return
11617 NULL_RTX. */
11618
11619 static rtx
11620 aarch64_extend_bitfield_pattern_p (rtx x)
11621 {
11622 rtx_code outer_code = GET_CODE (x);
11623 machine_mode outer_mode = GET_MODE (x);
11624
11625 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11626 && outer_mode != SImode && outer_mode != DImode)
11627 return NULL_RTX;
11628
11629 rtx inner = XEXP (x, 0);
11630 rtx_code inner_code = GET_CODE (inner);
11631 machine_mode inner_mode = GET_MODE (inner);
11632 rtx op = NULL_RTX;
11633
11634 switch (inner_code)
11635 {
11636 case ASHIFT:
11637 if (CONST_INT_P (XEXP (inner, 1))
11638 && (inner_mode == QImode || inner_mode == HImode))
11639 op = XEXP (inner, 0);
11640 break;
11641 case LSHIFTRT:
11642 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11643 && (inner_mode == QImode || inner_mode == HImode))
11644 op = XEXP (inner, 0);
11645 break;
11646 case ASHIFTRT:
11647 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11648 && (inner_mode == QImode || inner_mode == HImode))
11649 op = XEXP (inner, 0);
11650 break;
11651 default:
11652 break;
11653 }
11654
11655 return op;
11656 }
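/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI R) (const_int 3))) is
   such a pattern: the shift plus zero-extension maps onto a single UBFX,
   and the inner register R is returned. */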
11657
11658 /* Return true if the mask and a shift amount from an RTX of the form
11659 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11660 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
11661
11662 bool
11663 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11664 rtx shft_amnt)
11665 {
11666 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11667 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11668 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
11669 && (INTVAL (mask)
11670 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
11671 }
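/* For example, for SImode, MASK == 0x7f8 and SHFT_AMNT == 3 qualify:
   (0x7f8 >> 3) + 1 == 0x100 is a power of two and no mask bit lies below
   the shift, so (X << 3) & 0x7f8 can be emitted as a UBFIZ with lsb 3 and
   width 8. */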
11672
11673 /* Return true if the masks and a shift amount from an RTX of the form
11674 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11675 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
11676
11677 bool
11678 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11679 unsigned HOST_WIDE_INT mask1,
11680 unsigned HOST_WIDE_INT shft_amnt,
11681 unsigned HOST_WIDE_INT mask2)
11682 {
11683 unsigned HOST_WIDE_INT t;
11684
11685 /* Verify that there is no overlap in what bits are set in the two masks. */
11686 if (mask1 != ~mask2)
11687 return false;
11688
11689 /* Verify that mask2 is not all zeros or ones. */
11690 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
11691 return false;
11692
11693 /* The shift amount should always be less than the mode size. */
11694 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
11695
11696 /* Verify that the mask being shifted is contiguous and would be in the
11697 least significant bits after shifting by shft_amnt. */
11698 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
11699 return (t == (t & -t));
11700 }
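/* For example, for DImode, MASK1 == ~(unsigned HOST_WIDE_INT) 0xff00,
   SHFT_AMNT == 8 and MASK2 == 0xff00 satisfy all of the checks above:
   MASK2 + (1 << 8) == 0x10000 is a power of two, so the insertion can be
   done with a single BFI of width 8 at bit position 8. */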
11701
11702 /* Calculate the cost of calculating X, storing it in *COST. Result
11703 is true if the total cost of the operation has now been calculated. */
11704 static bool
11705 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
11706 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
11707 {
11708 rtx op0, op1, op2;
11709 const struct cpu_cost_table *extra_cost
11710 = aarch64_tune_params.insn_extra_cost;
11711 int code = GET_CODE (x);
11712 scalar_int_mode int_mode;
11713
11714 /* By default, assume that everything has equivalent cost to the
11715 cheapest instruction. Any additional costs are applied as a delta
11716 above this default. */
11717 *cost = COSTS_N_INSNS (1);
11718
11719 switch (code)
11720 {
11721 case SET:
11722 /* The cost depends entirely on the operands to SET. */
11723 *cost = 0;
11724 op0 = SET_DEST (x);
11725 op1 = SET_SRC (x);
11726
11727 switch (GET_CODE (op0))
11728 {
11729 case MEM:
11730 if (speed)
11731 {
11732 rtx address = XEXP (op0, 0);
11733 if (VECTOR_MODE_P (mode))
11734 *cost += extra_cost->ldst.storev;
11735 else if (GET_MODE_CLASS (mode) == MODE_INT)
11736 *cost += extra_cost->ldst.store;
11737 else if (mode == SFmode)
11738 *cost += extra_cost->ldst.storef;
11739 else if (mode == DFmode)
11740 *cost += extra_cost->ldst.stored;
11741
11742 *cost +=
11743 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11744 0, speed));
11745 }
11746
11747 *cost += rtx_cost (op1, mode, SET, 1, speed);
11748 return true;
11749
11750 case SUBREG:
11751 if (! REG_P (SUBREG_REG (op0)))
11752 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
11753
11754 /* Fall through. */
11755 case REG:
11756 /* The cost is one per vector-register copied. */
11757 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
11758 {
11759 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
11760 *cost = COSTS_N_INSNS (nregs);
11761 }
11762 /* const0_rtx is in general free, but we will use an
11763 instruction to set a register to 0. */
11764 else if (REG_P (op1) || op1 == const0_rtx)
11765 {
11766 /* The cost is 1 per register copied. */
11767 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
11768 *cost = COSTS_N_INSNS (nregs);
11769 }
11770 else
11771 /* Cost is just the cost of the RHS of the set. */
11772 *cost += rtx_cost (op1, mode, SET, 1, speed);
11773 return true;
11774
11775 case ZERO_EXTRACT:
11776 case SIGN_EXTRACT:
11777 /* Bit-field insertion. Strip any redundant widening of
11778 the RHS to meet the width of the target. */
11779 if (GET_CODE (op1) == SUBREG)
11780 op1 = SUBREG_REG (op1);
11781 if ((GET_CODE (op1) == ZERO_EXTEND
11782 || GET_CODE (op1) == SIGN_EXTEND)
11783 && CONST_INT_P (XEXP (op0, 1))
11784 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
11785 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
11786 op1 = XEXP (op1, 0);
11787
11788 if (CONST_INT_P (op1))
11789 {
11790 /* MOV immediate is assumed to always be cheap. */
11791 *cost = COSTS_N_INSNS (1);
11792 }
11793 else
11794 {
11795 /* BFM. */
11796 if (speed)
11797 *cost += extra_cost->alu.bfi;
11798 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
11799 }
11800
11801 return true;
11802
11803 default:
11804 /* We can't make sense of this, assume default cost. */
11805 *cost = COSTS_N_INSNS (1);
11806 return false;
11807 }
11808 return false;
11809
11810 case CONST_INT:
11811 /* If an instruction can incorporate a constant within the
11812 instruction, the instruction's expression avoids calling
11813 rtx_cost() on the constant. If rtx_cost() is called on a
11814 constant, then it is usually because the constant must be
11815 moved into a register by one or more instructions.
11816
11817 The exception is constant 0, which can be expressed
11818 as XZR/WZR and is therefore free. The one case where this does not
11819 hold is (set (reg) (const0_rtx)), where we must still cost
11820 the move. However, we can catch that when we cost the SET, so
11821 we don't need to consider that here. */
11822 if (x == const0_rtx)
11823 *cost = 0;
11824 else
11825 {
11826 /* To an approximation, building any other constant is
11827 proportionally expensive to the number of instructions
11828 required to build that constant. This is true whether we
11829 are compiling for SPEED or otherwise. */
11830 if (!is_a <scalar_int_mode> (mode, &int_mode))
11831 int_mode = word_mode;
11832 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
11833 (NULL_RTX, x, false, int_mode));
11834 }
11835 return true;
11836
11837 case CONST_DOUBLE:
11838
11839 /* First determine number of instructions to do the move
11840 as an integer constant. */
11841 if (!aarch64_float_const_representable_p (x)
11842 && !aarch64_can_const_movi_rtx_p (x, mode)
11843 && aarch64_float_const_rtx_p (x))
11844 {
11845 unsigned HOST_WIDE_INT ival;
11846 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
11847 gcc_assert (succeed);
11848
11849 scalar_int_mode imode = (mode == HFmode
11850 ? SImode
11851 : int_mode_for_mode (mode).require ());
11852 int ncost = aarch64_internal_mov_immediate
11853 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11854 *cost += COSTS_N_INSNS (ncost);
11855 return true;
11856 }
11857
11858 if (speed)
11859 {
11860 /* mov[df,sf]_aarch64. */
11861 if (aarch64_float_const_representable_p (x))
11862 /* FMOV (scalar immediate). */
11863 *cost += extra_cost->fp[mode == DFmode].fpconst;
11864 else if (!aarch64_float_const_zero_rtx_p (x))
11865 {
11866 /* This will be a load from memory. */
11867 if (mode == DFmode)
11868 *cost += extra_cost->ldst.loadd;
11869 else
11870 *cost += extra_cost->ldst.loadf;
11871 }
11872 else
11873 /* Otherwise this is +0.0. We get this using MOVI d0, #0
11874 or MOV v0.s[0], wzr - neither of which is modeled by the
11875 cost tables. Just use the default cost. */
11876 {
11877 }
11878 }
11879
11880 return true;
11881
11882 case MEM:
11883 if (speed)
11884 {
11885 /* For loads we want the base cost of a load, plus an
11886 approximation for the additional cost of the addressing
11887 mode. */
11888 rtx address = XEXP (x, 0);
11889 if (VECTOR_MODE_P (mode))
11890 *cost += extra_cost->ldst.loadv;
11891 else if (GET_MODE_CLASS (mode) == MODE_INT)
11892 *cost += extra_cost->ldst.load;
11893 else if (mode == SFmode)
11894 *cost += extra_cost->ldst.loadf;
11895 else if (mode == DFmode)
11896 *cost += extra_cost->ldst.loadd;
11897
11898 *cost +=
11899 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11900 0, speed));
11901 }
11902
11903 return true;
11904
11905 case NEG:
11906 op0 = XEXP (x, 0);
11907
11908 if (VECTOR_MODE_P (mode))
11909 {
11910 if (speed)
11911 {
11912 /* FNEG. */
11913 *cost += extra_cost->vect.alu;
11914 }
11915 return false;
11916 }
11917
11918 if (GET_MODE_CLASS (mode) == MODE_INT)
11919 {
11920 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11921 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11922 {
11923 /* CSETM. */
11924 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
11925 return true;
11926 }
11927
11928 /* Cost this as SUB wzr, X. */
11929 op0 = CONST0_RTX (mode);
11930 op1 = XEXP (x, 0);
11931 goto cost_minus;
11932 }
11933
11934 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11935 {
11936 /* Support (neg(fma...)) as a single instruction only if
11937 the sign of zeros is unimportant. This matches the decision
11938 making in aarch64.md. */
11939 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
11940 {
11941 /* FNMADD. */
11942 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11943 return true;
11944 }
11945 if (GET_CODE (op0) == MULT)
11946 {
11947 /* FNMUL. */
11948 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11949 return true;
11950 }
11951 if (speed)
11952 /* FNEG. */
11953 *cost += extra_cost->fp[mode == DFmode].neg;
11954 return false;
11955 }
11956
11957 return false;
11958
11959 case CLRSB:
11960 case CLZ:
11961 if (speed)
11962 {
11963 if (VECTOR_MODE_P (mode))
11964 *cost += extra_cost->vect.alu;
11965 else
11966 *cost += extra_cost->alu.clz;
11967 }
11968
11969 return false;
11970
11971 case CTZ:
11972 *cost = COSTS_N_INSNS (2);
11973
11974 if (speed)
11975 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
11976 return false;
11977
11978 case COMPARE:
11979 op0 = XEXP (x, 0);
11980 op1 = XEXP (x, 1);
11981
11982 if (op1 == const0_rtx
11983 && GET_CODE (op0) == AND)
11984 {
11985 x = op0;
11986 mode = GET_MODE (op0);
11987 goto cost_logic;
11988 }
11989
11990 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
11991 {
11992 /* TODO: A write to the CC flags possibly costs extra, this
11993 needs encoding in the cost tables. */
11994
11995 mode = GET_MODE (op0);
11996 /* ANDS. */
11997 if (GET_CODE (op0) == AND)
11998 {
11999 x = op0;
12000 goto cost_logic;
12001 }
12002
12003 if (GET_CODE (op0) == PLUS)
12004 {
12005 /* ADDS (and CMN alias). */
12006 x = op0;
12007 goto cost_plus;
12008 }
12009
12010 if (GET_CODE (op0) == MINUS)
12011 {
12012 /* SUBS. */
12013 x = op0;
12014 goto cost_minus;
12015 }
12016
12017 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
12018 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
12019 && CONST_INT_P (XEXP (op0, 2)))
12020 {
12021 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
12022 Handle it here directly rather than going to cost_logic
12023 since we know the immediate generated for the TST is valid
12024 so we can avoid creating an intermediate rtx for it only
12025 for costing purposes. */
12026 if (speed)
12027 *cost += extra_cost->alu.logical;
12028
12029 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
12030 ZERO_EXTRACT, 0, speed);
12031 return true;
12032 }
12033
12034 if (GET_CODE (op1) == NEG)
12035 {
12036 /* CMN. */
12037 if (speed)
12038 *cost += extra_cost->alu.arith;
12039
12040 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
12041 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
12042 return true;
12043 }
12044
12045 /* CMP.
12046
12047 Compare can freely swap the order of operands, and
12048 canonicalization puts the more complex operation first.
12049 But the integer MINUS logic expects the shift/extend
12050 operation in op1. */
12051 if (! (REG_P (op0)
12052 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
12053 {
12054 op0 = XEXP (x, 1);
12055 op1 = XEXP (x, 0);
12056 }
12057 goto cost_minus;
12058 }
12059
12060 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
12061 {
12062 /* FCMP. */
12063 if (speed)
12064 *cost += extra_cost->fp[mode == DFmode].compare;
12065
12066 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
12067 {
12068 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
12069 /* FCMP supports constant 0.0 for no extra cost. */
12070 return true;
12071 }
12072 return false;
12073 }
12074
12075 if (VECTOR_MODE_P (mode))
12076 {
12077 /* Vector compare. */
12078 if (speed)
12079 *cost += extra_cost->vect.alu;
12080
12081 if (aarch64_float_const_zero_rtx_p (op1))
12082 {
12083 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
12084 cost. */
12085 return true;
12086 }
12087 return false;
12088 }
12089 return false;
12090
12091 case MINUS:
12092 {
12093 op0 = XEXP (x, 0);
12094 op1 = XEXP (x, 1);
12095
12096 cost_minus:
12097 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
12098
12099 /* Detect valid immediates. */
12100 if ((GET_MODE_CLASS (mode) == MODE_INT
12101 || (GET_MODE_CLASS (mode) == MODE_CC
12102 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
12103 && CONST_INT_P (op1)
12104 && aarch64_uimm12_shift (INTVAL (op1)))
12105 {
12106 if (speed)
12107 /* SUB(S) (immediate). */
12108 *cost += extra_cost->alu.arith;
12109 return true;
12110 }
12111
12112 /* Look for SUB (extended register). */
12113 if (is_a <scalar_int_mode> (mode, &int_mode)
12114 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
12115 {
12116 if (speed)
12117 *cost += extra_cost->alu.extend_arith;
12118
12119 op1 = aarch64_strip_extend (op1, true);
12120 *cost += rtx_cost (op1, VOIDmode,
12121 (enum rtx_code) GET_CODE (op1), 0, speed);
12122 return true;
12123 }
12124
12125 rtx new_op1 = aarch64_strip_extend (op1, false);
12126
12127 /* Cost this as an FMA-alike operation. */
12128 if ((GET_CODE (new_op1) == MULT
12129 || aarch64_shift_p (GET_CODE (new_op1)))
12130 && code != COMPARE)
12131 {
12132 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
12133 (enum rtx_code) code,
12134 speed);
12135 return true;
12136 }
12137
12138 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
12139
12140 if (speed)
12141 {
12142 if (VECTOR_MODE_P (mode))
12143 {
12144 /* Vector SUB. */
12145 *cost += extra_cost->vect.alu;
12146 }
12147 else if (GET_MODE_CLASS (mode) == MODE_INT)
12148 {
12149 /* SUB(S). */
12150 *cost += extra_cost->alu.arith;
12151 }
12152 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12153 {
12154 /* FSUB. */
12155 *cost += extra_cost->fp[mode == DFmode].addsub;
12156 }
12157 }
12158 return true;
12159 }
12160
12161 case PLUS:
12162 {
12163 rtx new_op0;
12164
12165 op0 = XEXP (x, 0);
12166 op1 = XEXP (x, 1);
12167
12168 cost_plus:
12169 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12170 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12171 {
12172 /* CSINC. */
12173 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
12174 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12175 return true;
12176 }
12177
12178 if (GET_MODE_CLASS (mode) == MODE_INT
12179 && (aarch64_plus_immediate (op1, mode)
12180 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
12181 {
12182 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
12183
12184 if (speed)
12185 /* ADD (immediate). */
12186 *cost += extra_cost->alu.arith;
12187 return true;
12188 }
12189
12190 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12191
12192 /* Look for ADD (extended register). */
12193 if (is_a <scalar_int_mode> (mode, &int_mode)
12194 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
12195 {
12196 if (speed)
12197 *cost += extra_cost->alu.extend_arith;
12198
12199 op0 = aarch64_strip_extend (op0, true);
12200 *cost += rtx_cost (op0, VOIDmode,
12201 (enum rtx_code) GET_CODE (op0), 0, speed);
12202 return true;
12203 }
12204
12205 /* Strip any extend but leave shifts behind, as we will
12206 cost them through mult_cost. */
12207 new_op0 = aarch64_strip_extend (op0, false);
12208
12209 if (GET_CODE (new_op0) == MULT
12210 || aarch64_shift_p (GET_CODE (new_op0)))
12211 {
12212 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
12213 speed);
12214 return true;
12215 }
12216
12217 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
12218
12219 if (speed)
12220 {
12221 if (VECTOR_MODE_P (mode))
12222 {
12223 /* Vector ADD. */
12224 *cost += extra_cost->vect.alu;
12225 }
12226 else if (GET_MODE_CLASS (mode) == MODE_INT)
12227 {
12228 /* ADD. */
12229 *cost += extra_cost->alu.arith;
12230 }
12231 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12232 {
12233 /* FADD. */
12234 *cost += extra_cost->fp[mode == DFmode].addsub;
12235 }
12236 }
12237 return true;
12238 }
12239
12240 case BSWAP:
12241 *cost = COSTS_N_INSNS (1);
12242
12243 if (speed)
12244 {
12245 if (VECTOR_MODE_P (mode))
12246 *cost += extra_cost->vect.alu;
12247 else
12248 *cost += extra_cost->alu.rev;
12249 }
12250 return false;
12251
12252 case IOR:
12253 if (aarch_rev16_p (x))
12254 {
12255 *cost = COSTS_N_INSNS (1);
12256
12257 if (speed)
12258 {
12259 if (VECTOR_MODE_P (mode))
12260 *cost += extra_cost->vect.alu;
12261 else
12262 *cost += extra_cost->alu.rev;
12263 }
12264 return true;
12265 }
12266
12267 if (aarch64_extr_rtx_p (x, &op0, &op1))
12268 {
12269 *cost += rtx_cost (op0, mode, IOR, 0, speed);
12270 *cost += rtx_cost (op1, mode, IOR, 1, speed);
12271 if (speed)
12272 *cost += extra_cost->alu.shift;
12273
12274 return true;
12275 }
12276 /* Fall through. */
12277 case XOR:
12278 case AND:
12279 cost_logic:
12280 op0 = XEXP (x, 0);
12281 op1 = XEXP (x, 1);
12282
12283 if (VECTOR_MODE_P (mode))
12284 {
12285 if (speed)
12286 *cost += extra_cost->vect.alu;
12287 return true;
12288 }
12289
12290 if (code == AND
12291 && GET_CODE (op0) == MULT
12292 && CONST_INT_P (XEXP (op0, 1))
12293 && CONST_INT_P (op1)
12294 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
12295 INTVAL (op1)) != 0)
12296 {
12297 /* This is a UBFM/SBFM. */
12298 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
12299 if (speed)
12300 *cost += extra_cost->alu.bfx;
12301 return true;
12302 }
12303
12304 if (is_int_mode (mode, &int_mode))
12305 {
12306 if (CONST_INT_P (op1))
12307 {
12308 /* We have a mask + shift version of a UBFIZ
12309 i.e. the *andim_ashift<mode>_bfiz pattern. */
12310 if (GET_CODE (op0) == ASHIFT
12311 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
12312 XEXP (op0, 1)))
12313 {
12314 *cost += rtx_cost (XEXP (op0, 0), int_mode,
12315 (enum rtx_code) code, 0, speed);
12316 if (speed)
12317 *cost += extra_cost->alu.bfx;
12318
12319 return true;
12320 }
12321 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
12322 {
12323 /* We possibly get the immediate for free; this is not
12324 modelled. */
12325 *cost += rtx_cost (op0, int_mode,
12326 (enum rtx_code) code, 0, speed);
12327 if (speed)
12328 *cost += extra_cost->alu.logical;
12329
12330 return true;
12331 }
12332 }
12333 else
12334 {
12335 rtx new_op0 = op0;
12336
12337 /* Handle ORN, EON, or BIC. */
12338 if (GET_CODE (op0) == NOT)
12339 op0 = XEXP (op0, 0);
12340
12341 new_op0 = aarch64_strip_shift (op0);
12342
12343 /* If we had a shift on op0 then this is a logical-shift-
12344 by-register/immediate operation. Otherwise, this is just
12345 a logical operation. */
12346 if (speed)
12347 {
12348 if (new_op0 != op0)
12349 {
12350 /* Shift by immediate. */
12351 if (CONST_INT_P (XEXP (op0, 1)))
12352 *cost += extra_cost->alu.log_shift;
12353 else
12354 *cost += extra_cost->alu.log_shift_reg;
12355 }
12356 else
12357 *cost += extra_cost->alu.logical;
12358 }
12359
12360 /* In both cases we want to cost both operands. */
12361 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
12362 0, speed);
12363 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
12364 1, speed);
12365
12366 return true;
12367 }
12368 }
12369 return false;
12370
12371 case NOT:
12372 x = XEXP (x, 0);
12373 op0 = aarch64_strip_shift (x);
12374
12375 if (VECTOR_MODE_P (mode))
12376 {
12377 /* Vector NOT. */
12378 *cost += extra_cost->vect.alu;
12379 return false;
12380 }
12381
12382 /* MVN-shifted-reg. */
12383 if (op0 != x)
12384 {
12385 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12386
12387 if (speed)
12388 *cost += extra_cost->alu.log_shift;
12389
12390 return true;
12391 }
12392 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
12393 Handle the second form here taking care that 'a' in the above can
12394 be a shift. */
12395 else if (GET_CODE (op0) == XOR)
12396 {
12397 rtx newop0 = XEXP (op0, 0);
12398 rtx newop1 = XEXP (op0, 1);
12399 rtx op0_stripped = aarch64_strip_shift (newop0);
12400
12401 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
12402 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
12403
12404 if (speed)
12405 {
12406 if (op0_stripped != newop0)
12407 *cost += extra_cost->alu.log_shift;
12408 else
12409 *cost += extra_cost->alu.logical;
12410 }
12411
12412 return true;
12413 }
12414 /* MVN. */
12415 if (speed)
12416 *cost += extra_cost->alu.logical;
12417
12418 return false;
12419
12420 case ZERO_EXTEND:
12421
12422 op0 = XEXP (x, 0);
12423 /* If a value is written in SI mode, then zero extended to DI
12424 mode, the operation will in general be free as a write to
12425 a 'w' register implicitly zeroes the upper bits of an 'x'
12426 register. However, if this is
12427
12428 (set (reg) (zero_extend (reg)))
12429
12430 we must cost the explicit register move. */
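      /* For instance, "ldr w0, [x1]" already zeroes bits 63:32 of x0, so no
	 separate extension instruction is needed (illustrative example).  */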
12431 if (mode == DImode
12432 && GET_MODE (op0) == SImode
12433 && outer == SET)
12434 {
12435 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
12436
12437 /* If OP_COST is non-zero, then the cost of the zero extend
12438 is effectively the cost of the inner operation. Otherwise
12439 we have a MOV instruction and we take the cost from the MOV
12440 itself. This is true independently of whether we are
12441 optimizing for space or time. */
12442 if (op_cost)
12443 *cost = op_cost;
12444
12445 return true;
12446 }
12447 else if (MEM_P (op0))
12448 {
12449 /* All loads can zero extend to any size for free. */
12450 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
12451 return true;
12452 }
12453
12454 op0 = aarch64_extend_bitfield_pattern_p (x);
12455 if (op0)
12456 {
12457 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
12458 if (speed)
12459 *cost += extra_cost->alu.bfx;
12460 return true;
12461 }
12462
12463 if (speed)
12464 {
12465 if (VECTOR_MODE_P (mode))
12466 {
12467 /* UMOV. */
12468 *cost += extra_cost->vect.alu;
12469 }
12470 else
12471 {
12472 /* We generate an AND instead of UXTB/UXTH. */
12473 *cost += extra_cost->alu.logical;
12474 }
12475 }
12476 return false;
12477
12478 case SIGN_EXTEND:
12479 if (MEM_P (XEXP (x, 0)))
12480 {
12481 /* LDRSH. */
12482 if (speed)
12483 {
12484 rtx address = XEXP (XEXP (x, 0), 0);
12485 *cost += extra_cost->ldst.load_sign_extend;
12486
12487 *cost +=
12488 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12489 0, speed));
12490 }
12491 return true;
12492 }
12493
12494 op0 = aarch64_extend_bitfield_pattern_p (x);
12495 if (op0)
12496 {
12497 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
12498 if (speed)
12499 *cost += extra_cost->alu.bfx;
12500 return true;
12501 }
12502
12503 if (speed)
12504 {
12505 if (VECTOR_MODE_P (mode))
12506 *cost += extra_cost->vect.alu;
12507 else
12508 *cost += extra_cost->alu.extend;
12509 }
12510 return false;
12511
12512 case ASHIFT:
12513 op0 = XEXP (x, 0);
12514 op1 = XEXP (x, 1);
12515
12516 if (CONST_INT_P (op1))
12517 {
12518 if (speed)
12519 {
12520 if (VECTOR_MODE_P (mode))
12521 {
12522 /* Vector shift (immediate). */
12523 *cost += extra_cost->vect.alu;
12524 }
12525 else
12526 {
12527 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
12528 aliases. */
12529 *cost += extra_cost->alu.shift;
12530 }
12531 }
12532
12533 /* We can incorporate zero/sign extend for free. */
12534 if (GET_CODE (op0) == ZERO_EXTEND
12535 || GET_CODE (op0) == SIGN_EXTEND)
12536 op0 = XEXP (op0, 0);
12537
12538 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
12539 return true;
12540 }
12541 else
12542 {
12543 if (VECTOR_MODE_P (mode))
12544 {
12545 if (speed)
12546 /* Vector shift (register). */
12547 *cost += extra_cost->vect.alu;
12548 }
12549 else
12550 {
12551 if (speed)
12552 /* LSLV. */
12553 *cost += extra_cost->alu.shift_reg;
12554
12555 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12556 && CONST_INT_P (XEXP (op1, 1))
12557 && known_eq (INTVAL (XEXP (op1, 1)),
12558 GET_MODE_BITSIZE (mode) - 1))
12559 {
12560 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12561 /* We already demanded XEXP (op1, 0) to be REG_P, so
12562 don't recurse into it. */
12563 return true;
12564 }
12565 }
12566 return false; /* All arguments need to be in registers. */
12567 }
12568
12569 case ROTATE:
12570 case ROTATERT:
12571 case LSHIFTRT:
12572 case ASHIFTRT:
12573 op0 = XEXP (x, 0);
12574 op1 = XEXP (x, 1);
12575
12576 if (CONST_INT_P (op1))
12577 {
12578 /* ASR (immediate) and friends. */
12579 if (speed)
12580 {
12581 if (VECTOR_MODE_P (mode))
12582 *cost += extra_cost->vect.alu;
12583 else
12584 *cost += extra_cost->alu.shift;
12585 }
12586
12587 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12588 return true;
12589 }
12590 else
12591 {
12592 if (VECTOR_MODE_P (mode))
12593 {
12594 if (speed)
12595 /* Vector shift (register). */
12596 *cost += extra_cost->vect.alu;
12597 }
12598 else
12599 {
12600 if (speed)
12601 /* ASR (register) and friends. */
12602 *cost += extra_cost->alu.shift_reg;
12603
12604 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12605 && CONST_INT_P (XEXP (op1, 1))
12606 && known_eq (INTVAL (XEXP (op1, 1)),
12607 GET_MODE_BITSIZE (mode) - 1))
12608 {
12609 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12610 /* We already demanded XEXP (op1, 0) to be REG_P, so
12611 don't recurse into it. */
12612 return true;
12613 }
12614 }
12615 return false; /* All arguments need to be in registers. */
12616 }
12617
12618 case SYMBOL_REF:
12619
12620 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12621 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
12622 {
12623 /* LDR. */
12624 if (speed)
12625 *cost += extra_cost->ldst.load;
12626 }
12627 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12628 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12629 {
12630 /* ADRP, followed by ADD. */
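	  /* e.g. "adrp x0, sym" followed by "add x0, x0, :lo12:sym"
	     (illustrative register choice).  */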
12631 *cost += COSTS_N_INSNS (1);
12632 if (speed)
12633 *cost += 2 * extra_cost->alu.arith;
12634 }
12635 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12636 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12637 {
12638 /* ADR. */
12639 if (speed)
12640 *cost += extra_cost->alu.arith;
12641 }
12642
12643 if (flag_pic)
12644 {
12645 /* One extra load instruction, after accessing the GOT. */
12646 *cost += COSTS_N_INSNS (1);
12647 if (speed)
12648 *cost += extra_cost->ldst.load;
12649 }
12650 return true;
12651
12652 case HIGH:
12653 case LO_SUM:
12654 /* ADRP/ADD (immediate). */
12655 if (speed)
12656 *cost += extra_cost->alu.arith;
12657 return true;
12658
12659 case ZERO_EXTRACT:
12660 case SIGN_EXTRACT:
12661 /* UBFX/SBFX. */
12662 if (speed)
12663 {
12664 if (VECTOR_MODE_P (mode))
12665 *cost += extra_cost->vect.alu;
12666 else
12667 *cost += extra_cost->alu.bfx;
12668 }
12669
12670 /* We can trust that the immediates used will be correct (there
12671 are no by-register forms), so we need only cost op0. */
12672 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
12673 return true;
12674
12675 case MULT:
12676 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12677 /* aarch64_rtx_mult_cost always handles recursion to its
12678 operands. */
12679 return true;
12680
12681 case MOD:
12682 /* We can expand signed mod by power of 2 using a NEGS, two parallel
12683 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
12684 that of an unconditional negate. This case should only ever be reached through
12685 the set_smod_pow2_cheap check in expmed.c. */
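      /* Illustrative sketch only (SImode, modulus 4); the actual expansion
	 is produced by expmed.c:

	     negs  w1, w0
	     and   w0, w0, #3
	     and   w1, w1, #3
	     csneg w0, w0, w1, mi  */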
12686 if (CONST_INT_P (XEXP (x, 1))
12687 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
12688 && (mode == SImode || mode == DImode))
12689 {
12690 /* We expand to 4 instructions. Reset the baseline. */
12691 *cost = COSTS_N_INSNS (4);
12692
12693 if (speed)
12694 *cost += 2 * extra_cost->alu.logical
12695 + 2 * extra_cost->alu.arith;
12696
12697 return true;
12698 }
12699
12700 /* Fall-through. */
12701 case UMOD:
12702 if (speed)
12703 {
12704 /* Slightly prefer UMOD over SMOD. */
12705 if (VECTOR_MODE_P (mode))
12706 *cost += extra_cost->vect.alu;
12707 else if (GET_MODE_CLASS (mode) == MODE_INT)
12708 *cost += (extra_cost->mult[mode == DImode].add
12709 + extra_cost->mult[mode == DImode].idiv
12710 + (code == MOD ? 1 : 0));
12711 }
12712 return false; /* All arguments need to be in registers. */
12713
12714 case DIV:
12715 case UDIV:
12716 case SQRT:
12717 if (speed)
12718 {
12719 if (VECTOR_MODE_P (mode))
12720 *cost += extra_cost->vect.alu;
12721 else if (GET_MODE_CLASS (mode) == MODE_INT)
12722 /* There is no integer SQRT, so only DIV and UDIV can get
12723 here. */
12724 *cost += (extra_cost->mult[mode == DImode].idiv
12725 /* Slightly prefer UDIV over SDIV. */
12726 + (code == DIV ? 1 : 0));
12727 else
12728 *cost += extra_cost->fp[mode == DFmode].div;
12729 }
12730 return false; /* All arguments need to be in registers. */
12731
12732 case IF_THEN_ELSE:
12733 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
12734 XEXP (x, 2), cost, speed);
12735
12736 case EQ:
12737 case NE:
12738 case GT:
12739 case GTU:
12740 case LT:
12741 case LTU:
12742 case GE:
12743 case GEU:
12744 case LE:
12745 case LEU:
12746
12747 return false; /* All arguments must be in registers. */
12748
12749 case FMA:
12750 op0 = XEXP (x, 0);
12751 op1 = XEXP (x, 1);
12752 op2 = XEXP (x, 2);
12753
12754 if (speed)
12755 {
12756 if (VECTOR_MODE_P (mode))
12757 *cost += extra_cost->vect.alu;
12758 else
12759 *cost += extra_cost->fp[mode == DFmode].fma;
12760 }
12761
12762 /* FMSUB, FNMADD, and FNMSUB are free. */
12763 if (GET_CODE (op0) == NEG)
12764 op0 = XEXP (op0, 0);
12765
12766 if (GET_CODE (op2) == NEG)
12767 op2 = XEXP (op2, 0);
12768
12769 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12770 and the by-element operand as operand 0. */
12771 if (GET_CODE (op1) == NEG)
12772 op1 = XEXP (op1, 0);
12773
12774 /* Catch vector-by-element operations. The by-element operand can
12775 either be (vec_duplicate (vec_select (x))) or just
12776 (vec_select (x)), depending on whether we are multiplying by
12777 a vector or a scalar.
12778
12779 Canonicalization is not very good in these cases: FMA4 will put the
12780 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
12781 if (GET_CODE (op0) == VEC_DUPLICATE)
12782 op0 = XEXP (op0, 0);
12783 else if (GET_CODE (op1) == VEC_DUPLICATE)
12784 op1 = XEXP (op1, 0);
12785
12786 if (GET_CODE (op0) == VEC_SELECT)
12787 op0 = XEXP (op0, 0);
12788 else if (GET_CODE (op1) == VEC_SELECT)
12789 op1 = XEXP (op1, 0);
12790
12791 /* If the remaining parameters are not registers,
12792 get the cost to put them into registers. */
12793 *cost += rtx_cost (op0, mode, FMA, 0, speed);
12794 *cost += rtx_cost (op1, mode, FMA, 1, speed);
12795 *cost += rtx_cost (op2, mode, FMA, 2, speed);
12796 return true;
12797
12798 case FLOAT:
12799 case UNSIGNED_FLOAT:
12800 if (speed)
12801 *cost += extra_cost->fp[mode == DFmode].fromint;
12802 return false;
12803
12804 case FLOAT_EXTEND:
12805 if (speed)
12806 {
12807 if (VECTOR_MODE_P (mode))
12808 {
12809 /* Vector widening conversion. */
12810 *cost += extra_cost->vect.alu;
12811 }
12812 else
12813 *cost += extra_cost->fp[mode == DFmode].widen;
12814 }
12815 return false;
12816
12817 case FLOAT_TRUNCATE:
12818 if (speed)
12819 {
12820 if (VECTOR_MODE_P (mode))
12821 {
12822 /* Vector narrowing conversion. */
12823 *cost += extra_cost->vect.alu;
12824 }
12825 else
12826 *cost += extra_cost->fp[mode == DFmode].narrow;
12827 }
12828 return false;
12829
12830 case FIX:
12831 case UNSIGNED_FIX:
12832 x = XEXP (x, 0);
12833 /* Strip the rounding part; all of these rounding operations will be
12834 implemented by the fcvt* family of instructions anyway. */
12835 if (GET_CODE (x) == UNSPEC)
12836 {
12837 unsigned int uns_code = XINT (x, 1);
12838
12839 if (uns_code == UNSPEC_FRINTA
12840 || uns_code == UNSPEC_FRINTM
12841 || uns_code == UNSPEC_FRINTN
12842 || uns_code == UNSPEC_FRINTP
12843 || uns_code == UNSPEC_FRINTZ)
12844 x = XVECEXP (x, 0, 0);
12845 }
12846
12847 if (speed)
12848 {
12849 if (VECTOR_MODE_P (mode))
12850 *cost += extra_cost->vect.alu;
12851 else
12852 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
12853 }
12854
12855 /* We can combine fmul by a power of 2 followed by a fcvt into a single
12856 fixed-point fcvt. */
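      /* For instance, (int) (f * 4.0f) can map to a single
	 "fcvtzs w0, s0, #2", since two fractional bits correspond to a
	 multiply by 4 (illustrative registers).  */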
12857 if (GET_CODE (x) == MULT
12858 && ((VECTOR_MODE_P (mode)
12859 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
12860 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
12861 {
12862 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
12863 0, speed);
12864 return true;
12865 }
12866
12867 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
12868 return true;
12869
12870 case ABS:
12871 if (VECTOR_MODE_P (mode))
12872 {
12873 /* ABS (vector). */
12874 if (speed)
12875 *cost += extra_cost->vect.alu;
12876 }
12877 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12878 {
12879 op0 = XEXP (x, 0);
12880
12881 /* FABD, which is analogous to FADD. */
12882 if (GET_CODE (op0) == MINUS)
12883 {
12884 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
12885 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
12886 if (speed)
12887 *cost += extra_cost->fp[mode == DFmode].addsub;
12888
12889 return true;
12890 }
12891 /* Simple FABS is analogous to FNEG. */
12892 if (speed)
12893 *cost += extra_cost->fp[mode == DFmode].neg;
12894 }
12895 else
12896 {
12897 /* Integer ABS will either be split into
12898 two arithmetic instructions, or will be an ABS
12899 (scalar), which we don't model. */
12900 *cost = COSTS_N_INSNS (2);
12901 if (speed)
12902 *cost += 2 * extra_cost->alu.arith;
12903 }
12904 return false;
12905
12906 case SMAX:
12907 case SMIN:
12908 if (speed)
12909 {
12910 if (VECTOR_MODE_P (mode))
12911 *cost += extra_cost->vect.alu;
12912 else
12913 {
12914 /* FMAXNM/FMINNM/FMAX/FMIN.
12915 TODO: This may not be accurate for all implementations, but
12916 we do not model this in the cost tables. */
12917 *cost += extra_cost->fp[mode == DFmode].addsub;
12918 }
12919 }
12920 return false;
12921
12922 case UNSPEC:
12923 /* The floating point round to integer frint* instructions. */
12924 if (aarch64_frint_unspec_p (XINT (x, 1)))
12925 {
12926 if (speed)
12927 *cost += extra_cost->fp[mode == DFmode].roundint;
12928
12929 return false;
12930 }
12931
12932 if (XINT (x, 1) == UNSPEC_RBIT)
12933 {
12934 if (speed)
12935 *cost += extra_cost->alu.rev;
12936
12937 return false;
12938 }
12939 break;
12940
12941 case TRUNCATE:
12942
12943 /* Decompose <su>muldi3_highpart. */
12944 if (/* (truncate:DI */
12945 mode == DImode
12946 /* (lshiftrt:TI */
12947 && GET_MODE (XEXP (x, 0)) == TImode
12948 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
12949 /* (mult:TI */
12950 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12951 /* (ANY_EXTEND:TI (reg:DI))
12952 (ANY_EXTEND:TI (reg:DI))) */
12953 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
12954 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
12955 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
12956 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
12957 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
12958 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
12959 /* (const_int 64) */
12960 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12961 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
12962 {
12963 /* UMULH/SMULH. */
12964 if (speed)
12965 *cost += extra_cost->mult[mode == DImode].extend;
12966 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
12967 mode, MULT, 0, speed);
12968 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
12969 mode, MULT, 1, speed);
12970 return true;
12971 }
12972
12973 /* Fall through. */
12974 default:
12975 break;
12976 }
12977
12978 if (dump_file
12979 && flag_aarch64_verbose_cost)
12980 fprintf (dump_file,
12981 "\nFailed to cost RTX. Assuming default cost.\n");
12982
12983 return true;
12984 }
12985
12986 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
12987 calculated for X. This cost is stored in *COST. Returns true
12988 if the total cost of X was calculated. */
12989 static bool
12990 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
12991 int param, int *cost, bool speed)
12992 {
12993 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
12994
12995 if (dump_file
12996 && flag_aarch64_verbose_cost)
12997 {
12998 print_rtl_single (dump_file, x);
12999 fprintf (dump_file, "\n%s cost: %d (%s)\n",
13000 speed ? "Hot" : "Cold",
13001 *cost, result ? "final" : "partial");
13002 }
13003
13004 return result;
13005 }
13006
13007 static int
13008 aarch64_register_move_cost (machine_mode mode,
13009 reg_class_t from_i, reg_class_t to_i)
13010 {
13011 enum reg_class from = (enum reg_class) from_i;
13012 enum reg_class to = (enum reg_class) to_i;
13013 const struct cpu_regmove_cost *regmove_cost
13014 = aarch64_tune_params.regmove_cost;
13015
13016 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
13017 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
13018 to = GENERAL_REGS;
13019
13020 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
13021 from = GENERAL_REGS;
13022
13023 /* Make RDFFR very expensive. In particular, if we know that the FFR
13024 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
13025 as a way of obtaining a PTRUE. */
13026 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
13027 && hard_reg_set_subset_p (reg_class_contents[from_i],
13028 reg_class_contents[FFR_REGS]))
13029 return 80;
13030
13031 /* Moving between GPR and stack cost is the same as GP2GP. */
13032 if ((from == GENERAL_REGS && to == STACK_REG)
13033 || (to == GENERAL_REGS && from == STACK_REG))
13034 return regmove_cost->GP2GP;
13035
13036 /* To/From the stack register, we move via the gprs. */
13037 if (to == STACK_REG || from == STACK_REG)
13038 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
13039 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
13040
13041 if (known_eq (GET_MODE_SIZE (mode), 16))
13042 {
13043 /* 128-bit operations on general registers require 2 instructions. */
13044 if (from == GENERAL_REGS && to == GENERAL_REGS)
13045 return regmove_cost->GP2GP * 2;
13046 else if (from == GENERAL_REGS)
13047 return regmove_cost->GP2FP * 2;
13048 else if (to == GENERAL_REGS)
13049 return regmove_cost->FP2GP * 2;
13050
13051 /* When AdvSIMD instructions are disabled it is not possible to move
13052 a 128-bit value directly between Q registers. This is handled in
13053 secondary reload. A general register is used as a scratch to move
13054 the upper DI value and the lower DI value is moved directly,
13055 hence the cost is the sum of three moves. */
13056 if (! TARGET_SIMD)
13057 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
13058
13059 return regmove_cost->FP2FP;
13060 }
13061
13062 if (from == GENERAL_REGS && to == GENERAL_REGS)
13063 return regmove_cost->GP2GP;
13064 else if (from == GENERAL_REGS)
13065 return regmove_cost->GP2FP;
13066 else if (to == GENERAL_REGS)
13067 return regmove_cost->FP2GP;
13068
13069 return regmove_cost->FP2FP;
13070 }
13071
13072 static int
13073 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
13074 reg_class_t rclass ATTRIBUTE_UNUSED,
13075 bool in ATTRIBUTE_UNUSED)
13076 {
13077 return aarch64_tune_params.memmov_cost;
13078 }
13079
13080 /* Implement TARGET_INIT_BUILTINS. */
13081 static void
13082 aarch64_init_builtins ()
13083 {
13084 aarch64_general_init_builtins ();
13085 aarch64_sve::init_builtins ();
13086 }
13087
13088 /* Implement TARGET_FOLD_BUILTIN. */
13089 static tree
13090 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
13091 {
13092 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13093 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13094 tree type = TREE_TYPE (TREE_TYPE (fndecl));
13095 switch (code & AARCH64_BUILTIN_CLASS)
13096 {
13097 case AARCH64_BUILTIN_GENERAL:
13098 return aarch64_general_fold_builtin (subcode, type, nargs, args);
13099
13100 case AARCH64_BUILTIN_SVE:
13101 return NULL_TREE;
13102 }
13103 gcc_unreachable ();
13104 }
13105
13106 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
13107 static bool
13108 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
13109 {
13110 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
13111 tree fndecl = gimple_call_fndecl (stmt);
13112 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13113 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13114 gimple *new_stmt = NULL;
13115 switch (code & AARCH64_BUILTIN_CLASS)
13116 {
13117 case AARCH64_BUILTIN_GENERAL:
13118 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
13119 break;
13120
13121 case AARCH64_BUILTIN_SVE:
13122 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
13123 break;
13124 }
13125
13126 if (!new_stmt)
13127 return false;
13128
13129 gsi_replace (gsi, new_stmt, true);
13130 return true;
13131 }
13132
13133 /* Implement TARGET_EXPAND_BUILTIN. */
13134 static rtx
13135 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
13136 {
13137 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
13138 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13139 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13140 switch (code & AARCH64_BUILTIN_CLASS)
13141 {
13142 case AARCH64_BUILTIN_GENERAL:
13143 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
13144
13145 case AARCH64_BUILTIN_SVE:
13146 return aarch64_sve::expand_builtin (subcode, exp, target);
13147 }
13148 gcc_unreachable ();
13149 }
13150
13151 /* Implement TARGET_BUILTIN_DECL. */
13152 static tree
13153 aarch64_builtin_decl (unsigned int code, bool initialize_p)
13154 {
13155 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13156 switch (code & AARCH64_BUILTIN_CLASS)
13157 {
13158 case AARCH64_BUILTIN_GENERAL:
13159 return aarch64_general_builtin_decl (subcode, initialize_p);
13160
13161 case AARCH64_BUILTIN_SVE:
13162 return aarch64_sve::builtin_decl (subcode, initialize_p);
13163 }
13164 gcc_unreachable ();
13165 }
13166
13167 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
13168 to optimize 1.0/sqrt. */
13169
13170 static bool
13171 use_rsqrt_p (machine_mode mode)
13172 {
13173 return (!flag_trapping_math
13174 && flag_unsafe_math_optimizations
13175 && ((aarch64_tune_params.approx_modes->recip_sqrt
13176 & AARCH64_APPROX_MODE (mode))
13177 || flag_mrecip_low_precision_sqrt));
13178 }
13179
13180 /* Function to decide when to use the approximate reciprocal square root
13181 builtin. */
13182
13183 static tree
13184 aarch64_builtin_reciprocal (tree fndecl)
13185 {
13186 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
13187
13188 if (!use_rsqrt_p (mode))
13189 return NULL_TREE;
13190 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13191 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13192 switch (code & AARCH64_BUILTIN_CLASS)
13193 {
13194 case AARCH64_BUILTIN_GENERAL:
13195 return aarch64_general_builtin_rsqrt (subcode);
13196
13197 case AARCH64_BUILTIN_SVE:
13198 return NULL_TREE;
13199 }
13200 gcc_unreachable ();
13201 }
13202
13203 /* Emit code to perform the floating-point operation:
13204
13205 DST = SRC1 * SRC2
13206
13207 where all three operands are already known to be registers.
13208 If the operation is an SVE one, PTRUE is a suitable all-true
13209 predicate. */
13210
13211 static void
13212 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
13213 {
13214 if (ptrue)
13215 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
13216 dst, ptrue, src1, src2,
13217 gen_int_mode (SVE_RELAXED_GP, SImode)));
13218 else
13219 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
13220 }
13221
13222 /* Emit instruction sequence to compute either the approximate square root
13223 or its approximate reciprocal, depending on the flag RECP, and return
13224 whether the sequence was emitted or not. */
13225
13226 bool
13227 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
13228 {
13229 machine_mode mode = GET_MODE (dst);
13230
13231 if (GET_MODE_INNER (mode) == HFmode)
13232 {
13233 gcc_assert (!recp);
13234 return false;
13235 }
13236
13237 if (!recp)
13238 {
13239 if (!(flag_mlow_precision_sqrt
13240 || (aarch64_tune_params.approx_modes->sqrt
13241 & AARCH64_APPROX_MODE (mode))))
13242 return false;
13243
13244 if (!flag_finite_math_only
13245 || flag_trapping_math
13246 || !flag_unsafe_math_optimizations
13247 || optimize_function_for_size_p (cfun))
13248 return false;
13249 }
13250 else
13251 /* Caller assumes we cannot fail. */
13252 gcc_assert (use_rsqrt_p (mode));
13253
13254 rtx pg = NULL_RTX;
13255 if (aarch64_sve_mode_p (mode))
13256 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13257 machine_mode mmsk = (VECTOR_MODE_P (mode)
13258 ? related_int_vector_mode (mode).require ()
13259 : int_mode_for_mode (mode).require ());
13260 rtx xmsk = NULL_RTX;
13261 if (!recp)
13262 {
13263 /* When calculating the approximate square root, compare the
13264 argument with 0.0 and create a mask. */
13265 rtx zero = CONST0_RTX (mode);
13266 if (pg)
13267 {
13268 xmsk = gen_reg_rtx (GET_MODE (pg));
13269 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
13270 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
13271 xmsk, pg, hint, src, zero));
13272 }
13273 else
13274 {
13275 xmsk = gen_reg_rtx (mmsk);
13276 emit_insn (gen_rtx_SET (xmsk,
13277 gen_rtx_NEG (mmsk,
13278 gen_rtx_EQ (mmsk, src, zero))));
13279 }
13280 }
13281
13282 /* Estimate the approximate reciprocal square root. */
13283 rtx xdst = gen_reg_rtx (mode);
13284 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
13285
13286 /* Iterate over the series twice for SF and thrice for DF. */
13287 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
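  /* Reading aid: each FRSQRTS step below implements the Newton-Raphson
     refinement x(n+1) = x(n) * (3 - d * x(n)^2) / 2 for 1/sqrt(d).  */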
13288
13289 /* Optionally iterate over the series once less for faster performance
13290 while sacrificing some accuracy. */
13291 if ((recp && flag_mrecip_low_precision_sqrt)
13292 || (!recp && flag_mlow_precision_sqrt))
13293 iterations--;
13294
13295 /* Iterate over the series to calculate the approximate reciprocal square
13296 root. */
13297 rtx x1 = gen_reg_rtx (mode);
13298 while (iterations--)
13299 {
13300 rtx x2 = gen_reg_rtx (mode);
13301 aarch64_emit_mult (x2, pg, xdst, xdst);
13302
13303 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
13304
13305 if (iterations > 0)
13306 aarch64_emit_mult (xdst, pg, xdst, x1);
13307 }
13308
13309 if (!recp)
13310 {
13311 if (pg)
13312 /* Multiply nonzero source values by the corresponding intermediate
13313 result elements, so that the final calculation is the approximate
13314 square root rather than its reciprocal. Select a zero result for
13315 zero source values, to avoid the Inf * 0 -> NaN that we'd get
13316 otherwise. */
13317 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
13318 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
13319 else
13320 {
13321 /* Qualify the approximate reciprocal square root when the
13322 argument is 0.0 by squashing the intermediary result to 0.0. */
13323 rtx xtmp = gen_reg_rtx (mmsk);
13324 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
13325 gen_rtx_SUBREG (mmsk, xdst, 0)));
13326 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
13327
13328 /* Calculate the approximate square root. */
13329 aarch64_emit_mult (xdst, pg, xdst, src);
13330 }
13331 }
13332
13333 /* Finalize the approximation. */
13334 aarch64_emit_mult (dst, pg, xdst, x1);
13335
13336 return true;
13337 }
13338
13339 /* Emit the instruction sequence to compute the approximation for the division
13340 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
13341
13342 bool
13343 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
13344 {
13345 machine_mode mode = GET_MODE (quo);
13346
13347 if (GET_MODE_INNER (mode) == HFmode)
13348 return false;
13349
13350 bool use_approx_division_p = (flag_mlow_precision_div
13351 || (aarch64_tune_params.approx_modes->division
13352 & AARCH64_APPROX_MODE (mode)));
13353
13354 if (!flag_finite_math_only
13355 || flag_trapping_math
13356 || !flag_unsafe_math_optimizations
13357 || optimize_function_for_size_p (cfun)
13358 || !use_approx_division_p)
13359 return false;
13360
13361 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
13362 return false;
13363
13364 rtx pg = NULL_RTX;
13365 if (aarch64_sve_mode_p (mode))
13366 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13367
13368 /* Estimate the approximate reciprocal. */
13369 rtx xrcp = gen_reg_rtx (mode);
13370 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
13371
13372 /* Iterate over the series twice for SF and thrice for DF. */
13373 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
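  /* Reading aid: each FRECPS step below implements the Newton-Raphson
     refinement x(n+1) = x(n) * (2 - d * x(n)) for 1/d.  */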
13374
13375 /* Optionally iterate over the series less for faster performance,
13376 while sacrificing some accuracy. The default is 2 for DF and 1 for SF. */
13377 if (flag_mlow_precision_div)
13378 iterations = (GET_MODE_INNER (mode) == DFmode
13379 ? aarch64_double_recp_precision
13380 : aarch64_float_recp_precision);
13381
13382 /* Iterate over the series to calculate the approximate reciprocal. */
13383 rtx xtmp = gen_reg_rtx (mode);
13384 while (iterations--)
13385 {
13386 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
13387
13388 if (iterations > 0)
13389 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
13390 }
13391
13392 if (num != CONST1_RTX (mode))
13393 {
13394 /* As the approximate reciprocal of DEN is already calculated, only
13395 calculate the approximate division when NUM is not 1.0. */
13396 rtx xnum = force_reg (mode, num);
13397 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
13398 }
13399
13400 /* Finalize the approximation. */
13401 aarch64_emit_mult (quo, pg, xrcp, xtmp);
13402 return true;
13403 }
13404
13405 /* Return the number of instructions that can be issued per cycle. */
13406 static int
13407 aarch64_sched_issue_rate (void)
13408 {
13409 return aarch64_tune_params.issue_rate;
13410 }
13411
13412 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
13413 static int
13414 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
13415 {
13416 if (DEBUG_INSN_P (insn))
13417 return more;
13418
13419 rtx_code code = GET_CODE (PATTERN (insn));
13420 if (code == USE || code == CLOBBER)
13421 return more;
13422
13423 if (get_attr_type (insn) == TYPE_NO_INSN)
13424 return more;
13425
13426 return more - 1;
13427 }
13428
13429 static int
13430 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
13431 {
13432 int issue_rate = aarch64_sched_issue_rate ();
13433
13434 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
13435 }
13436
13437
13438 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
13439 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
13440 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
13441
13442 static int
13443 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
13444 int ready_index)
13445 {
13446 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
13447 }
13448
13449
13450 /* Vectorizer cost model target hooks. */
13451
13452 /* Implement targetm.vectorize.builtin_vectorization_cost. */
13453 static int
13454 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
13455 tree vectype,
13456 int misalign ATTRIBUTE_UNUSED)
13457 {
13458 unsigned elements;
13459 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
13460 bool fp = false;
13461
13462 if (vectype != NULL)
13463 fp = FLOAT_TYPE_P (vectype);
13464
13465 switch (type_of_cost)
13466 {
13467 case scalar_stmt:
13468 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
13469
13470 case scalar_load:
13471 return costs->scalar_load_cost;
13472
13473 case scalar_store:
13474 return costs->scalar_store_cost;
13475
13476 case vector_stmt:
13477 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
13478
13479 case vector_load:
13480 return costs->vec_align_load_cost;
13481
13482 case vector_store:
13483 return costs->vec_store_cost;
13484
13485 case vec_to_scalar:
13486 return costs->vec_to_scalar_cost;
13487
13488 case scalar_to_vec:
13489 return costs->scalar_to_vec_cost;
13490
13491 case unaligned_load:
13492 case vector_gather_load:
13493 return costs->vec_unalign_load_cost;
13494
13495 case unaligned_store:
13496 case vector_scatter_store:
13497 return costs->vec_unalign_store_cost;
13498
13499 case cond_branch_taken:
13500 return costs->cond_taken_branch_cost;
13501
13502 case cond_branch_not_taken:
13503 return costs->cond_not_taken_branch_cost;
13504
13505 case vec_perm:
13506 return costs->vec_permute_cost;
13507
13508 case vec_promote_demote:
13509 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
13510
13511 case vec_construct:
13512 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
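      /* E.g. an estimated 4-element construct is costed as 4 / 2 + 1 = 3.  */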
13513 return elements / 2 + 1;
13514
13515 default:
13516 gcc_unreachable ();
13517 }
13518 }
13519
13520 /* Return true if STMT_INFO extends the result of a load. */
13521 static bool
13522 aarch64_extending_load_p (stmt_vec_info stmt_info)
13523 {
13524 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13525 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13526 return false;
13527
13528 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
13529 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13530 tree rhs_type = TREE_TYPE (rhs);
13531 if (!INTEGRAL_TYPE_P (lhs_type)
13532 || !INTEGRAL_TYPE_P (rhs_type)
13533 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
13534 return false;
13535
13536 stmt_vec_info def_stmt_info = stmt_info->vinfo->lookup_def (rhs);
13537 return (def_stmt_info
13538 && STMT_VINFO_DATA_REF (def_stmt_info)
13539 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
13540 }
13541
13542 /* Return true if STMT_INFO is an integer truncation. */
13543 static bool
13544 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
13545 {
13546 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13547 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13548 return false;
13549
13550 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13551 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
13552 return (INTEGRAL_TYPE_P (lhs_type)
13553 && INTEGRAL_TYPE_P (rhs_type)
13554 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
13555 }
13556
13557 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
13558 for STMT_INFO, which has cost kind KIND. Adjust the cost as necessary
13559 for SVE targets. */
13560 static unsigned int
13561 aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
13562 unsigned int stmt_cost)
13563 {
13564 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13565 vector register size or number of units. Integer promotions of this
13566 type therefore map to SXT[BHW] or UXT[BHW].
13567
13568 Most loads have extending forms that can do the sign or zero extension
13569 on the fly. Optimistically assume that a load followed by an extension
13570 will fold to this form during combine, and that the extension therefore
13571 comes for free. */
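  /* E.g. a load feeding a sign extension is expected to combine into an
     extending load such as SVE's LD1SB or LD1SH (illustrative examples).  */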
13572 if (kind == vector_stmt && aarch64_extending_load_p (stmt_info))
13573 stmt_cost = 0;
13574
13575 /* For similar reasons, vector_stmt integer truncations are a no-op,
13576 because we can just ignore the unused upper bits of the source. */
13577 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13578 stmt_cost = 0;
13579
13580 return stmt_cost;
13581 }
13582
13583 /* Implement targetm.vectorize.add_stmt_cost. */
13584 static unsigned
13585 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
13586 struct _stmt_vec_info *stmt_info, int misalign,
13587 enum vect_cost_model_location where)
13588 {
13589 unsigned *cost = (unsigned *) data;
13590 unsigned retval = 0;
13591
13592 if (flag_vect_cost_model)
13593 {
13594 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
13595 int stmt_cost =
13596 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13597
13598 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
13599 stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, stmt_cost);
13600
13601 /* Statements in an inner loop relative to the loop being
13602 vectorized are weighted more heavily. The value here is
13603 arbitrary and could potentially be improved with analysis. */
13604 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
13605 count *= 50; /* FIXME */
13606
13607 retval = (unsigned) (count * stmt_cost);
13608 cost[where] += retval;
13609 }
13610
13611 return retval;
13612 }
13613
13614 static void initialize_aarch64_code_model (struct gcc_options *);
13615
13616 /* Parse the TO_PARSE string and put the architecture struct that it
13617 selects into RES and the architectural features into ISA_FLAGS.
13618 Return an aarch64_parse_opt_result describing the parse result.
13619 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13620 When the TO_PARSE string contains an invalid extension,
13621 a copy of the string is created and stored to INVALID_EXTENSION. */
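/* For example, "armv8.2-a+sve" (an illustrative value) matches the
   "armv8.2-a" architecture entry and hands "+sve" to
   aarch64_parse_extension.  */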
13622
13623 static enum aarch64_parse_opt_result
13624 aarch64_parse_arch (const char *to_parse, const struct processor **res,
13625 uint64_t *isa_flags, std::string *invalid_extension)
13626 {
13627 const char *ext;
13628 const struct processor *arch;
13629 size_t len;
13630
13631 ext = strchr (to_parse, '+');
13632
13633 if (ext != NULL)
13634 len = ext - to_parse;
13635 else
13636 len = strlen (to_parse);
13637
13638 if (len == 0)
13639 return AARCH64_PARSE_MISSING_ARG;
13640
13641
13642 /* Loop through the list of supported ARCHes to find a match. */
13643 for (arch = all_architectures; arch->name != NULL; arch++)
13644 {
13645 if (strlen (arch->name) == len
13646 && strncmp (arch->name, to_parse, len) == 0)
13647 {
13648 uint64_t isa_temp = arch->flags;
13649
13650 if (ext != NULL)
13651 {
13652 /* TO_PARSE string contains at least one extension. */
13653 enum aarch64_parse_opt_result ext_res
13654 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13655
13656 if (ext_res != AARCH64_PARSE_OK)
13657 return ext_res;
13658 }
13659 /* Extension parsing was successful. Confirm the result
13660 arch and ISA flags. */
13661 *res = arch;
13662 *isa_flags = isa_temp;
13663 return AARCH64_PARSE_OK;
13664 }
13665 }
13666
13667 /* ARCH name not found in list. */
13668 return AARCH64_PARSE_INVALID_ARG;
13669 }
13670
13671 /* Parse the TO_PARSE string and put the result tuning in RES and the
13672 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
13673 describing the parse result. If there is an error parsing, RES and
13674 ISA_FLAGS are left unchanged.
13675 When the TO_PARSE string contains an invalid extension,
13676 a copy of the string is created and stored to INVALID_EXTENSION. */
13677
13678 static enum aarch64_parse_opt_result
13679 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
13680 uint64_t *isa_flags, std::string *invalid_extension)
13681 {
13682 const char *ext;
13683 const struct processor *cpu;
13684 size_t len;
13685
13686 ext = strchr (to_parse, '+');
13687
13688 if (ext != NULL)
13689 len = ext - to_parse;
13690 else
13691 len = strlen (to_parse);
13692
13693 if (len == 0)
13694 return AARCH64_PARSE_MISSING_ARG;
13695
13696
13697 /* Loop through the list of supported CPUs to find a match. */
13698 for (cpu = all_cores; cpu->name != NULL; cpu++)
13699 {
13700 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
13701 {
13702 uint64_t isa_temp = cpu->flags;
13703
13704
13705 if (ext != NULL)
13706 {
13707 /* TO_PARSE string contains at least one extension. */
13708 enum aarch64_parse_opt_result ext_res
13709 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13710
13711 if (ext_res != AARCH64_PARSE_OK)
13712 return ext_res;
13713 }
13714 /* Extension parsing was successful. Confirm the result
13715 cpu and ISA flags. */
13716 *res = cpu;
13717 *isa_flags = isa_temp;
13718 return AARCH64_PARSE_OK;
13719 }
13720 }
13721
13722 /* CPU name not found in list. */
13723 return AARCH64_PARSE_INVALID_ARG;
13724 }
13725
13726 /* Parse the TO_PARSE string and put the cpu it selects into RES.
13727 Return an aarch64_parse_opt_result describing the parse result.
13728 If the parsing fails the RES does not change. */
13729
13730 static enum aarch64_parse_opt_result
13731 aarch64_parse_tune (const char *to_parse, const struct processor **res)
13732 {
13733 const struct processor *cpu;
13734
13735 /* Loop through the list of supported CPUs to find a match. */
13736 for (cpu = all_cores; cpu->name != NULL; cpu++)
13737 {
13738 if (strcmp (cpu->name, to_parse) == 0)
13739 {
13740 *res = cpu;
13741 return AARCH64_PARSE_OK;
13742 }
13743 }
13744
13745 /* CPU name not found in list. */
13746 return AARCH64_PARSE_INVALID_ARG;
13747 }
13748
13749 /* Parse TOKEN, which has length LENGTH, to see if it is an option
13750 described in FLAG. If it is, return the index bit for that fusion type.
13751 If not, error (printing OPTION_NAME) and return zero. */
13752
13753 static unsigned int
13754 aarch64_parse_one_option_token (const char *token,
13755 size_t length,
13756 const struct aarch64_flag_desc *flag,
13757 const char *option_name)
13758 {
13759 for (; flag->name != NULL; flag++)
13760 {
13761 if (length == strlen (flag->name)
13762 && !strncmp (flag->name, token, length))
13763 return flag->flag;
13764 }
13765
13766 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
13767 return 0;
13768 }
13769
13770 /* Parse OPTION which is a comma-separated list of flags to enable.
13771 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
13772 default state we inherit from the CPU tuning structures. OPTION_NAME
13773 gives the top-level option we are parsing in the -moverride string,
13774 for use in error messages. */
13775
13776 static unsigned int
13777 aarch64_parse_boolean_options (const char *option,
13778 const struct aarch64_flag_desc *flags,
13779 unsigned int initial_state,
13780 const char *option_name)
13781 {
13782 const char separator = '.';
13783 const char* specs = option;
13784 const char* ntoken = option;
13785 unsigned int found_flags = initial_state;
13786
13787 while ((ntoken = strchr (specs, separator)))
13788 {
13789 size_t token_length = ntoken - specs;
13790 unsigned token_ops = aarch64_parse_one_option_token (specs,
13791 token_length,
13792 flags,
13793 option_name);
13794 /* If we find "none" (or, for simplicity's sake, an error) anywhere
13795 in the token stream, reset the supported operations. So:
13796
13797 adrp+add.cmp+branch.none.adrp+add
13798
13799 would have the result of turning on only adrp+add fusion. */
13800 if (!token_ops)
13801 found_flags = 0;
13802
13803 found_flags |= token_ops;
13804 specs = ++ntoken;
13805 }
13806
13807 /* The option string ended with a trailing separator; diagnose it. */
13808 if (!(*specs))
13809 {
13810 error ("%s string ill-formed\n", option_name);
13811 return 0;
13812 }
13813
13814 /* We still have one more token to parse. */
13815 size_t token_length = strlen (specs);
13816 unsigned token_ops = aarch64_parse_one_option_token (specs,
13817 token_length,
13818 flags,
13819 option_name);
13820 if (!token_ops)
13821 found_flags = 0;
13822
13823 found_flags |= token_ops;
13824 return found_flags;
13825 }
13826
13827 /* Support for overriding instruction fusion. */
13828
13829 static void
13830 aarch64_parse_fuse_string (const char *fuse_string,
13831 struct tune_params *tune)
13832 {
13833 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
13834 aarch64_fusible_pairs,
13835 tune->fusible_ops,
13836 "fuse=");
13837 }
13838
13839 /* Support for overriding other tuning flags. */
13840
13841 static void
13842 aarch64_parse_tune_string (const char *tune_string,
13843 struct tune_params *tune)
13844 {
13845 tune->extra_tuning_flags
13846 = aarch64_parse_boolean_options (tune_string,
13847 aarch64_tuning_flags,
13848 tune->extra_tuning_flags,
13849 "tune=");
13850 }
13851
13852 /* Parse the sve_width tuning -moverride string in TUNE_STRING.
13853 Accept the valid SVE vector widths allowed by
13854 aarch64_sve_vector_bits_enum and use it to override sve_width
13855 in TUNE. */
13856
13857 static void
13858 aarch64_parse_sve_width_string (const char *tune_string,
13859 struct tune_params *tune)
13860 {
13861 int width = -1;
13862
13863 int n = sscanf (tune_string, "%d", &width);
13864 if (n == EOF)
13865 {
13866 error ("invalid format for sve_width");
13867 return;
13868 }
13869 switch (width)
13870 {
13871 case SVE_128:
13872 case SVE_256:
13873 case SVE_512:
13874 case SVE_1024:
13875 case SVE_2048:
13876 break;
13877 default:
13878 error ("invalid sve_width value: %d", width);
13879 }
13880 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
13881 }
13882
13883 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
13884 we understand. If it is, extract the option string and hand it off to
13885 the appropriate function. */
13886
13887 void
13888 aarch64_parse_one_override_token (const char* token,
13889 size_t length,
13890 struct tune_params *tune)
13891 {
13892 const struct aarch64_tuning_override_function *fn
13893 = aarch64_tuning_override_functions;
13894
13895 const char *option_part = strchr (token, '=');
13896 if (!option_part)
13897 {
13898 error ("tuning string missing in option (%s)", token);
13899 return;
13900 }
13901
13902 /* Get the length of the option name. */
13903 length = option_part - token;
13904 /* Skip the '=' to get to the option string. */
13905 option_part++;
13906
13907 for (; fn->name != NULL; fn++)
13908 {
13909 if (!strncmp (fn->name, token, length))
13910 {
13911 fn->parse_override (option_part, tune);
13912 return;
13913 }
13914 }
13915
13916 error ("unknown tuning option (%s)",token);
13917 return;
13918 }
13919
13920 /* Validate and clamp the TLS size according to the code model in OPTS. */
13921
13922 static void
13923 initialize_aarch64_tls_size (struct gcc_options *opts)
13924 {
13925 if (aarch64_tls_size == 0)
13926 aarch64_tls_size = 24;
13927
13928 switch (opts->x_aarch64_cmodel_var)
13929 {
13930 case AARCH64_CMODEL_TINY:
13931 /* Both the default and maximum TLS size allowed under tiny are 1M, which
13932 needs two instructions to address, so we clamp the size to 24. */
13933 if (aarch64_tls_size > 24)
13934 aarch64_tls_size = 24;
13935 break;
13936 case AARCH64_CMODEL_SMALL:
13937 /* The maximum TLS size allowed under small is 4G. */
13938 if (aarch64_tls_size > 32)
13939 aarch64_tls_size = 32;
13940 break;
13941 case AARCH64_CMODEL_LARGE:
13942 /* The maximum TLS size allowed under large is 16E.
13943 FIXME: 16E should be 64-bit; we only support a 48-bit offset now. */
13944 if (aarch64_tls_size > 48)
13945 aarch64_tls_size = 48;
13946 break;
13947 default:
13948 gcc_unreachable ();
13949 }
13950
13951 return;
13952 }
13953
13954 /* Parse STRING looking for options in the format:
13955 string :: option:string
13956 option :: name=substring
13957 name :: {a-z}
13958 substring :: defined by option. */
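/* For example, "sve_width=256:fuse=adrp+add" (illustrative values) is split
   at ':' into two name=substring options.  */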
13959
13960 static void
13961 aarch64_parse_override_string (const char* input_string,
13962 struct tune_params* tune)
13963 {
13964 const char separator = ':';
13965 size_t string_length = strlen (input_string) + 1;
13966 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
13967 char *string = string_root;
13968 strncpy (string, input_string, string_length);
13969 string[string_length - 1] = '\0';
13970
13971 char* ntoken = string;
13972
13973 while ((ntoken = strchr (string, separator)))
13974 {
13975 size_t token_length = ntoken - string;
13976 /* Make this substring look like a string. */
13977 *ntoken = '\0';
13978 aarch64_parse_one_override_token (string, token_length, tune);
13979 string = ++ntoken;
13980 }
13981
13982 /* One last option to parse. */
13983 aarch64_parse_one_override_token (string, strlen (string), tune);
13984 free (string_root);
13985 }
13986
13987
13988 static void
13989 aarch64_override_options_after_change_1 (struct gcc_options *opts)
13990 {
13991 if (accepted_branch_protection_string)
13992 {
13993 opts->x_aarch64_branch_protection_string
13994 = xstrdup (accepted_branch_protection_string);
13995 }
13996
13997 /* PR 70044: We have to be careful about being called multiple times for the
13998 same function. This means all changes should be repeatable. */
13999
14000 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
14001 Disable the frame pointer flag so the mid-end will not use a frame
14002 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
14003 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
14004 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
14005 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
14006 if (opts->x_flag_omit_frame_pointer == 0)
14007 opts->x_flag_omit_frame_pointer = 2;
14008
14009 /* If not optimizing for size, set the default
14010 alignment to what the target wants. */
14011 if (!opts->x_optimize_size)
14012 {
14013 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
14014 opts->x_str_align_loops = aarch64_tune_params.loop_align;
14015 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
14016 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
14017 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
14018 opts->x_str_align_functions = aarch64_tune_params.function_align;
14019 }
14020
14021 /* We default to no pc-relative literal loads. */
14022
14023 aarch64_pcrelative_literal_loads = false;
14024
14025 /* If -mpc-relative-literal-loads is set on the command line, this
14026 implies that the user asked for PC relative literal loads. */
14027 if (opts->x_pcrelative_literal_loads == 1)
14028 aarch64_pcrelative_literal_loads = true;
14029
14030 /* In the tiny memory model it makes no sense to disallow PC relative
14031 literal pool loads. */
14032 if (aarch64_cmodel == AARCH64_CMODEL_TINY
14033 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14034 aarch64_pcrelative_literal_loads = true;
14035
14036 /* When enabling the lower precision Newton series for the square root, also
14037 enable it for the reciprocal square root, since the latter is an
14038 intermediary step for the former. */
14039 if (flag_mlow_precision_sqrt)
14040 flag_mrecip_low_precision_sqrt = true;
14041 }
14042
14043 /* 'Unpack' the internal tuning structs and update the options
14044 in OPTS. The caller must have set up selected_tune and selected_arch
14045 as all the other target-specific codegen decisions are
14046 derived from them. */
14047
14048 void
14049 aarch64_override_options_internal (struct gcc_options *opts)
14050 {
14051 aarch64_tune_flags = selected_tune->flags;
14052 aarch64_tune = selected_tune->sched_core;
14053 /* Make a copy of the tuning parameters attached to the core, which
14054 we may later overwrite. */
14055 aarch64_tune_params = *(selected_tune->tune);
14056 aarch64_architecture_version = selected_arch->architecture_version;
14057
14058 if (opts->x_aarch64_override_tune_string)
14059 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
14060 &aarch64_tune_params);
14061
14062 /* This target defaults to strict volatile bitfields. */
14063 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
14064 opts->x_flag_strict_volatile_bitfields = 1;
14065
14066 if (aarch64_stack_protector_guard == SSP_GLOBAL
14067 && opts->x_aarch64_stack_protector_guard_offset_str)
14068 {
14069 error ("incompatible options %<-mstack-protector-guard=global%> and "
14070 "%<-mstack-protector-guard-offset=%s%>",
14071 aarch64_stack_protector_guard_offset_str);
14072 }
14073
14074 if (aarch64_stack_protector_guard == SSP_SYSREG
14075 && !(opts->x_aarch64_stack_protector_guard_offset_str
14076 && opts->x_aarch64_stack_protector_guard_reg_str))
14077 {
14078 error ("both %<-mstack-protector-guard-offset%> and "
14079 "%<-mstack-protector-guard-reg%> must be used "
14080 "with %<-mstack-protector-guard=sysreg%>");
14081 }
14082
14083 if (opts->x_aarch64_stack_protector_guard_reg_str)
14084 {
14085 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
14086 error ("specify a system register with a small string length");
14087 }
14088
14089 if (opts->x_aarch64_stack_protector_guard_offset_str)
14090 {
14091 char *end;
14092 const char *str = aarch64_stack_protector_guard_offset_str;
14093 errno = 0;
14094 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
14095 if (!*str || *end || errno)
14096 error ("%qs is not a valid offset in %qs", str,
14097 "-mstack-protector-guard-offset=");
14098 aarch64_stack_protector_guard_offset = offs;
14099 }
14100
14101 initialize_aarch64_code_model (opts);
14102 initialize_aarch64_tls_size (opts);
14103
14104 int queue_depth = 0;
14105 switch (aarch64_tune_params.autoprefetcher_model)
14106 {
14107 case tune_params::AUTOPREFETCHER_OFF:
14108 queue_depth = -1;
14109 break;
14110 case tune_params::AUTOPREFETCHER_WEAK:
14111 queue_depth = 0;
14112 break;
14113 case tune_params::AUTOPREFETCHER_STRONG:
14114 queue_depth = max_insn_queue_index + 1;
14115 break;
14116 default:
14117 gcc_unreachable ();
14118 }
14119
14120 /* We don't mind passing in global_options_set here as we don't use
14121 the *options_set structs anyway. */
14122 SET_OPTION_IF_UNSET (opts, &global_options_set,
14123 param_sched_autopref_queue_depth, queue_depth);
14124
14125 /* Set up parameters to be used in prefetching algorithm. Do not
14126 override the defaults unless we are tuning for a core we have
14127 researched values for. */
14128 if (aarch64_tune_params.prefetch->num_slots > 0)
14129 SET_OPTION_IF_UNSET (opts, &global_options_set,
14130 param_simultaneous_prefetches,
14131 aarch64_tune_params.prefetch->num_slots);
14132 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
14133 SET_OPTION_IF_UNSET (opts, &global_options_set,
14134 param_l1_cache_size,
14135 aarch64_tune_params.prefetch->l1_cache_size);
14136 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
14137 SET_OPTION_IF_UNSET (opts, &global_options_set,
14138 param_l1_cache_line_size,
14139 aarch64_tune_params.prefetch->l1_cache_line_size);
14140 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
14141 SET_OPTION_IF_UNSET (opts, &global_options_set,
14142 param_l2_cache_size,
14143 aarch64_tune_params.prefetch->l2_cache_size);
14144 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
14145 SET_OPTION_IF_UNSET (opts, &global_options_set,
14146 param_prefetch_dynamic_strides, 0);
14147 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
14148 SET_OPTION_IF_UNSET (opts, &global_options_set,
14149 param_prefetch_minimum_stride,
14150 aarch64_tune_params.prefetch->minimum_stride);
14151
14152 /* Use the alternative scheduling-pressure algorithm by default. */
14153 SET_OPTION_IF_UNSET (opts, &global_options_set,
14154 param_sched_pressure_algorithm,
14155 SCHED_PRESSURE_MODEL);
14156
14157 /* Validate the guard size. */
14158 int guard_size = param_stack_clash_protection_guard_size;
14159
14160 if (guard_size != 12 && guard_size != 16)
14161 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
14162 "size. Given value %d (%llu KB) is out of range",
14163 guard_size, (1ULL << guard_size) / 1024ULL);
14164
14165 /* Enforce that the probing interval is the same as the guard size so the
14166 mid-end does the right thing. */
14167 SET_OPTION_IF_UNSET (opts, &global_options_set,
14168 param_stack_clash_protection_probe_interval,
14169 guard_size);
14170
14171 /* The SET_OPTION_IF_UNSET calls won't update the value if the user has
14172 explicitly set one, which means we need to validate that the probing
14173 interval and the guard size are equal. */
14174 int probe_interval
14175 = param_stack_clash_protection_probe_interval;
14176 if (guard_size != probe_interval)
14177 error ("stack clash guard size %<%d%> must be equal to probing interval "
14178 "%<%d%>", guard_size, probe_interval);
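/* Worked example: --param stack-clash-protection-guard-size=16 selects a
   64 KB guard (1 << 16 bytes) and, unless the user set it explicitly,
   also forces --param stack-clash-protection-probe-interval to 16; an
   explicit probe interval of 12 combined with a guard size of 16 is
   rejected by the check above.  */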
14179
14180 /* Enable software prefetching at the specified optimization level for
14181 CPUs that have prefetch tuning data. Lower the optimization level
14182 threshold by 1 when profiling is enabled. */
14183 if (opts->x_flag_prefetch_loop_arrays < 0
14184 && !opts->x_optimize_size
14185 && aarch64_tune_params.prefetch->default_opt_level >= 0
14186 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
14187 opts->x_flag_prefetch_loop_arrays = 1;
14188
14189 if (opts->x_aarch64_arch_string == NULL)
14190 opts->x_aarch64_arch_string = selected_arch->name;
14191 if (opts->x_aarch64_cpu_string == NULL)
14192 opts->x_aarch64_cpu_string = selected_cpu->name;
14193 if (opts->x_aarch64_tune_string == NULL)
14194 opts->x_aarch64_tune_string = selected_tune->name;
14195
14196 aarch64_override_options_after_change_1 (opts);
14197 }
14198
14199 /* Print a hint with a suggestion for a core or architecture name that
14200 most closely resembles what the user passed in STR. ARCH is true if
14201 the user is asking for an architecture name. ARCH is false if the user
14202 is asking for a core name. */
14203
14204 static void
14205 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
14206 {
14207 auto_vec<const char *> candidates;
14208 const struct processor *entry = arch ? all_architectures : all_cores;
14209 for (; entry->name != NULL; entry++)
14210 candidates.safe_push (entry->name);
14211
14212 #ifdef HAVE_LOCAL_CPU_DETECT
14213 /* Also add "native" as a possible value. */
14214 if (arch)
14215 candidates.safe_push ("native");
14216 #endif
14217
14218 char *s;
14219 const char *hint = candidates_list_and_hint (str, s, candidates);
14220 if (hint)
14221 inform (input_location, "valid arguments are: %s;"
14222 " did you mean %qs?", s, hint);
14223 else
14224 inform (input_location, "valid arguments are: %s", s);
14225
14226 XDELETEVEC (s);
14227 }
14228
14229 /* Print a hint with a suggestion for a core name that most closely resembles
14230 what the user passed in STR. */
14231
14232 inline static void
14233 aarch64_print_hint_for_core (const char *str)
14234 {
14235 aarch64_print_hint_for_core_or_arch (str, false);
14236 }
14237
14238 /* Print a hint with a suggestion for an architecture name that most closely
14239 resembles what the user passed in STR. */
14240
14241 inline static void
14242 aarch64_print_hint_for_arch (const char *str)
14243 {
14244 aarch64_print_hint_for_core_or_arch (str, true);
14245 }
14246
14247
14248 /* Print a hint with a suggestion for an extension name
14249 that most closely resembles what the user passed in STR. */
14250
14251 void
14252 aarch64_print_hint_for_extensions (const std::string &str)
14253 {
14254 auto_vec<const char *> candidates;
14255 aarch64_get_all_extension_candidates (&candidates);
14256 char *s;
14257 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
14258 if (hint)
14259 inform (input_location, "valid arguments are: %s;"
14260 " did you mean %qs?", s, hint);
14261 else
14262 inform (input_location, "valid arguments are: %s", s);
14263
14264 XDELETEVEC (s);
14265 }
14266
14267 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
14268 specified in STR and throw errors if appropriate. Put the results,
14269 if they are valid, in RES and ISA_FLAGS. Return whether the option is
14270 valid. */
14271
14272 static bool
14273 aarch64_validate_mcpu (const char *str, const struct processor **res,
14274 uint64_t *isa_flags)
14275 {
14276 std::string invalid_extension;
14277 enum aarch64_parse_opt_result parse_res
14278 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
14279
14280 if (parse_res == AARCH64_PARSE_OK)
14281 return true;
14282
14283 switch (parse_res)
14284 {
14285 case AARCH64_PARSE_MISSING_ARG:
14286 error ("missing cpu name in %<-mcpu=%s%>", str);
14287 break;
14288 case AARCH64_PARSE_INVALID_ARG:
14289 error ("unknown value %qs for %<-mcpu%>", str);
14290 aarch64_print_hint_for_core (str);
14291 break;
14292 case AARCH64_PARSE_INVALID_FEATURE:
14293 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
14294 invalid_extension.c_str (), str);
14295 aarch64_print_hint_for_extensions (invalid_extension);
14296 break;
14297 default:
14298 gcc_unreachable ();
14299 }
14300
14301 return false;
14302 }
14303
14304 /* Parses CONST_STR for branch protection features specified in
14305 aarch64_branch_protect_types, and sets any global variables required.
14306 Returns the parsing result and copies the last processed token from
14307 CONST_STR into LAST_STR so that it can be used for error reporting. */
14308
14309 static enum aarch64_parse_opt_result
14310 aarch64_parse_branch_protection (const char *const_str,
14311 char **last_str)
14312 {
14313 char *str_root = xstrdup (const_str);
14314 char* token_save = NULL;
14315 char *str = strtok_r (str_root, "+", &token_save);
14316 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
14317 if (!str)
14318 res = AARCH64_PARSE_MISSING_ARG;
14319 else
14320 {
14321 char *next_str = strtok_r (NULL, "+", &token_save);
14322 /* Reset the branch protection features to their defaults. */
14323 aarch64_handle_no_branch_protection (NULL, NULL);
14324
14325 while (str && res == AARCH64_PARSE_OK)
14326 {
14327 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
14328 bool found = false;
14329 /* Search for this type. */
14330 while (type && type->name && !found && res == AARCH64_PARSE_OK)
14331 {
14332 if (strcmp (str, type->name) == 0)
14333 {
14334 found = true;
14335 res = type->handler (str, next_str);
14336 str = next_str;
14337 next_str = strtok_r (NULL, "+", &token_save);
14338 }
14339 else
14340 type++;
14341 }
14342 if (found && res == AARCH64_PARSE_OK)
14343 {
14344 bool found_subtype = true;
14345 /* Loop through each token until we find one that isn't a
14346 subtype. */
14347 while (found_subtype)
14348 {
14349 found_subtype = false;
14350 const aarch64_branch_protect_type *subtype = type->subtypes;
14351 /* Search for the subtype. */
14352 while (str && subtype && subtype->name && !found_subtype
14353 && res == AARCH64_PARSE_OK)
14354 {
14355 if (strcmp (str, subtype->name) == 0)
14356 {
14357 found_subtype = true;
14358 res = subtype->handler (str, next_str);
14359 str = next_str;
14360 next_str = strtok_r (NULL, "+", &token_save);
14361 }
14362 else
14363 subtype++;
14364 }
14365 }
14366 }
14367 else if (!found)
14368 res = AARCH64_PARSE_INVALID_ARG;
14369 }
14370 }
14371 /* Copy the last processed token into the argument to pass it back.
14372 Used by option and attribute validation to print the offending token. */
14373 if (last_str)
14374 {
14375 if (str) strcpy (*last_str, str);
14376 else *last_str = NULL;
14377 }
14378 if (res == AARCH64_PARSE_OK)
14379 {
14380 /* If needed, alloc the accepted string then copy in const_str.
14381 Used by aarch64_override_options_after_change_1. */
14382 if (!accepted_branch_protection_string)
14383 accepted_branch_protection_string = (char *) xmalloc (
14384 BRANCH_PROTECT_STR_MAX
14385 + 1);
14386 strncpy (accepted_branch_protection_string, const_str,
14387 BRANCH_PROTECT_STR_MAX + 1);
14388 /* Forcibly null-terminate. */
14389 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
14390 }
14391 return res;
14392 }
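/* For example, -mbranch-protection=pac-ret+leaf+bti is split at '+' into
   "pac-ret", "leaf" and "bti": "pac-ret" matches a top-level type, "leaf"
   is then accepted as one of its subtypes, and "bti" matches another
   top-level type.  A token that matches neither, e.g. the "foo" in
   "pac-ret+foo", leaves the loops with AARCH64_PARSE_INVALID_ARG.  */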
14393
14394 static bool
14395 aarch64_validate_mbranch_protection (const char *const_str)
14396 {
14397 char *str = (char *) xmalloc (strlen (const_str) + 1);
14398 enum aarch64_parse_opt_result res =
14399 aarch64_parse_branch_protection (const_str, &str);
14400 if (res == AARCH64_PARSE_INVALID_ARG)
14401 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
14402 else if (res == AARCH64_PARSE_MISSING_ARG)
14403 error ("missing argument for %<-mbranch-protection=%>");
14404 free (str);
14405 return res == AARCH64_PARSE_OK;
14406 }
14407
14408 /* Validate a command-line -march option. Parse the arch and extensions
14409 (if any) specified in STR and throw errors if appropriate. Put the
14410 results, if they are valid, in RES and ISA_FLAGS. Return whether the
14411 option is valid. */
14412
14413 static bool
14414 aarch64_validate_march (const char *str, const struct processor **res,
14415 uint64_t *isa_flags)
14416 {
14417 std::string invalid_extension;
14418 enum aarch64_parse_opt_result parse_res
14419 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
14420
14421 if (parse_res == AARCH64_PARSE_OK)
14422 return true;
14423
14424 switch (parse_res)
14425 {
14426 case AARCH64_PARSE_MISSING_ARG:
14427 error ("missing arch name in %<-march=%s%>", str);
14428 break;
14429 case AARCH64_PARSE_INVALID_ARG:
14430 error ("unknown value %qs for %<-march%>", str);
14431 aarch64_print_hint_for_arch (str);
14432 break;
14433 case AARCH64_PARSE_INVALID_FEATURE:
14434 error ("invalid feature modifier %qs in %<-march=%s%>",
14435 invalid_extension.c_str (), str);
14436 aarch64_print_hint_for_extensions (invalid_extension);
14437 break;
14438 default:
14439 gcc_unreachable ();
14440 }
14441
14442 return false;
14443 }
14444
14445 /* Validate a command-line -mtune option. Parse the cpu
14446 specified in STR and throw errors if appropriate. Put the
14447 result, if it is valid, in RES. Return whether the option is
14448 valid. */
14449
14450 static bool
14451 aarch64_validate_mtune (const char *str, const struct processor **res)
14452 {
14453 enum aarch64_parse_opt_result parse_res
14454 = aarch64_parse_tune (str, res);
14455
14456 if (parse_res == AARCH64_PARSE_OK)
14457 return true;
14458
14459 switch (parse_res)
14460 {
14461 case AARCH64_PARSE_MISSING_ARG:
14462 error ("missing cpu name in %<-mtune=%s%>", str);
14463 break;
14464 case AARCH64_PARSE_INVALID_ARG:
14465 error ("unknown value %qs for %<-mtune%>", str);
14466 aarch64_print_hint_for_core (str);
14467 break;
14468 default:
14469 gcc_unreachable ();
14470 }
14471 return false;
14472 }
14473
14474 /* Return the CPU corresponding to the enum CPU.
14475 If it doesn't specify a cpu, return the default. */
14476
14477 static const struct processor *
14478 aarch64_get_tune_cpu (enum aarch64_processor cpu)
14479 {
14480 if (cpu != aarch64_none)
14481 return &all_cores[cpu];
14482
14483 /* The & 0x3f is to extract the bottom 6 bits that encode the
14484 default cpu as selected by the --with-cpu GCC configure option
14485 in config.gcc.
14486 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
14487 flags mechanism should be reworked to make it more sane. */
14488 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14489 }
14490
14491 /* Return the architecture corresponding to the enum ARCH.
14492 If it doesn't specify a valid architecture, return the default. */
14493
14494 static const struct processor *
14495 aarch64_get_arch (enum aarch64_arch arch)
14496 {
14497 if (arch != aarch64_no_arch)
14498 return &all_architectures[arch];
14499
14500 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14501
14502 return &all_architectures[cpu->arch];
14503 }
14504
14505 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
14506
14507 static poly_uint16
14508 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
14509 {
14510 /* 128-bit SVE and Advanced SIMD modes use different register layouts
14511 on big-endian targets, so we would need to forbid subregs that convert
14512 from one to the other. By default a reinterpret sequence would then
14513 involve a store to memory in one mode and a load back in the other.
14514 Even if we optimize that sequence using reverse instructions,
14515 it would still be a significant potential overhead.
14516
14517 For now, it seems better to generate length-agnostic code for that
14518 case instead. */
14519 if (value == SVE_SCALABLE
14520 || (value == SVE_128 && BYTES_BIG_ENDIAN))
14521 return poly_uint16 (2, 2);
14522 else
14523 return (int) value / 64;
14524 }
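/* For example: -msve-vector-bits=256 yields 256 / 64 = 4, i.e. VG = 4
   (four 64-bit granules per SVE vector) and -msve-vector-bits=512 yields
   VG = 8, both compile-time constants.  -msve-vector-bits=scalable (and
   the big-endian 128-bit case) instead yields the length-agnostic
   poly_uint16 (2, 2), i.e. 2 + 2 * x granules for a runtime x >= 0.  */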
14525
14526 /* Implement TARGET_OPTION_OVERRIDE. This is called once at the beginning
14527 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
14528 tuning structs. In particular it must set selected_tune and
14529 aarch64_isa_flags that define the available ISA features and tuning
14530 decisions. It must also set selected_arch as this will be used to
14531 output the .arch asm tags for each function. */
14532
14533 static void
14534 aarch64_override_options (void)
14535 {
14536 uint64_t cpu_isa = 0;
14537 uint64_t arch_isa = 0;
14538 aarch64_isa_flags = 0;
14539
14540 bool valid_cpu = true;
14541 bool valid_tune = true;
14542 bool valid_arch = true;
14543
14544 selected_cpu = NULL;
14545 selected_arch = NULL;
14546 selected_tune = NULL;
14547
14548 if (aarch64_branch_protection_string)
14549 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
14550
14551 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
14552 If either of -march or -mtune is given, they override their
14553 respective component of -mcpu. */
14554 if (aarch64_cpu_string)
14555 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
14556 &cpu_isa);
14557
14558 if (aarch64_arch_string)
14559 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
14560 &arch_isa);
14561
14562 if (aarch64_tune_string)
14563 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
14564
14565 #ifdef SUBTARGET_OVERRIDE_OPTIONS
14566 SUBTARGET_OVERRIDE_OPTIONS;
14567 #endif
14568
14569 /* If the user did not specify a processor, choose the default
14570 one for them. This will be the CPU set during configuration using
14571 --with-cpu, otherwise it is "generic". */
14572 if (!selected_cpu)
14573 {
14574 if (selected_arch)
14575 {
14576 selected_cpu = &all_cores[selected_arch->ident];
14577 aarch64_isa_flags = arch_isa;
14578 explicit_arch = selected_arch->arch;
14579 }
14580 else
14581 {
14582 /* Get default configure-time CPU. */
14583 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
14584 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
14585 }
14586
14587 if (selected_tune)
14588 explicit_tune_core = selected_tune->ident;
14589 }
14590 /* If both -mcpu and -march are specified check that they are architecturally
14591 compatible, warn if they're not and prefer the -march ISA flags. */
14592 else if (selected_arch)
14593 {
14594 if (selected_arch->arch != selected_cpu->arch)
14595 {
14596 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
14597 aarch64_cpu_string,
14598 aarch64_arch_string);
14599 }
14600 aarch64_isa_flags = arch_isa;
14601 explicit_arch = selected_arch->arch;
14602 explicit_tune_core = selected_tune ? selected_tune->ident
14603 : selected_cpu->ident;
14604 }
14605 else
14606 {
14607 /* -mcpu but no -march. */
14608 aarch64_isa_flags = cpu_isa;
14609 explicit_tune_core = selected_tune ? selected_tune->ident
14610 : selected_cpu->ident;
14611 gcc_assert (selected_cpu);
14612 selected_arch = &all_architectures[selected_cpu->arch];
14613 explicit_arch = selected_arch->arch;
14614 }
14615
14616 /* Set the arch as well, as we will need it when outputting
14617 the .arch directive in assembly. */
14618 if (!selected_arch)
14619 {
14620 gcc_assert (selected_cpu);
14621 selected_arch = &all_architectures[selected_cpu->arch];
14622 }
14623
14624 if (!selected_tune)
14625 selected_tune = selected_cpu;
14626
14627 if (aarch64_enable_bti == 2)
14628 {
14629 #ifdef TARGET_ENABLE_BTI
14630 aarch64_enable_bti = 1;
14631 #else
14632 aarch64_enable_bti = 0;
14633 #endif
14634 }
14635
14636 /* Return address signing is currently not supported for ILP32 targets. For
14637 LP64 targets use the configured option in the absence of a command-line
14638 option for -mbranch-protection. */
14639 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
14640 {
14641 #ifdef TARGET_ENABLE_PAC_RET
14642 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
14643 #else
14644 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
14645 #endif
14646 }
14647
14648 #ifndef HAVE_AS_MABI_OPTION
14649 /* The compiler may have been configured with 2.23.* binutils, which does
14650 not have support for ILP32. */
14651 if (TARGET_ILP32)
14652 error ("assembler does not support %<-mabi=ilp32%>");
14653 #endif
14654
14655 /* Convert -msve-vector-bits to a VG count. */
14656 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
14657
14658 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
14659 sorry ("return address signing is only supported for %<-mabi=lp64%>");
14660
14661 /* Make sure we properly set up the explicit options. */
14662 if ((aarch64_cpu_string && valid_cpu)
14663 || (aarch64_tune_string && valid_tune))
14664 gcc_assert (explicit_tune_core != aarch64_none);
14665
14666 if ((aarch64_cpu_string && valid_cpu)
14667 || (aarch64_arch_string && valid_arch))
14668 gcc_assert (explicit_arch != aarch64_no_arch);
14669
14670 /* The pass to insert speculation tracking runs before
14671 shrink-wrapping and the latter does not know how to update the
14672 tracking status. So disable it in this case. */
14673 if (aarch64_track_speculation)
14674 flag_shrink_wrap = 0;
14675
14676 aarch64_override_options_internal (&global_options);
14677
14678 /* Save these options as the default ones in case we push and pop them later
14679 while processing functions with potential target attributes. */
14680 target_option_default_node = target_option_current_node
14681 = build_target_option_node (&global_options);
14682 }
14683
14684 /* Implement targetm.override_options_after_change. */
14685
14686 static void
14687 aarch64_override_options_after_change (void)
14688 {
14689 aarch64_override_options_after_change_1 (&global_options);
14690 }
14691
14692 static struct machine_function *
14693 aarch64_init_machine_status (void)
14694 {
14695 struct machine_function *machine;
14696 machine = ggc_cleared_alloc<machine_function> ();
14697 return machine;
14698 }
14699
14700 void
14701 aarch64_init_expanders (void)
14702 {
14703 init_machine_status = aarch64_init_machine_status;
14704 }
14705
14706 /* A checking mechanism for the implementation of the various code models. */
14707 static void
14708 initialize_aarch64_code_model (struct gcc_options *opts)
14709 {
14710 if (opts->x_flag_pic)
14711 {
14712 switch (opts->x_aarch64_cmodel_var)
14713 {
14714 case AARCH64_CMODEL_TINY:
14715 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
14716 break;
14717 case AARCH64_CMODEL_SMALL:
14718 #ifdef HAVE_AS_SMALL_PIC_RELOCS
14719 aarch64_cmodel = (flag_pic == 2
14720 ? AARCH64_CMODEL_SMALL_PIC
14721 : AARCH64_CMODEL_SMALL_SPIC);
14722 #else
14723 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
14724 #endif
14725 break;
14726 case AARCH64_CMODEL_LARGE:
14727 sorry ("code model %qs with %<-f%s%>", "large",
14728 opts->x_flag_pic > 1 ? "PIC" : "pic");
14729 break;
14730 default:
14731 gcc_unreachable ();
14732 }
14733 }
14734 else
14735 aarch64_cmodel = opts->x_aarch64_cmodel_var;
14736 }
14737
14738 /* Implement TARGET_OPTION_SAVE. */
14739
14740 static void
14741 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
14742 {
14743 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
14744 ptr->x_aarch64_branch_protection_string
14745 = opts->x_aarch64_branch_protection_string;
14746 }
14747
14748 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
14749 using the information saved in PTR. */
14750
14751 static void
14752 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
14753 {
14754 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
14755 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14756 opts->x_explicit_arch = ptr->x_explicit_arch;
14757 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
14758 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
14759 opts->x_aarch64_branch_protection_string
14760 = ptr->x_aarch64_branch_protection_string;
14761 if (opts->x_aarch64_branch_protection_string)
14762 {
14763 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
14764 NULL);
14765 }
14766
14767 aarch64_override_options_internal (opts);
14768 }
14769
14770 /* Implement TARGET_OPTION_PRINT. */
14771
14772 static void
14773 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
14774 {
14775 const struct processor *cpu
14776 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14777 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
14778 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
14779 std::string extension
14780 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
14781
14782 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
14783 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
14784 arch->name, extension.c_str ());
14785 }
14786
14787 static GTY(()) tree aarch64_previous_fndecl;
14788
14789 void
14790 aarch64_reset_previous_fndecl (void)
14791 {
14792 aarch64_previous_fndecl = NULL;
14793 }
14794
14795 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
14796 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
14797 make sure optab availability predicates are recomputed when necessary. */
14798
14799 void
14800 aarch64_save_restore_target_globals (tree new_tree)
14801 {
14802 if (TREE_TARGET_GLOBALS (new_tree))
14803 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
14804 else if (new_tree == target_option_default_node)
14805 restore_target_globals (&default_target_globals);
14806 else
14807 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
14808 }
14809
14810 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
14811 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
14812 of the function, if such exists. This function may be called multiple
14813 times on a single function so use aarch64_previous_fndecl to avoid
14814 setting up identical state. */
14815
14816 static void
14817 aarch64_set_current_function (tree fndecl)
14818 {
14819 if (!fndecl || fndecl == aarch64_previous_fndecl)
14820 return;
14821
14822 tree old_tree = (aarch64_previous_fndecl
14823 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
14824 : NULL_TREE);
14825
14826 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14827
14828 /* If current function has no attributes but the previous one did,
14829 use the default node. */
14830 if (!new_tree && old_tree)
14831 new_tree = target_option_default_node;
14832
14833 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
14834 the default have been handled by aarch64_save_restore_target_globals from
14835 aarch64_pragma_target_parse. */
14836 if (old_tree == new_tree)
14837 return;
14838
14839 aarch64_previous_fndecl = fndecl;
14840
14841 /* First set the target options. */
14842 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
14843
14844 aarch64_save_restore_target_globals (new_tree);
14845 }
14846
14847 /* Enum describing the various ways we can handle attributes.
14848 In many cases we can reuse the generic option handling machinery. */
14849
14850 enum aarch64_attr_opt_type
14851 {
14852 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
14853 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
14854 aarch64_attr_enum, /* Attribute sets an enum variable. */
14855 aarch64_attr_custom /* Attribute requires a custom handling function. */
14856 };
14857
14858 /* All the information needed to handle a target attribute.
14859 NAME is the name of the attribute.
14860 ATTR_TYPE specifies the type of behavior of the attribute as described
14861 in the definition of enum aarch64_attr_opt_type.
14862 ALLOW_NEG is true if the attribute supports a "no-" form.
14863 HANDLER is the function that takes the attribute string as an argument
14864 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
14865 OPT_NUM is the enum specifying the option that the attribute modifies.
14866 This is needed for attributes that mirror the behavior of a command-line
14867 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
14868 aarch64_attr_enum. */
14869
14870 struct aarch64_attribute_info
14871 {
14872 const char *name;
14873 enum aarch64_attr_opt_type attr_type;
14874 bool allow_neg;
14875 bool (*handler) (const char *);
14876 enum opt_code opt_num;
14877 };
14878
14879 /* Handle the ARCH_STR argument to the arch= target attribute. */
14880
14881 static bool
14882 aarch64_handle_attr_arch (const char *str)
14883 {
14884 const struct processor *tmp_arch = NULL;
14885 std::string invalid_extension;
14886 enum aarch64_parse_opt_result parse_res
14887 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
14888
14889 if (parse_res == AARCH64_PARSE_OK)
14890 {
14891 gcc_assert (tmp_arch);
14892 selected_arch = tmp_arch;
14893 explicit_arch = selected_arch->arch;
14894 return true;
14895 }
14896
14897 switch (parse_res)
14898 {
14899 case AARCH64_PARSE_MISSING_ARG:
14900 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
14901 break;
14902 case AARCH64_PARSE_INVALID_ARG:
14903 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
14904 aarch64_print_hint_for_arch (str);
14905 break;
14906 case AARCH64_PARSE_INVALID_FEATURE:
14907 error ("invalid feature modifier %s of value (\"%s\") in "
14908 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14909 aarch64_print_hint_for_extensions (invalid_extension);
14910 break;
14911 default:
14912 gcc_unreachable ();
14913 }
14914
14915 return false;
14916 }
14917
14918 /* Handle the argument CPU_STR to the cpu= target attribute. */
14919
14920 static bool
14921 aarch64_handle_attr_cpu (const char *str)
14922 {
14923 const struct processor *tmp_cpu = NULL;
14924 std::string invalid_extension;
14925 enum aarch64_parse_opt_result parse_res
14926 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
14927
14928 if (parse_res == AARCH64_PARSE_OK)
14929 {
14930 gcc_assert (tmp_cpu);
14931 selected_tune = tmp_cpu;
14932 explicit_tune_core = selected_tune->ident;
14933
14934 selected_arch = &all_architectures[tmp_cpu->arch];
14935 explicit_arch = selected_arch->arch;
14936 return true;
14937 }
14938
14939 switch (parse_res)
14940 {
14941 case AARCH64_PARSE_MISSING_ARG:
14942 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
14943 break;
14944 case AARCH64_PARSE_INVALID_ARG:
14945 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
14946 aarch64_print_hint_for_core (str);
14947 break;
14948 case AARCH64_PARSE_INVALID_FEATURE:
14949 error ("invalid feature modifier %s of value (\"%s\") in "
14950 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14951 aarch64_print_hint_for_extensions (invalid_extension);
14952 break;
14953 default:
14954 gcc_unreachable ();
14955 }
14956
14957 return false;
14958 }
14959
14960 /* Handle the argument STR to the branch-protection= attribute. */
14961
14962 static bool
14963 aarch64_handle_attr_branch_protection (const char* str)
14964 {
14965 char *err_str = (char *) xmalloc (strlen (str) + 1);
14966 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
14967 &err_str);
14968 bool success = false;
14969 switch (res)
14970 {
14971 case AARCH64_PARSE_MISSING_ARG:
14972 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
14973 " attribute");
14974 break;
14975 case AARCH64_PARSE_INVALID_ARG:
14976 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
14977 "=\")%> pragma or attribute", err_str);
14978 break;
14979 case AARCH64_PARSE_OK:
14980 success = true;
14981 /* Fall through. */
14982 case AARCH64_PARSE_INVALID_FEATURE:
14983 break;
14984 default:
14985 gcc_unreachable ();
14986 }
14987 free (err_str);
14988 return success;
14989 }
14990
14991 /* Handle the argument STR to the tune= target attribute. */
14992
14993 static bool
14994 aarch64_handle_attr_tune (const char *str)
14995 {
14996 const struct processor *tmp_tune = NULL;
14997 enum aarch64_parse_opt_result parse_res
14998 = aarch64_parse_tune (str, &tmp_tune);
14999
15000 if (parse_res == AARCH64_PARSE_OK)
15001 {
15002 gcc_assert (tmp_tune);
15003 selected_tune = tmp_tune;
15004 explicit_tune_core = selected_tune->ident;
15005 return true;
15006 }
15007
15008 switch (parse_res)
15009 {
15010 case AARCH64_PARSE_INVALID_ARG:
15011 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
15012 aarch64_print_hint_for_core (str);
15013 break;
15014 default:
15015 gcc_unreachable ();
15016 }
15017
15018 return false;
15019 }
15020
15021 /* Parse an architecture extensions target attribute string specified in STR.
15022 For example "+fp+nosimd". Show any errors if needed. Return TRUE
15023 if successful. Update aarch64_isa_flags to reflect the ISA features
15024 modified. */
15025
15026 static bool
15027 aarch64_handle_attr_isa_flags (char *str)
15028 {
15029 enum aarch64_parse_opt_result parse_res;
15030 uint64_t isa_flags = aarch64_isa_flags;
15031
15032 /* We allow "+nothing" in the beginning to clear out all architectural
15033 features if the user wants to handpick specific features. */
15034 if (strncmp ("+nothing", str, 8) == 0)
15035 {
15036 isa_flags = 0;
15037 str += 8;
15038 }
15039
15040 std::string invalid_extension;
15041 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
15042
15043 if (parse_res == AARCH64_PARSE_OK)
15044 {
15045 aarch64_isa_flags = isa_flags;
15046 return true;
15047 }
15048
15049 switch (parse_res)
15050 {
15051 case AARCH64_PARSE_MISSING_ARG:
15052 error ("missing value in %<target()%> pragma or attribute");
15053 break;
15054
15055 case AARCH64_PARSE_INVALID_FEATURE:
15056 error ("invalid feature modifier %s of value (\"%s\") in "
15057 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15058 break;
15059
15060 default:
15061 gcc_unreachable ();
15062 }
15063
15064 return false;
15065 }
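/* For example, __attribute__ ((target ("+nothing+crc"))) first zeroes the
   local copy of the ISA flags because of the leading "+nothing" and then
   turns CRC (and anything it implies) back on, whereas a plain "+crc"
   adds CRC on top of the flags already in effect.  */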
15066
15067 /* The target attributes that we support. On top of these we also support just
15068 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
15069 handled explicitly in aarch64_process_one_target_attr. */
15070
15071 static const struct aarch64_attribute_info aarch64_attributes[] =
15072 {
15073 { "general-regs-only", aarch64_attr_mask, false, NULL,
15074 OPT_mgeneral_regs_only },
15075 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
15076 OPT_mfix_cortex_a53_835769 },
15077 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
15078 OPT_mfix_cortex_a53_843419 },
15079 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
15080 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
15081 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
15082 OPT_momit_leaf_frame_pointer },
15083 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
15084 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
15085 OPT_march_ },
15086 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
15087 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
15088 OPT_mtune_ },
15089 { "branch-protection", aarch64_attr_custom, false,
15090 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
15091 { "sign-return-address", aarch64_attr_enum, false, NULL,
15092 OPT_msign_return_address_ },
15093 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
15094 };
15095
15096 /* Parse ARG_STR, which contains the definition of one target attribute.
15097 Report appropriate errors if it is invalid; return true if it is valid. */
15098
15099 static bool
15100 aarch64_process_one_target_attr (char *arg_str)
15101 {
15102 bool invert = false;
15103
15104 size_t len = strlen (arg_str);
15105
15106 if (len == 0)
15107 {
15108 error ("malformed %<target()%> pragma or attribute");
15109 return false;
15110 }
15111
15112 char *str_to_check = (char *) alloca (len + 1);
15113 strcpy (str_to_check, arg_str);
15114
15115 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
15116 It is easier to detect and handle it explicitly here rather than going
15117 through the machinery for the rest of the target attributes in this
15118 function. */
15119 if (*str_to_check == '+')
15120 return aarch64_handle_attr_isa_flags (str_to_check);
15121
15122 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
15123 {
15124 invert = true;
15125 str_to_check += 3;
15126 }
15127 char *arg = strchr (str_to_check, '=');
15128
15129 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
15130 and point ARG to "foo". */
15131 if (arg)
15132 {
15133 *arg = '\0';
15134 arg++;
15135 }
15136 const struct aarch64_attribute_info *p_attr;
15137 bool found = false;
15138 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
15139 {
15140 /* If the names don't match up, or the user has given an argument
15141 to an attribute that doesn't accept one, or didn't give an argument
15142 to an attribute that expects one, fail to match. */
15143 if (strcmp (str_to_check, p_attr->name) != 0)
15144 continue;
15145
15146 found = true;
15147 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
15148 || p_attr->attr_type == aarch64_attr_enum;
15149
15150 if (attr_need_arg_p ^ (arg != NULL))
15151 {
15152 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
15153 return false;
15154 }
15155
15156 /* If the name matches but the attribute does not allow "no-" versions
15157 then we can't match. */
15158 if (invert && !p_attr->allow_neg)
15159 {
15160 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
15161 return false;
15162 }
15163
15164 switch (p_attr->attr_type)
15165 {
15166 /* Has a custom handler registered.
15167 For example, cpu=, arch=, tune=. */
15168 case aarch64_attr_custom:
15169 gcc_assert (p_attr->handler);
15170 if (!p_attr->handler (arg))
15171 return false;
15172 break;
15173
15174 /* Either set or unset a boolean option. */
15175 case aarch64_attr_bool:
15176 {
15177 struct cl_decoded_option decoded;
15178
15179 generate_option (p_attr->opt_num, NULL, !invert,
15180 CL_TARGET, &decoded);
15181 aarch64_handle_option (&global_options, &global_options_set,
15182 &decoded, input_location);
15183 break;
15184 }
15185 /* Set or unset a bit in the target_flags. aarch64_handle_option
15186 should know what mask to apply given the option number. */
15187 case aarch64_attr_mask:
15188 {
15189 struct cl_decoded_option decoded;
15190 /* We only need to specify the option number.
15191 aarch64_handle_option will know which mask to apply. */
15192 decoded.opt_index = p_attr->opt_num;
15193 decoded.value = !invert;
15194 aarch64_handle_option (&global_options, &global_options_set,
15195 &decoded, input_location);
15196 break;
15197 }
15198 /* Use the option setting machinery to set an option to an enum. */
15199 case aarch64_attr_enum:
15200 {
15201 gcc_assert (arg);
15202 bool valid;
15203 int value;
15204 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
15205 &value, CL_TARGET);
15206 if (valid)
15207 {
15208 set_option (&global_options, NULL, p_attr->opt_num, value,
15209 NULL, DK_UNSPECIFIED, input_location,
15210 global_dc);
15211 }
15212 else
15213 {
15214 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
15215 }
15216 break;
15217 }
15218 default:
15219 gcc_unreachable ();
15220 }
15221 }
15222
15223 /* If we reached here we either have found an attribute and validated
15224 it or didn't match any. If we matched an attribute but its arguments
15225 were malformed we will have returned false already. */
15226 return found;
15227 }
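/* For example, aarch64_process_one_target_attr accepts (among others):
     "arch=armv8-a"      - custom handler, argument required
     "no-strict-align"   - mask attribute used in its negated form
     "cmodel=small"      - enum attribute, argument required
   while "strict-align=yes" (unexpected argument) and "no-cpu=foo"
   (negated form of an attribute that has none) are rejected with an
   error, and a leading '+', as in "+crc", is diverted to
   aarch64_handle_attr_isa_flags before the attribute table is consulted.  */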
15228
15229 /* Count how many times the character C appears in
15230 NULL-terminated string STR. */
15231
15232 static unsigned int
15233 num_occurences_in_str (char c, char *str)
15234 {
15235 unsigned int res = 0;
15236 while (*str != '\0')
15237 {
15238 if (*str == c)
15239 res++;
15240
15241 str++;
15242 }
15243
15244 return res;
15245 }
15246
15247 /* Parse the tree in ARGS that contains the target attribute information
15248 and update the global target options space. */
15249
15250 bool
15251 aarch64_process_target_attr (tree args)
15252 {
15253 if (TREE_CODE (args) == TREE_LIST)
15254 {
15255 do
15256 {
15257 tree head = TREE_VALUE (args);
15258 if (head)
15259 {
15260 if (!aarch64_process_target_attr (head))
15261 return false;
15262 }
15263 args = TREE_CHAIN (args);
15264 } while (args);
15265
15266 return true;
15267 }
15268
15269 if (TREE_CODE (args) != STRING_CST)
15270 {
15271 error ("attribute %<target%> argument not a string");
15272 return false;
15273 }
15274
15275 size_t len = strlen (TREE_STRING_POINTER (args));
15276 char *str_to_check = (char *) alloca (len + 1);
15277 strcpy (str_to_check, TREE_STRING_POINTER (args));
15278
15279 if (len == 0)
15280 {
15281 error ("malformed %<target()%> pragma or attribute");
15282 return false;
15283 }
15284
15285 /* Used to catch empty strings between consecutive commas, i.e.
15286 attribute ((target ("attr1,,attr2"))). */
15287 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
15288
15289 /* Handle multiple target attributes separated by ','. */
15290 char *token = strtok_r (str_to_check, ",", &str_to_check);
15291
15292 unsigned int num_attrs = 0;
15293 while (token)
15294 {
15295 num_attrs++;
15296 if (!aarch64_process_one_target_attr (token))
15297 {
15298 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
15299 return false;
15300 }
15301
15302 token = strtok_r (NULL, ",", &str_to_check);
15303 }
15304
15305 if (num_attrs != num_commas + 1)
15306 {
15307 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
15308 return false;
15309 }
15310
15311 return true;
15312 }
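/* For example, __attribute__ ((target ("arch=armv8-a,no-strict-align")))
   is split into two tokens, each processed by
   aarch64_process_one_target_attr, whereas "arch=armv8-a,,no-strict-align"
   is rejected: it contains two commas but strtok_r only yields two tokens,
   so num_attrs != num_commas + 1 and the string is reported as malformed.  */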
15313
15314 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
15315 process attribute ((target ("..."))). */
15316
15317 static bool
15318 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
15319 {
15320 struct cl_target_option cur_target;
15321 bool ret;
15322 tree old_optimize;
15323 tree new_target, new_optimize;
15324 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15325
15326 /* If what we're processing is the current pragma string then the
15327 target option node is already stored in target_option_current_node
15328 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
15329 having to re-parse the string. This is especially useful to keep
15330 arm_neon.h compile times down since that header contains a lot
15331 of intrinsics enclosed in pragmas. */
15332 if (!existing_target && args == current_target_pragma)
15333 {
15334 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
15335 return true;
15336 }
15337 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15338
15339 old_optimize = build_optimization_node (&global_options);
15340 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15341
15342 /* If the function changed the optimization levels as well as setting
15343 target options, start with the optimizations specified. */
15344 if (func_optimize && func_optimize != old_optimize)
15345 cl_optimization_restore (&global_options,
15346 TREE_OPTIMIZATION (func_optimize));
15347
15348 /* Save the current target options to restore at the end. */
15349 cl_target_option_save (&cur_target, &global_options);
15350
15351 /* If fndecl already has some target attributes applied to it, unpack
15352 them so that we add this attribute on top of them, rather than
15353 overwriting them. */
15354 if (existing_target)
15355 {
15356 struct cl_target_option *existing_options
15357 = TREE_TARGET_OPTION (existing_target);
15358
15359 if (existing_options)
15360 cl_target_option_restore (&global_options, existing_options);
15361 }
15362 else
15363 cl_target_option_restore (&global_options,
15364 TREE_TARGET_OPTION (target_option_current_node));
15365
15366 ret = aarch64_process_target_attr (args);
15367
15368 /* Set up any additional state. */
15369 if (ret)
15370 {
15371 aarch64_override_options_internal (&global_options);
15372 /* Initialize SIMD builtins if we haven't already.
15373 Set current_target_pragma to NULL for the duration so that
15374 the builtin initialization code doesn't try to tag the functions
15375 being built with the attributes specified by any current pragma, thus
15376 going into an infinite recursion. */
15377 if (TARGET_SIMD)
15378 {
15379 tree saved_current_target_pragma = current_target_pragma;
15380 current_target_pragma = NULL;
15381 aarch64_init_simd_builtins ();
15382 current_target_pragma = saved_current_target_pragma;
15383 }
15384 new_target = build_target_option_node (&global_options);
15385 }
15386 else
15387 new_target = NULL;
15388
15389 new_optimize = build_optimization_node (&global_options);
15390
15391 if (fndecl && ret)
15392 {
15393 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
15394
15395 if (old_optimize != new_optimize)
15396 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
15397 }
15398
15399 cl_target_option_restore (&global_options, &cur_target);
15400
15401 if (old_optimize != new_optimize)
15402 cl_optimization_restore (&global_options,
15403 TREE_OPTIMIZATION (old_optimize));
15404 return ret;
15405 }
15406
15407 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
15408 tri-bool options (yes, no, don't care) and the default value is
15409 DEF, determine whether to reject inlining. */
15410
15411 static bool
15412 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
15413 int dont_care, int def)
15414 {
15415 /* If the callee doesn't care, always allow inlining. */
15416 if (callee == dont_care)
15417 return true;
15418
15419 /* If the caller doesn't care, always allow inlining. */
15420 if (caller == dont_care)
15421 return true;
15422
15423 /* Otherwise, allow inlining if either the callee and caller values
15424 agree, or if the callee is using the default value. */
15425 return (callee == caller || callee == def);
15426 }
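/* For example, with DONT_CARE == 2 and DEF == 1 (the
   -momit-leaf-frame-pointer check below):

     caller  callee  inline?
       2       any     yes    (caller does not care)
       any     2       yes    (callee does not care)
       0       0       yes    (explicit values agree)
       0       1       yes    (callee uses the default)
       1       0       no     (explicit, non-default mismatch)  */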
15427
15428 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
15429 to inline CALLEE into CALLER based on target-specific info.
15430 Make sure that the caller and callee have compatible architectural
15431 features. Then go through the other possible target attributes
15432 and see if they can block inlining. Try not to reject always_inline
15433 callees unless they are incompatible architecturally. */
15434
15435 static bool
15436 aarch64_can_inline_p (tree caller, tree callee)
15437 {
15438 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
15439 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
15440
15441 struct cl_target_option *caller_opts
15442 = TREE_TARGET_OPTION (caller_tree ? caller_tree
15443 : target_option_default_node);
15444
15445 struct cl_target_option *callee_opts
15446 = TREE_TARGET_OPTION (callee_tree ? callee_tree
15447 : target_option_default_node);
15448
15449 /* Callee's ISA flags should be a subset of the caller's. */
15450 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
15451 != callee_opts->x_aarch64_isa_flags)
15452 return false;
15453
15454 /* Allow a non-strict-align function to be inlined into a
15455 strict-align one, but not the other way around. */
15456 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
15457 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
15458 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
15459 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
15460 return false;
15461
15462 bool always_inline = lookup_attribute ("always_inline",
15463 DECL_ATTRIBUTES (callee));
15464
15465 /* If the architectural features match up and the callee is always_inline
15466 then the other attributes don't matter. */
15467 if (always_inline)
15468 return true;
15469
15470 if (caller_opts->x_aarch64_cmodel_var
15471 != callee_opts->x_aarch64_cmodel_var)
15472 return false;
15473
15474 if (caller_opts->x_aarch64_tls_dialect
15475 != callee_opts->x_aarch64_tls_dialect)
15476 return false;
15477
15478 /* Honour explicit requests to workaround errata. */
15479 if (!aarch64_tribools_ok_for_inlining_p (
15480 caller_opts->x_aarch64_fix_a53_err835769,
15481 callee_opts->x_aarch64_fix_a53_err835769,
15482 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
15483 return false;
15484
15485 if (!aarch64_tribools_ok_for_inlining_p (
15486 caller_opts->x_aarch64_fix_a53_err843419,
15487 callee_opts->x_aarch64_fix_a53_err843419,
15488 2, TARGET_FIX_ERR_A53_843419))
15489 return false;
15490
15491 /* If the user explicitly specified -momit-leaf-frame-pointer for the
15492 caller and callee and they don't match up, reject inlining. */
15493 if (!aarch64_tribools_ok_for_inlining_p (
15494 caller_opts->x_flag_omit_leaf_frame_pointer,
15495 callee_opts->x_flag_omit_leaf_frame_pointer,
15496 2, 1))
15497 return false;
15498
15499 /* If the callee has specific tuning overrides, respect them. */
15500 if (callee_opts->x_aarch64_override_tune_string != NULL
15501 && caller_opts->x_aarch64_override_tune_string == NULL)
15502 return false;
15503
15504 /* If the user specified tuning override strings for the
15505 caller and callee and they don't match up, reject inlining.
15506 We just do a string compare here, we don't analyze the meaning
15507 of the string, as it would be too costly for little gain. */
15508 if (callee_opts->x_aarch64_override_tune_string
15509 && caller_opts->x_aarch64_override_tune_string
15510 && (strcmp (callee_opts->x_aarch64_override_tune_string,
15511 caller_opts->x_aarch64_override_tune_string) != 0))
15512 return false;
15513
15514 return true;
15515 }
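/* For example, under the ISA subset check above a callee compiled with
   +sve cannot be inlined into a caller compiled without SVE, while a
   plain callee can be inlined into an SVE-enabled caller as far as ISA
   flags are concerned; the remaining checks can still reject the pair,
   e.g. when their -moverride tuning strings differ.  */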
15516
15517 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
15518 hasn't been initialized already. */
15519
15520 unsigned int
15521 aarch64_tlsdesc_abi_id ()
15522 {
15523 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
15524 if (!tlsdesc_abi.initialized_p ())
15525 {
15526 HARD_REG_SET full_reg_clobbers;
15527 CLEAR_HARD_REG_SET (full_reg_clobbers);
15528 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
15529 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
15530 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
15531 SET_HARD_REG_BIT (full_reg_clobbers, regno);
15532 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
15533 }
15534 return tlsdesc_abi.id ();
15535 }
15536
15537 /* Return true if SYMBOL_REF X binds locally. */
15538
15539 static bool
15540 aarch64_symbol_binds_local_p (const_rtx x)
15541 {
15542 return (SYMBOL_REF_DECL (x)
15543 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
15544 : SYMBOL_REF_LOCAL_P (x));
15545 }
15546
15547 /* Return true if SYMBOL_REF X is thread-local. */
15548 static bool
15549 aarch64_tls_symbol_p (rtx x)
15550 {
15551 if (! TARGET_HAVE_TLS)
15552 return false;
15553
15554 if (GET_CODE (x) != SYMBOL_REF)
15555 return false;
15556
15557 return SYMBOL_REF_TLS_MODEL (x) != 0;
15558 }
15559
15560 /* Classify a TLS symbol into one of the TLS kinds. */
15561 enum aarch64_symbol_type
15562 aarch64_classify_tls_symbol (rtx x)
15563 {
15564 enum tls_model tls_kind = tls_symbolic_operand_type (x);
15565
15566 switch (tls_kind)
15567 {
15568 case TLS_MODEL_GLOBAL_DYNAMIC:
15569 case TLS_MODEL_LOCAL_DYNAMIC:
15570 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
15571
15572 case TLS_MODEL_INITIAL_EXEC:
15573 switch (aarch64_cmodel)
15574 {
15575 case AARCH64_CMODEL_TINY:
15576 case AARCH64_CMODEL_TINY_PIC:
15577 return SYMBOL_TINY_TLSIE;
15578 default:
15579 return SYMBOL_SMALL_TLSIE;
15580 }
15581
15582 case TLS_MODEL_LOCAL_EXEC:
15583 if (aarch64_tls_size == 12)
15584 return SYMBOL_TLSLE12;
15585 else if (aarch64_tls_size == 24)
15586 return SYMBOL_TLSLE24;
15587 else if (aarch64_tls_size == 32)
15588 return SYMBOL_TLSLE32;
15589 else if (aarch64_tls_size == 48)
15590 return SYMBOL_TLSLE48;
15591 else
15592 gcc_unreachable ();
15593
15594 case TLS_MODEL_EMULATED:
15595 case TLS_MODEL_NONE:
15596 return SYMBOL_FORCE_TO_MEM;
15597
15598 default:
15599 gcc_unreachable ();
15600 }
15601 }
15602
15603 /* Return the correct method for accessing X + OFFSET, where X is either
15604 a SYMBOL_REF or LABEL_REF. */
15605
15606 enum aarch64_symbol_type
15607 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
15608 {
15609 if (GET_CODE (x) == LABEL_REF)
15610 {
15611 switch (aarch64_cmodel)
15612 {
15613 case AARCH64_CMODEL_LARGE:
15614 return SYMBOL_FORCE_TO_MEM;
15615
15616 case AARCH64_CMODEL_TINY_PIC:
15617 case AARCH64_CMODEL_TINY:
15618 return SYMBOL_TINY_ABSOLUTE;
15619
15620 case AARCH64_CMODEL_SMALL_SPIC:
15621 case AARCH64_CMODEL_SMALL_PIC:
15622 case AARCH64_CMODEL_SMALL:
15623 return SYMBOL_SMALL_ABSOLUTE;
15624
15625 default:
15626 gcc_unreachable ();
15627 }
15628 }
15629
15630 if (GET_CODE (x) == SYMBOL_REF)
15631 {
15632 if (aarch64_tls_symbol_p (x))
15633 return aarch64_classify_tls_symbol (x);
15634
15635 switch (aarch64_cmodel)
15636 {
15637 case AARCH64_CMODEL_TINY:
15638 /* When we retrieve symbol + offset address, we have to make sure
15639 the offset does not cause overflow of the final address. But
15640 we have no way of knowing the address of symbol at compile time
15641 so we can't accurately say if the distance between the PC and
15642 symbol + offset is outside the addressable range of +/-1MB in the
15643 TINY code model. So we limit the maximum offset to +/-64KB and
15644 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15645 If offset_within_block_p is true we allow larger offsets.
15646 Furthermore force to memory if the symbol is a weak reference to
15647 something that doesn't resolve to a symbol in this module. */
15648
15649 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15650 return SYMBOL_FORCE_TO_MEM;
15651 if (!(IN_RANGE (offset, -0x10000, 0x10000)
15652 || offset_within_block_p (x, offset)))
15653 return SYMBOL_FORCE_TO_MEM;
15654
15655 return SYMBOL_TINY_ABSOLUTE;
15656
15657 case AARCH64_CMODEL_SMALL:
15658 /* Same reasoning as the tiny code model, but the offset cap here is
15659 1MB, allowing +/-3.9GB for the offset to the symbol. */
15660
15661 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15662 return SYMBOL_FORCE_TO_MEM;
15663 if (!(IN_RANGE (offset, -0x100000, 0x100000)
15664 || offset_within_block_p (x, offset)))
15665 return SYMBOL_FORCE_TO_MEM;
15666
15667 return SYMBOL_SMALL_ABSOLUTE;
15668
15669 case AARCH64_CMODEL_TINY_PIC:
15670 if (!aarch64_symbol_binds_local_p (x))
15671 return SYMBOL_TINY_GOT;
15672 return SYMBOL_TINY_ABSOLUTE;
15673
15674 case AARCH64_CMODEL_SMALL_SPIC:
15675 case AARCH64_CMODEL_SMALL_PIC:
15676 if (!aarch64_symbol_binds_local_p (x))
15677 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
15678 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
15679 return SYMBOL_SMALL_ABSOLUTE;
15680
15681 case AARCH64_CMODEL_LARGE:
15682 /* This is alright even in PIC code as the constant
15683 pool reference is always PC relative and within
15684 the same translation unit. */
15685 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
15686 return SYMBOL_SMALL_ABSOLUTE;
15687 else
15688 return SYMBOL_FORCE_TO_MEM;
15689
15690 default:
15691 gcc_unreachable ();
15692 }
15693 }
15694
15695 /* By default push everything into the constant pool. */
15696 return SYMBOL_FORCE_TO_MEM;
15697 }
15698
15699 bool
15700 aarch64_constant_address_p (rtx x)
15701 {
15702 return (CONSTANT_P (x) && memory_address_p (DImode, x));
15703 }
15704
15705 bool
15706 aarch64_legitimate_pic_operand_p (rtx x)
15707 {
15708 if (GET_CODE (x) == SYMBOL_REF
15709 || (GET_CODE (x) == CONST
15710 && GET_CODE (XEXP (x, 0)) == PLUS
15711 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
15712 return false;
15713
15714 return true;
15715 }
15716
15717 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
15718 that should be rematerialized rather than spilled. */
15719
15720 static bool
15721 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
15722 {
15723 /* Support CSE and rematerialization of common constants. */
15724 if (CONST_INT_P (x)
15725 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
15726 || GET_CODE (x) == CONST_VECTOR)
15727 return true;
15728
15729 /* Do not allow vector struct mode constants for Advanced SIMD.
15730 We could support 0 and -1 easily, but they need support in
15731 aarch64-simd.md. */
15732 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15733 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15734 return false;
15735
15736 /* Only accept variable-length vector constants if they can be
15737 handled directly.
15738
15739 ??? It would be possible to handle rematerialization of other
15740 constants via secondary reloads. */
15741 if (vec_flags & VEC_ANY_SVE)
15742 return aarch64_simd_valid_immediate (x, NULL);
15743
15744 if (GET_CODE (x) == HIGH)
15745 x = XEXP (x, 0);
15746
15747 /* Accept polynomial constants that can be calculated by using the
15748 destination of a move as the sole temporary. Constants that
15749 require a second temporary cannot be rematerialized (they can't be
15750 forced to memory and also aren't legitimate constants). */
15751 poly_int64 offset;
15752 if (poly_int_rtx_p (x, &offset))
15753 return aarch64_offset_temporaries (false, offset) <= 1;
15754
15755 /* If an offset is being added to something else, we need to allow the
15756 base to be moved into the destination register, meaning that there
15757 are no free temporaries for the offset. */
15758 x = strip_offset (x, &offset);
15759 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
15760 return false;
15761
15762 /* Do not allow const (plus (anchor_symbol, const_int)). */
15763 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
15764 return false;
15765
15766 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
15767 so spilling them is better than rematerialization. */
15768 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
15769 return true;
15770
15771 /* Label references are always constant. */
15772 if (GET_CODE (x) == LABEL_REF)
15773 return true;
15774
15775 return false;
15776 }
15777
15778 rtx
15779 aarch64_load_tp (rtx target)
15780 {
15781 if (!target
15782 || GET_MODE (target) != Pmode
15783 || !register_operand (target, Pmode))
15784 target = gen_reg_rtx (Pmode);
15785
15786 /* Can return in any reg. */
15787 emit_insn (gen_aarch64_load_tp_hard (target));
15788 return target;
15789 }
15790
15791 /* On AAPCS systems, this is the "struct __va_list". */
15792 static GTY(()) tree va_list_type;
15793
15794 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
15795 Return the type to use as __builtin_va_list.
15796
15797 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
15798
15799 struct __va_list
15800 {
15801 void *__stack;
15802 void *__gr_top;
15803 void *__vr_top;
15804 int __gr_offs;
15805 int __vr_offs;
15806 }; */
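/* Illustration (assuming the register save areas are not shrunk by the
   tree-stdarg analysis): for "int f (int a, ...)", only w0 carries a named
   argument, so va_start below leaves:

   __gr_offs = -56 (x1-x7 saved, 7 * 8 bytes)
   __vr_offs = -128 (q0-q7 saved, 8 * 16 bytes)
   __gr_top / __vr_top = one byte past the respective save area
   __stack = address of the first anonymous argument passed on the stack.  */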
15807
15808 static tree
15809 aarch64_build_builtin_va_list (void)
15810 {
15811 tree va_list_name;
15812 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15813
15814 /* Create the type. */
15815 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
15816 /* Give it the required name. */
15817 va_list_name = build_decl (BUILTINS_LOCATION,
15818 TYPE_DECL,
15819 get_identifier ("__va_list"),
15820 va_list_type);
15821 DECL_ARTIFICIAL (va_list_name) = 1;
15822 TYPE_NAME (va_list_type) = va_list_name;
15823 TYPE_STUB_DECL (va_list_type) = va_list_name;
15824
15825 /* Create the fields. */
15826 f_stack = build_decl (BUILTINS_LOCATION,
15827 FIELD_DECL, get_identifier ("__stack"),
15828 ptr_type_node);
15829 f_grtop = build_decl (BUILTINS_LOCATION,
15830 FIELD_DECL, get_identifier ("__gr_top"),
15831 ptr_type_node);
15832 f_vrtop = build_decl (BUILTINS_LOCATION,
15833 FIELD_DECL, get_identifier ("__vr_top"),
15834 ptr_type_node);
15835 f_groff = build_decl (BUILTINS_LOCATION,
15836 FIELD_DECL, get_identifier ("__gr_offs"),
15837 integer_type_node);
15838 f_vroff = build_decl (BUILTINS_LOCATION,
15839 FIELD_DECL, get_identifier ("__vr_offs"),
15840 integer_type_node);
15841
15842 /* Tell the tree-stdarg pass about our internal offset fields.
15843 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
15844 purposes, to identify whether the code is updating the va_list internal
15845 offset fields in an irregular way. */
15846 va_list_gpr_counter_field = f_groff;
15847 va_list_fpr_counter_field = f_vroff;
15848
15849 DECL_ARTIFICIAL (f_stack) = 1;
15850 DECL_ARTIFICIAL (f_grtop) = 1;
15851 DECL_ARTIFICIAL (f_vrtop) = 1;
15852 DECL_ARTIFICIAL (f_groff) = 1;
15853 DECL_ARTIFICIAL (f_vroff) = 1;
15854
15855 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
15856 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
15857 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
15858 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
15859 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
15860
15861 TYPE_FIELDS (va_list_type) = f_stack;
15862 DECL_CHAIN (f_stack) = f_grtop;
15863 DECL_CHAIN (f_grtop) = f_vrtop;
15864 DECL_CHAIN (f_vrtop) = f_groff;
15865 DECL_CHAIN (f_groff) = f_vroff;
15866
15867 /* Compute its layout. */
15868 layout_type (va_list_type);
15869
15870 return va_list_type;
15871 }
15872
15873 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
15874 static void
15875 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
15876 {
15877 const CUMULATIVE_ARGS *cum;
15878 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15879 tree stack, grtop, vrtop, groff, vroff;
15880 tree t;
15881 int gr_save_area_size = cfun->va_list_gpr_size;
15882 int vr_save_area_size = cfun->va_list_fpr_size;
15883 int vr_offset;
15884
15885 cum = &crtl->args.info;
15886 if (cfun->va_list_gpr_size)
15887 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
15888 cfun->va_list_gpr_size);
15889 if (cfun->va_list_fpr_size)
15890 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
15891 * UNITS_PER_VREG, cfun->va_list_fpr_size);
15892
15893 if (!TARGET_FLOAT)
15894 {
15895 gcc_assert (cum->aapcs_nvrn == 0);
15896 vr_save_area_size = 0;
15897 }
15898
15899 f_stack = TYPE_FIELDS (va_list_type_node);
15900 f_grtop = DECL_CHAIN (f_stack);
15901 f_vrtop = DECL_CHAIN (f_grtop);
15902 f_groff = DECL_CHAIN (f_vrtop);
15903 f_vroff = DECL_CHAIN (f_groff);
15904
15905 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
15906 NULL_TREE);
15907 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
15908 NULL_TREE);
15909 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
15910 NULL_TREE);
15911 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
15912 NULL_TREE);
15913 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
15914 NULL_TREE);
15915
15916 /* Emit code to initialize STACK, which points to the next varargs stack
15917 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
15918 by named arguments. STACK is 8-byte aligned. */
15919 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
15920 if (cum->aapcs_stack_size > 0)
15921 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
15922 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
15923 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15924
15925 /* Emit code to initialize GRTOP, the top of the GR save area.
15926 virtual_incoming_args_rtx should have been 16-byte aligned. */
15927 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
15928 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
15929 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15930
15931 /* Emit code to initialize VRTOP, the top of the VR save area.
15932 This address is gr_save_area_bytes below GRTOP, rounded
15933 down to the next 16-byte boundary. */
15934 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
15935 vr_offset = ROUND_UP (gr_save_area_size,
15936 STACK_BOUNDARY / BITS_PER_UNIT);
15937
15938 if (vr_offset)
15939 t = fold_build_pointer_plus_hwi (t, -vr_offset);
15940 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
15941 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15942
15943 /* Emit code to initialize GROFF, the offset from GRTOP of the
15944 next GPR argument. */
15945 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
15946 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
15947 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15948
15949 /* Likewise emit code to initialize VROFF, the offset from VRTOP
15950 of the next VR argument. */
15951 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
15952 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
15953 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15954 }
15955
15956 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
15957
15958 static tree
15959 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
15960 gimple_seq *post_p ATTRIBUTE_UNUSED)
15961 {
15962 tree addr;
15963 bool indirect_p;
15964 bool is_ha; /* is HFA or HVA. */
15965 bool dw_align; /* double-word align. */
15966 machine_mode ag_mode = VOIDmode;
15967 int nregs;
15968 machine_mode mode;
15969
15970 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15971 tree stack, f_top, f_off, off, arg, roundup, on_stack;
15972 HOST_WIDE_INT size, rsize, adjust, align;
15973 tree t, u, cond1, cond2;
15974
15975 indirect_p = pass_va_arg_by_reference (type);
15976 if (indirect_p)
15977 type = build_pointer_type (type);
15978
15979 mode = TYPE_MODE (type);
15980
15981 f_stack = TYPE_FIELDS (va_list_type_node);
15982 f_grtop = DECL_CHAIN (f_stack);
15983 f_vrtop = DECL_CHAIN (f_grtop);
15984 f_groff = DECL_CHAIN (f_vrtop);
15985 f_vroff = DECL_CHAIN (f_groff);
15986
15987 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
15988 f_stack, NULL_TREE);
15989 size = int_size_in_bytes (type);
15990
15991 bool abi_break;
15992 align
15993 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
15994
15995 dw_align = false;
15996 adjust = 0;
15997 if (aarch64_vfp_is_call_or_return_candidate (mode,
15998 type,
15999 &ag_mode,
16000 &nregs,
16001 &is_ha))
16002 {
16003 /* No frontends can create types with variable-sized modes, so we
16004 shouldn't be asked to pass or return them. */
16005 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
16006
16007 /* TYPE passed in fp/simd registers. */
16008 if (!TARGET_FLOAT)
16009 aarch64_err_no_fpadvsimd (mode);
16010
16011 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
16012 unshare_expr (valist), f_vrtop, NULL_TREE);
16013 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
16014 unshare_expr (valist), f_vroff, NULL_TREE);
16015
16016 rsize = nregs * UNITS_PER_VREG;
16017
16018 if (is_ha)
16019 {
16020 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
16021 adjust = UNITS_PER_VREG - ag_size;
16022 }
16023 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16024 && size < UNITS_PER_VREG)
16025 {
16026 adjust = UNITS_PER_VREG - size;
16027 }
16028 }
16029 else
16030 {
16031 /* TYPE passed in general registers. */
16032 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
16033 unshare_expr (valist), f_grtop, NULL_TREE);
16034 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
16035 unshare_expr (valist), f_groff, NULL_TREE);
16036 rsize = ROUND_UP (size, UNITS_PER_WORD);
16037 nregs = rsize / UNITS_PER_WORD;
16038
16039 if (align > 8)
16040 {
16041 if (abi_break && warn_psabi)
16042 inform (input_location, "parameter passing for argument of type "
16043 "%qT changed in GCC 9.1", type);
16044 dw_align = true;
16045 }
16046
16047 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16048 && size < UNITS_PER_WORD)
16049 {
16050 adjust = UNITS_PER_WORD - size;
16051 }
16052 }
16053
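/* In outline, the trees built below implement the following pseudo-code
   (a sketch; gr/vr and the big-endian ADJUST were selected above):

   off = ap.__gr_offs;                    // or ap.__vr_offs
   if (off >= 0)
     goto on_stack;
   if (dw_align)
     off = (off + 15) & -16;
   ap.__gr_offs = off + rsize;
   if (ap.__gr_offs > 0)
     goto on_stack;
   addr = ap.__gr_top + off + adjust;     // argument is in the save area

   on_stack:
   arg = ap.__stack;
   if (align > 8)
     arg = (arg + 15) & -16;
   ap.__stack = (arg + size + 7) & -8;
   addr = arg;                            // plus a big-endian adjustment  */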
16054 /* Get a local temporary for the field value. */
16055 off = get_initialized_tmp_var (f_off, pre_p, NULL);
16056
16057 /* Emit code to branch if off >= 0. */
16058 t = build2 (GE_EXPR, boolean_type_node, off,
16059 build_int_cst (TREE_TYPE (off), 0));
16060 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
16061
16062 if (dw_align)
16063 {
16064 /* Emit: offs = (offs + 15) & -16. */
16065 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16066 build_int_cst (TREE_TYPE (off), 15));
16067 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
16068 build_int_cst (TREE_TYPE (off), -16));
16069 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
16070 }
16071 else
16072 roundup = NULL;
16073
16074 /* Update ap.__[g|v]r_offs */
16075 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16076 build_int_cst (TREE_TYPE (off), rsize));
16077 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
16078
16079 /* String up. */
16080 if (roundup)
16081 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16082
16083 /* [cond2] if (ap.__[g|v]r_offs > 0) */
16084 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
16085 build_int_cst (TREE_TYPE (f_off), 0));
16086 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
16087
16088 /* String up: make sure the assignment happens before the use. */
16089 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
16090 COND_EXPR_ELSE (cond1) = t;
16091
16092 /* Prepare the trees handling the argument that is passed on the stack;
16093 the top-level node is stored in ON_STACK. */
16094 arg = get_initialized_tmp_var (stack, pre_p, NULL);
16095 if (align > 8)
16096 {
16097 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
16098 t = fold_build_pointer_plus_hwi (arg, 15);
16099 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16100 build_int_cst (TREE_TYPE (t), -16));
16101 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
16102 }
16103 else
16104 roundup = NULL;
16105 /* Advance ap.__stack */
16106 t = fold_build_pointer_plus_hwi (arg, size + 7);
16107 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16108 build_int_cst (TREE_TYPE (t), -8));
16109 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
16110 /* String up roundup and advance. */
16111 if (roundup)
16112 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16113 /* String up with arg */
16114 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
16115 /* Big-endianness related address adjustment. */
16116 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16117 && size < UNITS_PER_WORD)
16118 {
16119 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
16120 size_int (UNITS_PER_WORD - size));
16121 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
16122 }
16123
16124 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
16125 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
16126
16127 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
16128 t = off;
16129 if (adjust)
16130 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
16131 build_int_cst (TREE_TYPE (off), adjust));
16132
16133 t = fold_convert (sizetype, t);
16134 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
16135
16136 if (is_ha)
16137 {
16138 /* type ha; // treat as "struct {ftype field[n];}"
16139 ... [computing offs]
16140 for (i = 0; i < nregs; ++i, offs += 16)
16141 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
16142 return ha; */
16143 int i;
16144 tree tmp_ha, field_t, field_ptr_t;
16145
16146 /* Declare a local variable. */
16147 tmp_ha = create_tmp_var_raw (type, "ha");
16148 gimple_add_tmp_var (tmp_ha);
16149
16150 /* Establish the base type. */
16151 switch (ag_mode)
16152 {
16153 case E_SFmode:
16154 field_t = float_type_node;
16155 field_ptr_t = float_ptr_type_node;
16156 break;
16157 case E_DFmode:
16158 field_t = double_type_node;
16159 field_ptr_t = double_ptr_type_node;
16160 break;
16161 case E_TFmode:
16162 field_t = long_double_type_node;
16163 field_ptr_t = long_double_ptr_type_node;
16164 break;
16165 case E_HFmode:
16166 field_t = aarch64_fp16_type_node;
16167 field_ptr_t = aarch64_fp16_ptr_type_node;
16168 break;
16169 case E_BFmode:
16170 field_t = aarch64_bf16_type_node;
16171 field_ptr_t = aarch64_bf16_ptr_type_node;
16172 break;
16173 case E_V2SImode:
16174 case E_V4SImode:
16175 {
16176 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
16177 field_t = build_vector_type_for_mode (innertype, ag_mode);
16178 field_ptr_t = build_pointer_type (field_t);
16179 }
16180 break;
16181 default:
16182 gcc_assert (0);
16183 }
16184
16185 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
16186 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
16187 addr = t;
16188 t = fold_convert (field_ptr_t, addr);
16189 t = build2 (MODIFY_EXPR, field_t,
16190 build1 (INDIRECT_REF, field_t, tmp_ha),
16191 build1 (INDIRECT_REF, field_t, t));
16192
16193 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
16194 for (i = 1; i < nregs; ++i)
16195 {
16196 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
16197 u = fold_convert (field_ptr_t, addr);
16198 u = build2 (MODIFY_EXPR, field_t,
16199 build2 (MEM_REF, field_t, tmp_ha,
16200 build_int_cst (field_ptr_t,
16201 (i *
16202 int_size_in_bytes (field_t)))),
16203 build1 (INDIRECT_REF, field_t, u));
16204 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
16205 }
16206
16207 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
16208 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
16209 }
16210
16211 COND_EXPR_ELSE (cond2) = t;
16212 addr = fold_convert (build_pointer_type (type), cond1);
16213 addr = build_va_arg_indirect_ref (addr);
16214
16215 if (indirect_p)
16216 addr = build_va_arg_indirect_ref (addr);
16217
16218 return addr;
16219 }
16220
16221 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
16222
16223 static void
16224 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
16225 const function_arg_info &arg,
16226 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
16227 {
16228 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
16229 CUMULATIVE_ARGS local_cum;
16230 int gr_saved = cfun->va_list_gpr_size;
16231 int vr_saved = cfun->va_list_fpr_size;
16232
16233 /* The caller has advanced CUM up to, but not beyond, the last named
16234 argument. Advance a local copy of CUM past the last "real" named
16235 argument, to find out how many registers are left over. */
16236 local_cum = *cum;
16237 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg);
16238
16239 /* Find out how many registers we need to save.
16240 Honor the tree-stdarg analysis results. */
16241 if (cfun->va_list_gpr_size)
16242 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
16243 cfun->va_list_gpr_size / UNITS_PER_WORD);
16244 if (cfun->va_list_fpr_size)
16245 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
16246 cfun->va_list_fpr_size / UNITS_PER_VREG);
16247
16248 if (!TARGET_FLOAT)
16249 {
16250 gcc_assert (local_cum.aapcs_nvrn == 0);
16251 vr_saved = 0;
16252 }
16253
16254 if (!no_rtl)
16255 {
16256 if (gr_saved > 0)
16257 {
16258 rtx ptr, mem;
16259
16260 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
16261 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
16262 - gr_saved * UNITS_PER_WORD);
16263 mem = gen_frame_mem (BLKmode, ptr);
16264 set_mem_alias_set (mem, get_varargs_alias_set ());
16265
16266 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
16267 mem, gr_saved);
16268 }
16269 if (vr_saved > 0)
16270 {
16271 /* We can't use move_block_from_reg, because it will use
16272 the wrong mode, storing D regs only. */
16273 machine_mode mode = TImode;
16274 int off, i, vr_start;
16275
16276 /* Set OFF to the offset from virtual_incoming_args_rtx of
16277 the first vector register. The VR save area lies below
16278 the GR one, and is aligned to 16 bytes. */
16279 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
16280 STACK_BOUNDARY / BITS_PER_UNIT);
16281 off -= vr_saved * UNITS_PER_VREG;
16282
16283 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
16284 for (i = 0; i < vr_saved; ++i)
16285 {
16286 rtx ptr, mem;
16287
16288 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
16289 mem = gen_frame_mem (mode, ptr);
16290 set_mem_alias_set (mem, get_varargs_alias_set ());
16291 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
16292 off += UNITS_PER_VREG;
16293 }
16294 }
16295 }
16296
16297 /* We don't save the size into *PRETEND_SIZE because we want to avoid
16298 any complication of having crtl->args.pretend_args_size changed. */
16299 cfun->machine->frame.saved_varargs_size
16300 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
16301 STACK_BOUNDARY / BITS_PER_UNIT)
16302 + vr_saved * UNITS_PER_VREG);
16303 }
16304
16305 static void
16306 aarch64_conditional_register_usage (void)
16307 {
16308 int i;
16309 if (!TARGET_FLOAT)
16310 {
16311 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
16312 {
16313 fixed_regs[i] = 1;
16314 call_used_regs[i] = 1;
16315 }
16316 }
16317 if (!TARGET_SVE)
16318 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
16319 {
16320 fixed_regs[i] = 1;
16321 call_used_regs[i] = 1;
16322 }
16323
16324 /* Only allow the FFR and FFRT to be accessed via special patterns. */
16325 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
16326 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
16327
16328 /* When tracking speculation, we need a couple of call-clobbered registers
16329 to track the speculation state. It would be nice to just use
16330 IP0 and IP1, but currently there are numerous places that just
16331 assume these registers are free for other uses (e.g. pointer
16332 authentication). */
16333 if (aarch64_track_speculation)
16334 {
16335 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
16336 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
16337 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16338 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16339 }
16340 }
16341
16342 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
16343
16344 bool
16345 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
16346 {
16347 /* For records we're passed a FIELD_DECL, for arrays we're passed
16348 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
16349 const_tree type = TREE_TYPE (field_or_array);
16350
16351 /* Assign BLKmode to anything that contains multiple SVE predicates.
16352 For structures, the "multiple" case is indicated by MODE being
16353 VOIDmode. */
16354 unsigned int num_zr, num_pr;
16355 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
16356 {
16357 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
16358 return !simple_cst_equal (TYPE_SIZE (field_or_array),
16359 TYPE_SIZE (type));
16360 return mode == VOIDmode;
16361 }
16362
16363 return default_member_type_forces_blk (field_or_array, mode);
16364 }
16365
16366 /* Walk down the type tree of TYPE counting consecutive base elements.
16367 If *MODEP is VOIDmode, then set it to the first valid floating point
16368 type. If a non-floating point type is found, or if a floating point
16369 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
16370 otherwise return the count in the sub-tree. */
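/* For example: "struct { float x, y, z; }" yields 3 with *MODEP == SFmode,
   "double _Complex" yields 2 with *MODEP == DFmode, and
   "struct { float f; double d; }" yields -1 because the element modes
   differ.  */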
16371 static int
16372 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
16373 {
16374 machine_mode mode;
16375 HOST_WIDE_INT size;
16376
16377 if (aarch64_sve::builtin_type_p (type))
16378 return -1;
16379
16380 switch (TREE_CODE (type))
16381 {
16382 case REAL_TYPE:
16383 mode = TYPE_MODE (type);
16384 if (mode != DFmode && mode != SFmode
16385 && mode != TFmode && mode != HFmode)
16386 return -1;
16387
16388 if (*modep == VOIDmode)
16389 *modep = mode;
16390
16391 if (*modep == mode)
16392 return 1;
16393
16394 break;
16395
16396 case COMPLEX_TYPE:
16397 mode = TYPE_MODE (TREE_TYPE (type));
16398 if (mode != DFmode && mode != SFmode
16399 && mode != TFmode && mode != HFmode)
16400 return -1;
16401
16402 if (*modep == VOIDmode)
16403 *modep = mode;
16404
16405 if (*modep == mode)
16406 return 2;
16407
16408 break;
16409
16410 case VECTOR_TYPE:
16411 /* Use V2SImode and V4SImode as representatives of all 64-bit
16412 and 128-bit vector types. */
16413 size = int_size_in_bytes (type);
16414 switch (size)
16415 {
16416 case 8:
16417 mode = V2SImode;
16418 break;
16419 case 16:
16420 mode = V4SImode;
16421 break;
16422 default:
16423 return -1;
16424 }
16425
16426 if (*modep == VOIDmode)
16427 *modep = mode;
16428
16429 /* Vector modes are considered to be opaque: two vectors are
16430 equivalent for the purposes of being homogeneous aggregates
16431 if they are the same size. */
16432 if (*modep == mode)
16433 return 1;
16434
16435 break;
16436
16437 case ARRAY_TYPE:
16438 {
16439 int count;
16440 tree index = TYPE_DOMAIN (type);
16441
16442 /* Can't handle incomplete types or sizes that are not
16443 fixed. */
16444 if (!COMPLETE_TYPE_P (type)
16445 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16446 return -1;
16447
16448 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
16449 if (count == -1
16450 || !index
16451 || !TYPE_MAX_VALUE (index)
16452 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
16453 || !TYPE_MIN_VALUE (index)
16454 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
16455 || count < 0)
16456 return -1;
16457
16458 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
16459 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
16460
16461 /* There must be no padding. */
16462 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16463 count * GET_MODE_BITSIZE (*modep)))
16464 return -1;
16465
16466 return count;
16467 }
16468
16469 case RECORD_TYPE:
16470 {
16471 int count = 0;
16472 int sub_count;
16473 tree field;
16474
16475 /* Can't handle incomplete types or sizes that are not
16476 fixed. */
16477 if (!COMPLETE_TYPE_P (type)
16478 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16479 return -1;
16480
16481 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16482 {
16483 if (TREE_CODE (field) != FIELD_DECL)
16484 continue;
16485
16486 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
16487 if (sub_count < 0)
16488 return -1;
16489 count += sub_count;
16490 }
16491
16492 /* There must be no padding. */
16493 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16494 count * GET_MODE_BITSIZE (*modep)))
16495 return -1;
16496
16497 return count;
16498 }
16499
16500 case UNION_TYPE:
16501 case QUAL_UNION_TYPE:
16502 {
16503 /* These aren't very interesting except in a degenerate case. */
16504 int count = 0;
16505 int sub_count;
16506 tree field;
16507
16508 /* Can't handle incomplete types or sizes that are not
16509 fixed. */
16510 if (!COMPLETE_TYPE_P (type)
16511 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16512 return -1;
16513
16514 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16515 {
16516 if (TREE_CODE (field) != FIELD_DECL)
16517 continue;
16518
16519 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
16520 if (sub_count < 0)
16521 return -1;
16522 count = count > sub_count ? count : sub_count;
16523 }
16524
16525 /* There must be no padding. */
16526 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16527 count * GET_MODE_BITSIZE (*modep)))
16528 return -1;
16529
16530 return count;
16531 }
16532
16533 default:
16534 break;
16535 }
16536
16537 return -1;
16538 }
16539
16540 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
16541 type as described in AAPCS64 \S 4.1.2.
16542
16543 See the comment above aarch64_composite_type_p for the notes on MODE. */
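/* E.g. the 8-byte and 16-byte Advanced SIMD types int32x2_t and
   float32x4_t are short vectors, whereas SVE types such as svint32_t and
   a 32-byte GNU vector type are not.  */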
16544
16545 static bool
16546 aarch64_short_vector_p (const_tree type,
16547 machine_mode mode)
16548 {
16549 poly_int64 size = -1;
16550
16551 if (type && TREE_CODE (type) == VECTOR_TYPE)
16552 {
16553 if (aarch64_sve::builtin_type_p (type))
16554 return false;
16555 size = int_size_in_bytes (type);
16556 }
16557 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
16558 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16559 {
16560 /* Rely only on the type, not the mode, when processing SVE types. */
16561 if (type && aarch64_some_values_include_pst_objects_p (type))
16562 gcc_assert (aarch64_sve_mode_p (mode));
16563 else
16564 size = GET_MODE_SIZE (mode);
16565 }
16566 if (known_eq (size, 8) || known_eq (size, 16))
16567 {
16568 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
16569 they are being treated as scalable AAPCS64 types. */
16570 gcc_assert (!aarch64_sve_mode_p (mode));
16571 return true;
16572 }
16573 return false;
16574 }
16575
16576 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
16577 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
16578 array types. The C99 floating-point complex types are also considered
16579 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
16580 types, which are GCC extensions and out of the scope of AAPCS64, are
16581 treated as composite types here as well.
16582
16583 Note that MODE itself is not sufficient in determining whether a type
16584 is such a composite type or not. This is because
16585 stor-layout.c:compute_record_mode may have already changed the MODE
16586 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
16587 structure with only one field may have its MODE set to the mode of the
16588 field. Also an integer mode whose size matches the size of the
16589 RECORD_TYPE type may be used to substitute the original mode
16590 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
16591 solely relied on. */
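/* For example, "struct { float f; }" may be given SFmode by
   compute_record_mode, but it is still a composite type here because TYPE
   is checked first; "double _Complex" is composite, while a plain "double"
   is not.  */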
16592
16593 static bool
16594 aarch64_composite_type_p (const_tree type,
16595 machine_mode mode)
16596 {
16597 if (aarch64_short_vector_p (type, mode))
16598 return false;
16599
16600 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
16601 return true;
16602
16603 if (mode == BLKmode
16604 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
16605 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
16606 return true;
16607
16608 return false;
16609 }
16610
16611 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
16612 shall be passed or returned in simd/fp register(s) (providing these
16613 parameter passing registers are available).
16614
16615 Upon successful return, *COUNT returns the number of needed registers,
16616 *BASE_MODE returns the mode of the individual register and, when IS_HA
16617 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
16618 floating-point aggregate or a homogeneous short-vector aggregate. */
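/* For example: "struct { double a, b; }" gives *COUNT == 2,
   *BASE_MODE == DFmode and *IS_HA == true; a short vector such as
   float32x4_t gives *COUNT == 1 with *BASE_MODE == V4SFmode; and
   "struct { double d; int i; }" is rejected.  */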
16619
16620 static bool
16621 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
16622 const_tree type,
16623 machine_mode *base_mode,
16624 int *count,
16625 bool *is_ha)
16626 {
16627 if (is_ha != NULL) *is_ha = false;
16628
16629 machine_mode new_mode = VOIDmode;
16630 bool composite_p = aarch64_composite_type_p (type, mode);
16631
16632 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
16633 || aarch64_short_vector_p (type, mode))
16634 {
16635 *count = 1;
16636 new_mode = mode;
16637 }
16638 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
16639 {
16640 if (is_ha != NULL) *is_ha = true;
16641 *count = 2;
16642 new_mode = GET_MODE_INNER (mode);
16643 }
16644 else if (type && composite_p)
16645 {
16646 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
16647
16648 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
16649 {
16650 if (is_ha != NULL) *is_ha = true;
16651 *count = ag_count;
16652 }
16653 else
16654 return false;
16655 }
16656 else
16657 return false;
16658
16659 gcc_assert (!aarch64_sve_mode_p (new_mode));
16660 *base_mode = new_mode;
16661 return true;
16662 }
16663
16664 /* Implement TARGET_STRUCT_VALUE_RTX. */
16665
16666 static rtx
16667 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
16668 int incoming ATTRIBUTE_UNUSED)
16669 {
16670 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
16671 }
16672
16673 /* Implement TARGET_VECTOR_MODE_SUPPORTED_P. */
16674 static bool
16675 aarch64_vector_mode_supported_p (machine_mode mode)
16676 {
16677 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16678 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
16679 }
16680
16681 /* Return the full-width SVE vector mode for element mode MODE, if one
16682 exists. */
16683 opt_machine_mode
16684 aarch64_full_sve_mode (scalar_mode mode)
16685 {
16686 switch (mode)
16687 {
16688 case E_DFmode:
16689 return VNx2DFmode;
16690 case E_SFmode:
16691 return VNx4SFmode;
16692 case E_HFmode:
16693 return VNx8HFmode;
16694 case E_BFmode:
16695 return VNx8BFmode;
16696 case E_DImode:
16697 return VNx2DImode;
16698 case E_SImode:
16699 return VNx4SImode;
16700 case E_HImode:
16701 return VNx8HImode;
16702 case E_QImode:
16703 return VNx16QImode;
16704 default:
16705 return opt_machine_mode ();
16706 }
16707 }
16708
16709 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
16710 if it exists. */
16711 opt_machine_mode
16712 aarch64_vq_mode (scalar_mode mode)
16713 {
16714 switch (mode)
16715 {
16716 case E_DFmode:
16717 return V2DFmode;
16718 case E_SFmode:
16719 return V4SFmode;
16720 case E_HFmode:
16721 return V8HFmode;
16722 case E_BFmode:
16723 return V8BFmode;
16724 case E_SImode:
16725 return V4SImode;
16726 case E_HImode:
16727 return V8HImode;
16728 case E_QImode:
16729 return V16QImode;
16730 case E_DImode:
16731 return V2DImode;
16732 default:
16733 return opt_machine_mode ();
16734 }
16735 }
16736
16737 /* Return the appropriate SIMD container mode
16738 for MODE within a vector of WIDTH bits. */
16739 static machine_mode
16740 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
16741 {
16742 if (TARGET_SVE
16743 && maybe_ne (width, 128)
16744 && known_eq (width, BITS_PER_SVE_VECTOR))
16745 return aarch64_full_sve_mode (mode).else_mode (word_mode);
16746
16747 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
16748 if (TARGET_SIMD)
16749 {
16750 if (known_eq (width, 128))
16751 return aarch64_vq_mode (mode).else_mode (word_mode);
16752 else
16753 switch (mode)
16754 {
16755 case E_SFmode:
16756 return V2SFmode;
16757 case E_HFmode:
16758 return V4HFmode;
16759 case E_BFmode:
16760 return V4BFmode;
16761 case E_SImode:
16762 return V2SImode;
16763 case E_HImode:
16764 return V4HImode;
16765 case E_QImode:
16766 return V8QImode;
16767 default:
16768 break;
16769 }
16770 }
16771 return word_mode;
16772 }
16773
16774 /* Return the preferred SIMD mode for MODE: a full SVE vector if SVE is enabled, otherwise a 128-bit Advanced SIMD container. */
16775 static machine_mode
16776 aarch64_preferred_simd_mode (scalar_mode mode)
16777 {
16778 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
16779 return aarch64_simd_container_mode (mode, bits);
16780 }
16781
16782 /* Return a list of possible vector sizes for the vectorizer
16783 to iterate over. */
16784 static unsigned int
16785 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
16786 {
16787 static const machine_mode sve_modes[] = {
16788 /* Try using full vectors for all element types. */
16789 VNx16QImode,
16790
16791 /* Try using 16-bit containers for 8-bit elements and full vectors
16792 for wider elements. */
16793 VNx8QImode,
16794
16795 /* Try using 32-bit containers for 8-bit and 16-bit elements and
16796 full vectors for wider elements. */
16797 VNx4QImode,
16798
16799 /* Try using 64-bit containers for all element types. */
16800 VNx2QImode
16801 };
16802
16803 static const machine_mode advsimd_modes[] = {
16804 /* Try using 128-bit vectors for all element types. */
16805 V16QImode,
16806
16807 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
16808 for wider elements. */
16809 V8QImode,
16810
16811 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
16812 for wider elements.
16813
16814 TODO: We could support a limited form of V4QImode too, so that
16815 we use 32-bit vectors for 8-bit elements. */
16816 V4HImode,
16817
16818 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
16819 for 64-bit elements.
16820
16821 TODO: We could similarly support limited forms of V2QImode and V2HImode
16822 for this case. */
16823 V2SImode
16824 };
16825
16826 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
16827 This is because:
16828
16829 - If we can't use N-byte Advanced SIMD vectors then the placement
16830 doesn't matter; we'll just continue as though the Advanced SIMD
16831 entry didn't exist.
16832
16833 - If an SVE main loop with N bytes ends up being cheaper than an
16834 Advanced SIMD main loop with N bytes then by default we'll replace
16835 the Advanced SIMD version with the SVE one.
16836
16837 - If an Advanced SIMD main loop with N bytes ends up being cheaper
16838 than an SVE main loop with N bytes then by default we'll try to
16839 use the SVE loop to vectorize the epilogue instead. */
16840 unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
16841 unsigned int advsimd_i = 0;
16842 while (advsimd_i < ARRAY_SIZE (advsimd_modes))
16843 {
16844 if (sve_i < ARRAY_SIZE (sve_modes)
16845 && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
16846 GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
16847 modes->safe_push (sve_modes[sve_i++]);
16848 else
16849 modes->safe_push (advsimd_modes[advsimd_i++]);
16850 }
16851 while (sve_i < ARRAY_SIZE (sve_modes))
16852 modes->safe_push (sve_modes[sve_i++]);
16853
16854 unsigned int flags = 0;
16855 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
16856 can compare SVE against Advanced SIMD and so that we can compare
16857 multiple SVE vectorization approaches against each other. There's
16858 not really any point doing this for Advanced SIMD only, since the
16859 first mode that works should always be the best. */
16860 if (TARGET_SVE && aarch64_sve_compare_costs)
16861 flags |= VECT_COMPARE_COSTS;
16862 return flags;
16863 }
16864
16865 /* Implement TARGET_MANGLE_TYPE. */
16866
16867 static const char *
16868 aarch64_mangle_type (const_tree type)
16869 {
16870 /* The AArch64 ABI documents say that "__va_list" has to be
16871 mangled as if it is in the "std" namespace. */
16872 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
16873 return "St9__va_list";
16874
16875 /* Half-precision floating point types. */
16876 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
16877 {
16878 if (TYPE_MODE (type) == BFmode)
16879 return "u6__bf16";
16880 else
16881 return "Dh";
16882 }
16883
16884 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
16885 builtin types. */
16886 if (TYPE_NAME (type) != NULL)
16887 {
16888 const char *res;
16889 if ((res = aarch64_general_mangle_builtin_type (type))
16890 || (res = aarch64_sve::mangle_builtin_type (type)))
16891 return res;
16892 }
16893
16894 /* Use the default mangling. */
16895 return NULL;
16896 }
16897
16898 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
16899
16900 static bool
16901 aarch64_verify_type_context (location_t loc, type_context_kind context,
16902 const_tree type, bool silent_p)
16903 {
16904 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
16905 }
16906
16907 /* Find the first rtx_insn before insn that will generate an assembly
16908 instruction. */
16909
16910 static rtx_insn *
16911 aarch64_prev_real_insn (rtx_insn *insn)
16912 {
16913 if (!insn)
16914 return NULL;
16915
16916 do
16917 {
16918 insn = prev_real_insn (insn);
16919 }
16920 while (insn && recog_memoized (insn) < 0);
16921
16922 return insn;
16923 }
16924
16925 static bool
16926 is_madd_op (enum attr_type t1)
16927 {
16928 unsigned int i;
16929 /* A number of these may be AArch32 only. */
16930 enum attr_type mlatypes[] = {
16931 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
16932 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
16933 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
16934 };
16935
16936 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
16937 {
16938 if (t1 == mlatypes[i])
16939 return true;
16940 }
16941
16942 return false;
16943 }
16944
16945 /* Check if there is a register dependency between a load and the insn
16946 for which we hold recog_data. */
16947
16948 static bool
16949 dep_between_memop_and_curr (rtx memop)
16950 {
16951 rtx load_reg;
16952 int opno;
16953
16954 gcc_assert (GET_CODE (memop) == SET);
16955
16956 if (!REG_P (SET_DEST (memop)))
16957 return false;
16958
16959 load_reg = SET_DEST (memop);
16960 for (opno = 1; opno < recog_data.n_operands; opno++)
16961 {
16962 rtx operand = recog_data.operand[opno];
16963 if (REG_P (operand)
16964 && reg_overlap_mentioned_p (load_reg, operand))
16965 return true;
16966
16967 }
16968 return false;
16969 }
16970
16971
16972 /* When working around the Cortex-A53 erratum 835769,
16973 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
16974 instruction and has a preceding memory instruction such that a NOP
16975 should be inserted between them. */
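/* Cortex-A53 erratum 835769 concerns a 64-bit integer multiply-accumulate
   that directly follows a memory access; roughly, a sequence such as

   ldr  x5, [x1]
   madd x0, x2, x3, x4

   gets a NOP inserted between the two instructions (see
   aarch64_final_prescan_insn below).  */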
16976
16977 bool
16978 aarch64_madd_needs_nop (rtx_insn* insn)
16979 {
16980 enum attr_type attr_type;
16981 rtx_insn *prev;
16982 rtx body;
16983
16984 if (!TARGET_FIX_ERR_A53_835769)
16985 return false;
16986
16987 if (!INSN_P (insn) || recog_memoized (insn) < 0)
16988 return false;
16989
16990 attr_type = get_attr_type (insn);
16991 if (!is_madd_op (attr_type))
16992 return false;
16993
16994 prev = aarch64_prev_real_insn (insn);
16995 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
16996 Restore recog state to INSN to avoid state corruption. */
16997 extract_constrain_insn_cached (insn);
16998
16999 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
17000 return false;
17001
17002 body = single_set (prev);
17003
17004 /* If the previous insn is a memory op and there is no dependency between
17005 it and the DImode madd, emit a NOP between them. If body is NULL then we
17006 have a complex memory operation, probably a load/store pair.
17007 Be conservative for now and emit a NOP. */
17008 if (GET_MODE (recog_data.operand[0]) == DImode
17009 && (!body || !dep_between_memop_and_curr (body)))
17010 return true;
17011
17012 return false;
17013
17014 }
17015
17016
17017 /* Implement FINAL_PRESCAN_INSN. */
17018
17019 void
17020 aarch64_final_prescan_insn (rtx_insn *insn)
17021 {
17022 if (aarch64_madd_needs_nop (insn))
17023 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
17024 }
17025
17026
17027 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
17028 instruction. */
17029
17030 bool
17031 aarch64_sve_index_immediate_p (rtx base_or_step)
17032 {
17033 return (CONST_INT_P (base_or_step)
17034 && IN_RANGE (INTVAL (base_or_step), -16, 15));
17035 }
17036
17037 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
17038 when applied to mode MODE. Negate X first if NEGATE_P is true. */
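/* The valid immediates are an unsigned 8-bit value, optionally shifted
   left by 8.  E.g. a duplicated 1280 (0x500) is accepted as "#5, LSL #8",
   whereas 257 (0x101) is not; with NEGATE_P, a duplicated -5 is accepted
   because a SUB of #5 can be used instead.  */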
17039
17040 bool
17041 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
17042 {
17043 rtx elt = unwrap_const_vec_duplicate (x);
17044 if (!CONST_INT_P (elt))
17045 return false;
17046
17047 HOST_WIDE_INT val = INTVAL (elt);
17048 if (negate_p)
17049 val = -val;
17050 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
17051
17052 if (val & 0xff)
17053 return IN_RANGE (val, 0, 0xff);
17054 return IN_RANGE (val, 0, 0xff00);
17055 }
17056
17057 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
17058 instructions when applied to mode MODE. Negate X first if NEGATE_P
17059 is true. */
17060
17061 bool
17062 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
17063 {
17064 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
17065 return false;
17066
17067 /* After the optional negation, the immediate must be nonnegative.
17068 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
17069 instead of SQADD Zn.B, Zn.B, #129. */
17070 rtx elt = unwrap_const_vec_duplicate (x);
17071 return negate_p == (INTVAL (elt) < 0);
17072 }
17073
17074 /* Return true if X is a valid immediate operand for an SVE logical
17075 instruction such as AND. */
17076
17077 bool
17078 aarch64_sve_bitmask_immediate_p (rtx x)
17079 {
17080 rtx elt;
17081
17082 return (const_vec_duplicate_p (x, &elt)
17083 && CONST_INT_P (elt)
17084 && aarch64_bitmask_imm (INTVAL (elt),
17085 GET_MODE_INNER (GET_MODE (x))));
17086 }
17087
17088 /* Return true if X is a valid immediate for the SVE DUP and CPY
17089 instructions. */
17090
17091 bool
17092 aarch64_sve_dup_immediate_p (rtx x)
17093 {
17094 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
17095 if (!CONST_INT_P (x))
17096 return false;
17097
17098 HOST_WIDE_INT val = INTVAL (x);
17099 if (val & 0xff)
17100 return IN_RANGE (val, -0x80, 0x7f);
17101 return IN_RANGE (val, -0x8000, 0x7f00);
17102 }
17103
17104 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
17105 SIGNED_P says whether the operand is signed rather than unsigned. */
17106
17107 bool
17108 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
17109 {
17110 x = unwrap_const_vec_duplicate (x);
17111 return (CONST_INT_P (x)
17112 && (signed_p
17113 ? IN_RANGE (INTVAL (x), -16, 15)
17114 : IN_RANGE (INTVAL (x), 0, 127)));
17115 }
17116
17117 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
17118 instruction. Negate X first if NEGATE_P is true. */
17119
17120 bool
17121 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
17122 {
17123 rtx elt;
17124 REAL_VALUE_TYPE r;
17125
17126 if (!const_vec_duplicate_p (x, &elt)
17127 || GET_CODE (elt) != CONST_DOUBLE)
17128 return false;
17129
17130 r = *CONST_DOUBLE_REAL_VALUE (elt);
17131
17132 if (negate_p)
17133 r = real_value_negate (&r);
17134
17135 if (real_equal (&r, &dconst1))
17136 return true;
17137 if (real_equal (&r, &dconsthalf))
17138 return true;
17139 return false;
17140 }
17141
17142 /* Return true if X is a valid immediate operand for an SVE FMUL
17143 instruction. */
17144
17145 bool
17146 aarch64_sve_float_mul_immediate_p (rtx x)
17147 {
17148 rtx elt;
17149
17150 return (const_vec_duplicate_p (x, &elt)
17151 && GET_CODE (elt) == CONST_DOUBLE
17152 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
17153 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
17154 }
17155
17156 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
17157 for the Advanced SIMD operation described by WHICH and INSN. If INFO
17158 is nonnull, use it to describe valid immediates. */
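/* Worked examples: VAL32 == 0x00ab0000 matches the 4-byte form as
   (SImode, 0xab, LSL #16); VAL32 == 0x004a004a matches the 2-byte form as
   (HImode, 0x4a, LSL #0); and for MOV checks, VAL32 == 0x0003ffff matches
   the MSL form as (SImode, 0x03, MSL #16).  */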
17159 static bool
17160 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
17161 simd_immediate_info *info,
17162 enum simd_immediate_check which,
17163 simd_immediate_info::insn_type insn)
17164 {
17165 /* Try a 4-byte immediate with LSL. */
17166 for (unsigned int shift = 0; shift < 32; shift += 8)
17167 if ((val32 & (0xff << shift)) == val32)
17168 {
17169 if (info)
17170 *info = simd_immediate_info (SImode, val32 >> shift, insn,
17171 simd_immediate_info::LSL, shift);
17172 return true;
17173 }
17174
17175 /* Try a 2-byte immediate with LSL. */
17176 unsigned int imm16 = val32 & 0xffff;
17177 if (imm16 == (val32 >> 16))
17178 for (unsigned int shift = 0; shift < 16; shift += 8)
17179 if ((imm16 & (0xff << shift)) == imm16)
17180 {
17181 if (info)
17182 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
17183 simd_immediate_info::LSL, shift);
17184 return true;
17185 }
17186
17187 /* Try a 4-byte immediate with MSL, except for cases that MVN
17188 can handle. */
17189 if (which == AARCH64_CHECK_MOV)
17190 for (unsigned int shift = 8; shift < 24; shift += 8)
17191 {
17192 unsigned int low = (1 << shift) - 1;
17193 if (((val32 & (0xff << shift)) | low) == val32)
17194 {
17195 if (info)
17196 *info = simd_immediate_info (SImode, val32 >> shift, insn,
17197 simd_immediate_info::MSL, shift);
17198 return true;
17199 }
17200 }
17201
17202 return false;
17203 }
17204
17205 /* Return true if replicating VAL64 is a valid immediate for the
17206 Advanced SIMD operation described by WHICH. If INFO is nonnull,
17207 use it to describe valid immediates. */
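/* E.g. VAL64 == 0xabababababababab is accepted as a replicated byte
   (QImode, 0xab), and VAL64 == 0x00ffff0000ff00ff is accepted for MOV as a
   bit-to-bytemask (DImode immediate), since every byte is either 0x00 or
   0xff.  */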
17208 static bool
17209 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
17210 simd_immediate_info *info,
17211 enum simd_immediate_check which)
17212 {
17213 unsigned int val32 = val64 & 0xffffffff;
17214 unsigned int val16 = val64 & 0xffff;
17215 unsigned int val8 = val64 & 0xff;
17216
17217 if (val32 == (val64 >> 32))
17218 {
17219 if ((which & AARCH64_CHECK_ORR) != 0
17220 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
17221 simd_immediate_info::MOV))
17222 return true;
17223
17224 if ((which & AARCH64_CHECK_BIC) != 0
17225 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
17226 simd_immediate_info::MVN))
17227 return true;
17228
17229 /* Try using a replicated byte. */
17230 if (which == AARCH64_CHECK_MOV
17231 && val16 == (val32 >> 16)
17232 && val8 == (val16 >> 8))
17233 {
17234 if (info)
17235 *info = simd_immediate_info (QImode, val8);
17236 return true;
17237 }
17238 }
17239
17240 /* Try using a bit-to-bytemask. */
17241 if (which == AARCH64_CHECK_MOV)
17242 {
17243 unsigned int i;
17244 for (i = 0; i < 64; i += 8)
17245 {
17246 unsigned char byte = (val64 >> i) & 0xff;
17247 if (byte != 0 && byte != 0xff)
17248 break;
17249 }
17250 if (i == 64)
17251 {
17252 if (info)
17253 *info = simd_immediate_info (DImode, val64);
17254 return true;
17255 }
17256 }
17257 return false;
17258 }
17259
17260 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
17261 instruction. If INFO is nonnull, use it to describe valid immediates. */
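/* E.g. VAL64 == 0x0001000100010001 narrows to HImode and is accepted as
   "DUP .h, #1", VAL64 == 0xff00ff00ff00ff00 as "DUP .h, #-1, LSL #8", and
   a replicated bitmask pattern such as 0x00ff00ff00ff00ff as a DUPM.  */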
17262
17263 static bool
17264 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
17265 simd_immediate_info *info)
17266 {
17267 scalar_int_mode mode = DImode;
17268 unsigned int val32 = val64 & 0xffffffff;
17269 if (val32 == (val64 >> 32))
17270 {
17271 mode = SImode;
17272 unsigned int val16 = val32 & 0xffff;
17273 if (val16 == (val32 >> 16))
17274 {
17275 mode = HImode;
17276 unsigned int val8 = val16 & 0xff;
17277 if (val8 == (val16 >> 8))
17278 mode = QImode;
17279 }
17280 }
17281 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
17282 if (IN_RANGE (val, -0x80, 0x7f))
17283 {
17284 /* DUP with no shift. */
17285 if (info)
17286 *info = simd_immediate_info (mode, val);
17287 return true;
17288 }
17289 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
17290 {
17291 /* DUP with LSL #8. */
17292 if (info)
17293 *info = simd_immediate_info (mode, val);
17294 return true;
17295 }
17296 if (aarch64_bitmask_imm (val64, mode))
17297 {
17298 /* DUPM. */
17299 if (info)
17300 *info = simd_immediate_info (mode, val);
17301 return true;
17302 }
17303 return false;
17304 }
17305
17306 /* Return true if X is an UNSPEC_PTRUE constant of the form:
17307
17308 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
17309
17310 where PATTERN is the svpattern as a CONST_INT and where ZERO
17311 is a zero constant of the required PTRUE mode (which can have
17312 fewer elements than X's mode, if zero bits are significant).
17313
17314 If so, and if INFO is nonnull, describe the immediate in INFO. */
17315 bool
17316 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
17317 {
17318 if (GET_CODE (x) != CONST)
17319 return false;
17320
17321 x = XEXP (x, 0);
17322 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
17323 return false;
17324
17325 if (info)
17326 {
17327 aarch64_svpattern pattern
17328 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
17329 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
17330 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
17331 *info = simd_immediate_info (int_mode, pattern);
17332 }
17333 return true;
17334 }
17335
17336 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
17337 it to describe valid immediates. */
17338
17339 static bool
17340 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
17341 {
17342 if (aarch64_sve_ptrue_svpattern_p (x, info))
17343 return true;
17344
17345 if (x == CONST0_RTX (GET_MODE (x)))
17346 {
17347 if (info)
17348 *info = simd_immediate_info (DImode, 0);
17349 return true;
17350 }
17351
17352 /* Analyze the value as a VNx16BImode. This should be relatively
17353 efficient, since rtx_vector_builder has enough built-in capacity
17354 to store all VLA predicate constants without needing the heap. */
17355 rtx_vector_builder builder;
17356 if (!aarch64_get_sve_pred_bits (builder, x))
17357 return false;
17358
17359 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
17360 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
17361 {
17362 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
17363 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
17364 if (pattern != AARCH64_NUM_SVPATTERNS)
17365 {
17366 if (info)
17367 {
17368 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
17369 *info = simd_immediate_info (int_mode, pattern);
17370 }
17371 return true;
17372 }
17373 }
17374 return false;
17375 }
17376
17377 /* Return true if OP is a valid SIMD immediate for the operation
17378 described by WHICH. If INFO is nonnull, use it to describe valid
17379 immediates. */
17380 bool
17381 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
17382 enum simd_immediate_check which)
17383 {
17384 machine_mode mode = GET_MODE (op);
17385 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17386 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
17387 return false;
17388
17389 if (vec_flags & VEC_SVE_PRED)
17390 return aarch64_sve_pred_valid_immediate (op, info);
17391
17392 scalar_mode elt_mode = GET_MODE_INNER (mode);
17393 rtx base, step;
17394 unsigned int n_elts;
17395 if (GET_CODE (op) == CONST_VECTOR
17396 && CONST_VECTOR_DUPLICATE_P (op))
17397 n_elts = CONST_VECTOR_NPATTERNS (op);
17398 else if ((vec_flags & VEC_SVE_DATA)
17399 && const_vec_series_p (op, &base, &step))
17400 {
17401 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
17402 if (!aarch64_sve_index_immediate_p (base)
17403 || !aarch64_sve_index_immediate_p (step))
17404 return false;
17405
17406 if (info)
17407 {
17408 /* Get the corresponding container mode. E.g. an INDEX on VNx2SI
17409 should yield two integer values per 128-bit block, meaning
17410 that we need to treat it in the same way as VNx2DI and then
17411 ignore the upper 32 bits of each element. */
17412 elt_mode = aarch64_sve_container_int_mode (mode);
17413 *info = simd_immediate_info (elt_mode, base, step);
17414 }
17415 return true;
17416 }
17417 else if (GET_CODE (op) == CONST_VECTOR
17418 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
17419 /* N_ELTS set above. */;
17420 else
17421 return false;
17422
17423 scalar_float_mode elt_float_mode;
17424 if (n_elts == 1
17425 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
17426 {
17427 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
17428 if (aarch64_float_const_zero_rtx_p (elt)
17429 || aarch64_float_const_representable_p (elt))
17430 {
17431 if (info)
17432 *info = simd_immediate_info (elt_float_mode, elt);
17433 return true;
17434 }
17435 }
17436
17437 /* If all elements in an SVE vector have the same value, we have a free
17438 choice between using the element mode and using the container mode.
17439 Using the element mode means that unused parts of the vector are
17440 duplicates of the used elements, while using the container mode means
17441 that the unused parts are an extension of the used elements. Using the
17442 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
17443 for its container mode VNx4SI while 0x00000101 isn't.
17444
17445 If not all elements in an SVE vector have the same value, we need the
17446 transition from one element to the next to occur at container boundaries.
17447 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
17448 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
17449 scalar_int_mode elt_int_mode;
17450 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
17451 elt_int_mode = aarch64_sve_container_int_mode (mode);
17452 else
17453 elt_int_mode = int_mode_for_mode (elt_mode).require ();
17454
17455 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
17456 if (elt_size > 8)
17457 return false;
17458
17459 /* Expand the vector constant out into a byte vector, with the least
17460 significant byte of the register first. */
17461 auto_vec<unsigned char, 16> bytes;
17462 bytes.reserve (n_elts * elt_size);
17463 for (unsigned int i = 0; i < n_elts; i++)
17464 {
17465 /* The vector is provided in gcc endian-neutral fashion.
17466 For aarch64_be Advanced SIMD, it must be laid out in the vector
17467 register in reverse order. */
17468 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
17469 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
17470
17471 if (elt_mode != elt_int_mode)
17472 elt = gen_lowpart (elt_int_mode, elt);
17473
17474 if (!CONST_INT_P (elt))
17475 return false;
17476
17477 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
17478 for (unsigned int byte = 0; byte < elt_size; byte++)
17479 {
17480 bytes.quick_push (elt_val & 0xff);
17481 elt_val >>= BITS_PER_UNIT;
17482 }
17483 }
17484
17485 /* The immediate must repeat every eight bytes. */
17486 unsigned int nbytes = bytes.length ();
17487 for (unsigned i = 8; i < nbytes; ++i)
17488 if (bytes[i] != bytes[i - 8])
17489 return false;
17490
17491 /* Get the repeating 8-byte value as an integer. No endian correction
17492 is needed here because bytes is already in lsb-first order. */
17493 unsigned HOST_WIDE_INT val64 = 0;
17494 for (unsigned int i = 0; i < 8; i++)
17495 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
17496 << (i * BITS_PER_UNIT));
17497
17498 if (vec_flags & VEC_SVE_DATA)
17499 return aarch64_sve_valid_immediate (val64, info);
17500 else
17501 return aarch64_advsimd_valid_immediate (val64, info, which);
17502 }
17503
17504 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
17505 has a step in the range of INDEX. Return the index expression if so,
17506 otherwise return null. */
17507 rtx
17508 aarch64_check_zero_based_sve_index_immediate (rtx x)
17509 {
17510 rtx base, step;
17511 if (const_vec_series_p (x, &base, &step)
17512 && base == const0_rtx
17513 && aarch64_sve_index_immediate_p (step))
17514 return step;
17515 return NULL_RTX;
17516 }
17517
17518 /* Check whether immediate shift constants are within range. */
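/* A rough illustration for V4SImode (32-bit lanes):
     aarch64_simd_shift_imm_p (GEN_INT (31), V4SImode, true)   -> true
     aarch64_simd_shift_imm_p (GEN_INT (32), V4SImode, true)   -> false
     aarch64_simd_shift_imm_p (GEN_INT (32), V4SImode, false)  -> true
   i.e. left shifts accept 0..31 and right shifts accept 1..32.  */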
17519 bool
17520 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
17521 {
17522 x = unwrap_const_vec_duplicate (x);
17523 if (!CONST_INT_P (x))
17524 return false;
17525 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
17526 if (left)
17527 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
17528 else
17529 return IN_RANGE (INTVAL (x), 1, bit_width);
17530 }
17531
17532 /* Return the bitmask CONST_INT to select the bits required by a zero extract
17533 operation of width WIDTH at bit position POS. */
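/* For example, WIDTH == 8 and POS == 4 yield the mask 0xff0,
   i.e. ((1 << 8) - 1) << 4.  */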
17534
17535 rtx
17536 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
17537 {
17538 gcc_assert (CONST_INT_P (width));
17539 gcc_assert (CONST_INT_P (pos));
17540
17541 unsigned HOST_WIDE_INT mask
17542 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
17543 return GEN_INT (mask << UINTVAL (pos));
17544 }
17545
17546 bool
17547 aarch64_mov_operand_p (rtx x, machine_mode mode)
17548 {
17549 if (GET_CODE (x) == HIGH
17550 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
17551 return true;
17552
17553 if (CONST_INT_P (x))
17554 return true;
17555
17556 if (VECTOR_MODE_P (GET_MODE (x)))
17557 {
17558 /* Require predicate constants to be VNx16BI before RA, so that we
17559 force everything to have a canonical form. */
17560 if (!lra_in_progress
17561 && !reload_completed
17562 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
17563 && GET_MODE (x) != VNx16BImode)
17564 return false;
17565
17566 return aarch64_simd_valid_immediate (x, NULL);
17567 }
17568
17569 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
17570 return true;
17571
17572 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
17573 return true;
17574
17575 return aarch64_classify_symbolic_expression (x)
17576 == SYMBOL_TINY_ABSOLUTE;
17577 }
17578
17579 /* Return a const_int vector of VAL. */
17580 rtx
17581 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
17582 {
17583 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
17584 return gen_const_vec_duplicate (mode, c);
17585 }
17586
17587 /* Check OP is a legal scalar immediate for the MOVI instruction. */
17588
17589 bool
17590 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
17591 {
17592 machine_mode vmode;
17593
17594 vmode = aarch64_simd_container_mode (mode, 64);
17595 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
17596 return aarch64_simd_valid_immediate (op_v, NULL);
17597 }
17598
17599 /* Construct and return a PARALLEL RTX vector with elements numbering the
17600 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
17601 the vector - from the perspective of the architecture. This does not
17602 line up with GCC's perspective on lane numbers, so we end up with
17603 different masks depending on our target endian-ness. The diagram
17604 below may help. We must draw the distinction when building masks
17605 which select one half of the vector. An instruction selecting
17606 architectural low-lanes for a big-endian target must be described using
17607 a mask selecting GCC high-lanes.
17608
17609 Big-Endian Little-Endian
17610
17611 GCC 0 1 2 3 3 2 1 0
17612 | x | x | x | x | | x | x | x | x |
17613 Architecture 3 2 1 0 3 2 1 0
17614
17615 Low Mask: { 2, 3 } { 0, 1 }
17616 High Mask: { 0, 1 } { 2, 3 }
17617
17618 MODE Is the mode of the vector and NUNITS is the number of units in it. */
17619
17620 rtx
17621 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
17622 {
17623 rtvec v = rtvec_alloc (nunits / 2);
17624 int high_base = nunits / 2;
17625 int low_base = 0;
17626 int base;
17627 rtx t1;
17628 int i;
17629
17630 if (BYTES_BIG_ENDIAN)
17631 base = high ? low_base : high_base;
17632 else
17633 base = high ? high_base : low_base;
17634
17635 for (i = 0; i < nunits / 2; i++)
17636 RTVEC_ELT (v, i) = GEN_INT (base + i);
17637
17638 t1 = gen_rtx_PARALLEL (mode, v);
17639 return t1;
17640 }
17641
17642 /* Check OP for validity as a PARALLEL RTX vector with elements
17643 numbering the lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE)
17644 half, from the perspective of the architecture. See the diagram above
17645 aarch64_simd_vect_par_cnst_half for more details. */
17646
17647 bool
17648 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
17649 bool high)
17650 {
17651 int nelts;
17652 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
17653 return false;
17654
17655 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
17656 HOST_WIDE_INT count_op = XVECLEN (op, 0);
17657 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
17658 int i = 0;
17659
17660 if (count_op != count_ideal)
17661 return false;
17662
17663 for (i = 0; i < count_ideal; i++)
17664 {
17665 rtx elt_op = XVECEXP (op, 0, i);
17666 rtx elt_ideal = XVECEXP (ideal, 0, i);
17667
17668 if (!CONST_INT_P (elt_op)
17669 || INTVAL (elt_ideal) != INTVAL (elt_op))
17670 return false;
17671 }
17672 return true;
17673 }
17674
17675 /* Return a PARALLEL containing NELTS elements, with element I equal
17676 to BASE + I * STEP. */
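/* For example, aarch64_gen_stepped_int_parallel (4, 1, 2) returns
   (parallel [1 3 5 7]).  */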
17677
17678 rtx
17679 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
17680 {
17681 rtvec vec = rtvec_alloc (nelts);
17682 for (unsigned int i = 0; i < nelts; ++i)
17683 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
17684 return gen_rtx_PARALLEL (VOIDmode, vec);
17685 }
17686
17687 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
17688 series with step STEP. */
17689
17690 bool
17691 aarch64_stepped_int_parallel_p (rtx op, int step)
17692 {
17693 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
17694 return false;
17695
17696 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
17697 for (int i = 1; i < XVECLEN (op, 0); ++i)
17698 if (!CONST_INT_P (XVECEXP (op, 0, i))
17699 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
17700 return false;
17701
17702 return true;
17703 }
17704
17705 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
17706 HIGH (exclusive). */
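/* For example, with LOW == 0 and HIGH == 4, lanes 0..3 are accepted
   and lane 4 reports "lane 4 out of range 0 - 3".  */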
17707 void
17708 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
17709 const_tree exp)
17710 {
17711 HOST_WIDE_INT lane;
17712 gcc_assert (CONST_INT_P (operand));
17713 lane = INTVAL (operand);
17714
17715 if (lane < low || lane >= high)
17716 {
17717 if (exp)
17718 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
17719 else
17720 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
17721 }
17722 }
17723
17724 /* Perform endian correction on lane number N, which indexes a vector
17725 of mode MODE, and return the result as an SImode rtx. */
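/* For example, assuming the usual ENDIAN_LANE_N mapping, lane 0 of a
   V4SImode vector stays lane 0 on little-endian targets but becomes
   lane 3 on big-endian targets.  */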
17726
17727 rtx
17728 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
17729 {
17730 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
17731 }
17732
17733 /* Return TRUE if OP is a valid vector addressing mode. */
17734
17735 bool
17736 aarch64_simd_mem_operand_p (rtx op)
17737 {
17738 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
17739 || REG_P (XEXP (op, 0)));
17740 }
17741
17742 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
17743
17744 bool
17745 aarch64_sve_ld1r_operand_p (rtx op)
17746 {
17747 struct aarch64_address_info addr;
17748 scalar_mode mode;
17749
17750 return (MEM_P (op)
17751 && is_a <scalar_mode> (GET_MODE (op), &mode)
17752 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
17753 && addr.type == ADDRESS_REG_IMM
17754 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
17755 }
17756
17757 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
17758 where the size of the read data is specified by `mode` and the size of the
17759 vector elements is specified by `elem_mode`. */
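/* A rough illustration for LD1RQ (`mode` == TImode), assuming the usual
   meaning of offset_4bit_signed_scaled_p: a base-plus-immediate address
   needs an offset that is a multiple of 16 in the range [-128, 112],
   while a base-plus-register address needs the index scaled by the
   element size.  */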
17760 bool
17761 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
17762 scalar_mode elem_mode)
17763 {
17764 struct aarch64_address_info addr;
17765 if (!MEM_P (op)
17766 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
17767 return false;
17768
17769 if (addr.type == ADDRESS_REG_IMM)
17770 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
17771
17772 if (addr.type == ADDRESS_REG_REG)
17773 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
17774
17775 return false;
17776 }
17777
17778 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
17779 bool
17780 aarch64_sve_ld1rq_operand_p (rtx op)
17781 {
17782 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
17783 GET_MODE_INNER (GET_MODE (op)));
17784 }
17785
17786 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
17787 accessing a vector where the element size is specified by `elem_mode`. */
17788 bool
17789 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
17790 {
17791 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
17792 }
17793
17794 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
17795 bool
17796 aarch64_sve_ldff1_operand_p (rtx op)
17797 {
17798 if (!MEM_P (op))
17799 return false;
17800
17801 struct aarch64_address_info addr;
17802 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
17803 return false;
17804
17805 if (addr.type == ADDRESS_REG_IMM)
17806 return known_eq (addr.const_offset, 0);
17807
17808 return addr.type == ADDRESS_REG_REG;
17809 }
17810
17811 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
17812 bool
17813 aarch64_sve_ldnf1_operand_p (rtx op)
17814 {
17815 struct aarch64_address_info addr;
17816
17817 return (MEM_P (op)
17818 && aarch64_classify_address (&addr, XEXP (op, 0),
17819 GET_MODE (op), false)
17820 && addr.type == ADDRESS_REG_IMM);
17821 }
17822
17823 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
17824 The conditions for STR are the same. */
17825 bool
17826 aarch64_sve_ldr_operand_p (rtx op)
17827 {
17828 struct aarch64_address_info addr;
17829
17830 return (MEM_P (op)
17831 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
17832 false, ADDR_QUERY_ANY)
17833 && addr.type == ADDRESS_REG_IMM);
17834 }
17835
17836 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
17837 addressing memory of mode MODE. */
17838 bool
17839 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
17840 {
17841 struct aarch64_address_info addr;
17842 if (!aarch64_classify_address (&addr, op, mode, false))
17843 return false;
17844
17845 if (addr.type == ADDRESS_REG_IMM)
17846 return known_eq (addr.const_offset, 0);
17847
17848 return addr.type == ADDRESS_REG_REG;
17849 }
17850
17851 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
17852 We need to be able to access the individual pieces, so the range
17853 is different from LD[234] and ST[234]. */
17854 bool
17855 aarch64_sve_struct_memory_operand_p (rtx op)
17856 {
17857 if (!MEM_P (op))
17858 return false;
17859
17860 machine_mode mode = GET_MODE (op);
17861 struct aarch64_address_info addr;
17862 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
17863 ADDR_QUERY_ANY)
17864 || addr.type != ADDRESS_REG_IMM)
17865 return false;
17866
17867 poly_int64 first = addr.const_offset;
17868 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
17869 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
17870 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
17871 }
17872
17873 /* Emit a register copy from operand to operand, taking care not to
17874 early-clobber source registers in the process.
17875
17876 COUNT is the number of components into which the copy needs to be
17877 decomposed. */
17878 void
17879 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
17880 unsigned int count)
17881 {
17882 unsigned int i;
17883 int rdest = REGNO (operands[0]);
17884 int rsrc = REGNO (operands[1]);
17885
17886 if (!reg_overlap_mentioned_p (operands[0], operands[1])
17887 || rdest < rsrc)
17888 for (i = 0; i < count; i++)
17889 emit_move_insn (gen_rtx_REG (mode, rdest + i),
17890 gen_rtx_REG (mode, rsrc + i));
17891 else
17892 for (i = 0; i < count; i++)
17893 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
17894 gen_rtx_REG (mode, rsrc + count - i - 1));
17895 }
17896
17897 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
17898 one of VSTRUCT modes: OI, CI, or XI. */
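/* For example, CImode (a three-vector register list) gives
   (48 / UNITS_PER_VREG) * 4 == 12, i.e. three 4-byte instructions.  */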
17899 int
17900 aarch64_simd_attr_length_rglist (machine_mode mode)
17901 {
17902 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
17903 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
17904 }
17905
17906 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
17907 alignment of a vector to 128 bits. SVE predicates have an alignment of
17908 16 bits. */
17909 static HOST_WIDE_INT
17910 aarch64_simd_vector_alignment (const_tree type)
17911 {
17912 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
17913 be set for non-predicate vectors of booleans. Modes are the most
17914 direct way we have of identifying real SVE predicate types. */
17915 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
17916 return 16;
17917 widest_int min_size
17918 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
17919 return wi::umin (min_size, 128).to_uhwi ();
17920 }
17921
17922 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
17923 static poly_uint64
17924 aarch64_vectorize_preferred_vector_alignment (const_tree type)
17925 {
17926 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
17927 {
17928 /* If the length of the vector is fixed, try to align to that length,
17929 otherwise don't try to align at all. */
17930 HOST_WIDE_INT result;
17931 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
17932 result = TYPE_ALIGN (TREE_TYPE (type));
17933 return result;
17934 }
17935 return TYPE_ALIGN (type);
17936 }
17937
17938 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
17939 static bool
17940 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
17941 {
17942 if (is_packed)
17943 return false;
17944
17945 /* For fixed-length vectors, check that the vectorizer will aim for
17946 full-vector alignment. This isn't true for generic GCC vectors
17947 that are wider than the ABI maximum of 128 bits. */
17948 poly_uint64 preferred_alignment =
17949 aarch64_vectorize_preferred_vector_alignment (type);
17950 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
17951 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
17952 preferred_alignment))
17953 return false;
17954
17955 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
17956 return true;
17957 }
17958
17959 /* Return true if the vector misalignment factor is supported by the
17960 target. */
17961 static bool
17962 aarch64_builtin_support_vector_misalignment (machine_mode mode,
17963 const_tree type, int misalignment,
17964 bool is_packed)
17965 {
17966 if (TARGET_SIMD && STRICT_ALIGNMENT)
17967 {
17968 /* Return false if the movmisalign pattern is not supported for this mode. */
17969 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
17970 return false;
17971
17972 /* Misalignment factor is unknown at compile time. */
17973 if (misalignment == -1)
17974 return false;
17975 }
17976 return default_builtin_support_vector_misalignment (mode, type, misalignment,
17977 is_packed);
17978 }
17979
17980 /* If VALS is a vector constant that can be loaded into a register
17981 using DUP, generate instructions to do so and return an RTX to
17982 assign to the register. Otherwise return NULL_RTX. */
17983 static rtx
17984 aarch64_simd_dup_constant (rtx vals)
17985 {
17986 machine_mode mode = GET_MODE (vals);
17987 machine_mode inner_mode = GET_MODE_INNER (mode);
17988 rtx x;
17989
17990 if (!const_vec_duplicate_p (vals, &x))
17991 return NULL_RTX;
17992
17993 /* We can load this constant by using DUP and a constant in a
17994 single ARM register. This will be cheaper than a vector
17995 load. */
17996 x = copy_to_mode_reg (inner_mode, x);
17997 return gen_vec_duplicate (mode, x);
17998 }
17999
18000
18001 /* Generate code to load VALS, which is a PARALLEL containing only
18002 constants (for vec_init) or CONST_VECTOR, efficiently into a
18003 register. Returns an RTX to copy into the register, or NULL_RTX
18004 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
18005 static rtx
18006 aarch64_simd_make_constant (rtx vals)
18007 {
18008 machine_mode mode = GET_MODE (vals);
18009 rtx const_dup;
18010 rtx const_vec = NULL_RTX;
18011 int n_const = 0;
18012 int i;
18013
18014 if (GET_CODE (vals) == CONST_VECTOR)
18015 const_vec = vals;
18016 else if (GET_CODE (vals) == PARALLEL)
18017 {
18018 /* A CONST_VECTOR must contain only CONST_INTs and
18019 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
18020 Only store valid constants in a CONST_VECTOR. */
18021 int n_elts = XVECLEN (vals, 0);
18022 for (i = 0; i < n_elts; ++i)
18023 {
18024 rtx x = XVECEXP (vals, 0, i);
18025 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18026 n_const++;
18027 }
18028 if (n_const == n_elts)
18029 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
18030 }
18031 else
18032 gcc_unreachable ();
18033
18034 if (const_vec != NULL_RTX
18035 && aarch64_simd_valid_immediate (const_vec, NULL))
18036 /* Load using MOVI/MVNI. */
18037 return const_vec;
18038 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
18039 /* Loaded using DUP. */
18040 return const_dup;
18041 else if (const_vec != NULL_RTX)
18042 /* Load from constant pool. We cannot take advantage of single-cycle
18043 LD1 because we need a PC-relative addressing mode. */
18044 return const_vec;
18045 else
18046 /* A PARALLEL containing something not valid inside CONST_VECTOR.
18047 We cannot construct an initializer. */
18048 return NULL_RTX;
18049 }
18050
18051 /* Expand a vector initialisation sequence, such that TARGET is
18052 initialised to contain VALS. */
18053
18054 void
18055 aarch64_expand_vector_init (rtx target, rtx vals)
18056 {
18057 machine_mode mode = GET_MODE (target);
18058 scalar_mode inner_mode = GET_MODE_INNER (mode);
18059 /* The number of vector elements. */
18060 int n_elts = XVECLEN (vals, 0);
18061 /* The number of vector elements which are not constant. */
18062 int n_var = 0;
18063 rtx any_const = NULL_RTX;
18064 /* The first element of vals. */
18065 rtx v0 = XVECEXP (vals, 0, 0);
18066 bool all_same = true;
18067
18068 /* This is a special vec_init<M><N> where N is not an element mode but a
18069 vector mode with half the elements of M. We expect to find two entries
18070 of mode N in VALS and we must put their concatenation into TARGET. */
18071 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
18072 {
18073 gcc_assert (known_eq (GET_MODE_SIZE (mode),
18074 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
18075 rtx lo = XVECEXP (vals, 0, 0);
18076 rtx hi = XVECEXP (vals, 0, 1);
18077 machine_mode narrow_mode = GET_MODE (lo);
18078 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
18079 gcc_assert (narrow_mode == GET_MODE (hi));
18080
18081 /* When we want to concatenate a half-width vector with zeroes we can
18082 use the aarch64_combinez[_be] patterns. Just make sure that the
18083 zeroes are in the right half. */
18084 if (BYTES_BIG_ENDIAN
18085 && aarch64_simd_imm_zero (lo, narrow_mode)
18086 && general_operand (hi, narrow_mode))
18087 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
18088 else if (!BYTES_BIG_ENDIAN
18089 && aarch64_simd_imm_zero (hi, narrow_mode)
18090 && general_operand (lo, narrow_mode))
18091 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
18092 else
18093 {
18094 /* Else create the two half-width registers and combine them. */
18095 if (!REG_P (lo))
18096 lo = force_reg (GET_MODE (lo), lo);
18097 if (!REG_P (hi))
18098 hi = force_reg (GET_MODE (hi), hi);
18099
18100 if (BYTES_BIG_ENDIAN)
18101 std::swap (lo, hi);
18102 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
18103 }
18104 return;
18105 }
18106
18107 /* Count the number of variable elements to initialise. */
18108 for (int i = 0; i < n_elts; ++i)
18109 {
18110 rtx x = XVECEXP (vals, 0, i);
18111 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
18112 ++n_var;
18113 else
18114 any_const = x;
18115
18116 all_same &= rtx_equal_p (x, v0);
18117 }
18118
18119 /* No variable elements, hand off to aarch64_simd_make_constant which knows
18120 how best to handle this. */
18121 if (n_var == 0)
18122 {
18123 rtx constant = aarch64_simd_make_constant (vals);
18124 if (constant != NULL_RTX)
18125 {
18126 emit_move_insn (target, constant);
18127 return;
18128 }
18129 }
18130
18131 /* Splat a single non-constant element if we can. */
18132 if (all_same)
18133 {
18134 rtx x = copy_to_mode_reg (inner_mode, v0);
18135 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18136 return;
18137 }
18138
18139 enum insn_code icode = optab_handler (vec_set_optab, mode);
18140 gcc_assert (icode != CODE_FOR_nothing);
18141
18142 /* If there are only variable elements, try to optimize
18143 the insertion using dup for the most common element
18144 followed by insertions. */
18145
18146 /* The algorithm will fill matches[*][0] with the earliest matching element,
18147 and matches[X][1] with the count of duplicate elements (if X is the
18148 earliest element which has duplicates). */
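/* For example, for { x, y, x, x } this records matches[0][1] == 3 and
   matches[2][0] == matches[3][0] == 0, so element 0 is chosen as the
   value to duplicate and only y needs a separate insertion.  */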
18149
18150 if (n_var == n_elts && n_elts <= 16)
18151 {
18152 int matches[16][2] = {0};
18153 for (int i = 0; i < n_elts; i++)
18154 {
18155 for (int j = 0; j <= i; j++)
18156 {
18157 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
18158 {
18159 matches[i][0] = j;
18160 matches[j][1]++;
18161 break;
18162 }
18163 }
18164 }
18165 int maxelement = 0;
18166 int maxv = 0;
18167 for (int i = 0; i < n_elts; i++)
18168 if (matches[i][1] > maxv)
18169 {
18170 maxelement = i;
18171 maxv = matches[i][1];
18172 }
18173
18174 /* Create a duplicate of the most common element, unless all elements
18175 are equally useless to us, in which case just immediately set the
18176 vector register using the first element. */
18177
18178 if (maxv == 1)
18179 {
18180 /* For vectors of two 64-bit elements, we can do even better. */
18181 if (n_elts == 2
18182 && (inner_mode == E_DImode
18183 || inner_mode == E_DFmode))
18184
18185 {
18186 rtx x0 = XVECEXP (vals, 0, 0);
18187 rtx x1 = XVECEXP (vals, 0, 1);
18188 /* Combine can pick up this case, but handling it directly
18189 here leaves clearer RTL.
18190
18191 This is load_pair_lanes<mode>, and also gives us a clean-up
18192 for store_pair_lanes<mode>. */
18193 if (memory_operand (x0, inner_mode)
18194 && memory_operand (x1, inner_mode)
18195 && !STRICT_ALIGNMENT
18196 && rtx_equal_p (XEXP (x1, 0),
18197 plus_constant (Pmode,
18198 XEXP (x0, 0),
18199 GET_MODE_SIZE (inner_mode))))
18200 {
18201 rtx t;
18202 if (inner_mode == DFmode)
18203 t = gen_load_pair_lanesdf (target, x0, x1);
18204 else
18205 t = gen_load_pair_lanesdi (target, x0, x1);
18206 emit_insn (t);
18207 return;
18208 }
18209 }
18210 /* The subreg-move sequence below will move into lane zero of the
18211 vector register. For big-endian we want that position to hold
18212 the last element of VALS. */
18213 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
18214 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18215 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
18216 }
18217 else
18218 {
18219 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18220 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18221 }
18222
18223 /* Insert the rest. */
18224 for (int i = 0; i < n_elts; i++)
18225 {
18226 rtx x = XVECEXP (vals, 0, i);
18227 if (matches[i][0] == maxelement)
18228 continue;
18229 x = copy_to_mode_reg (inner_mode, x);
18230 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18231 }
18232 return;
18233 }
18234
18235 /* Initialise a vector which is part-variable. We want to first try
18236 to build those lanes which are constant in the most efficient way we
18237 can. */
18238 if (n_var != n_elts)
18239 {
18240 rtx copy = copy_rtx (vals);
18241
18242 /* Load constant part of vector. We really don't care what goes into the
18243 parts we will overwrite, but we're more likely to be able to load the
18244 constant efficiently if it has fewer, larger, repeating parts
18245 (see aarch64_simd_valid_immediate). */
18246 for (int i = 0; i < n_elts; i++)
18247 {
18248 rtx x = XVECEXP (vals, 0, i);
18249 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18250 continue;
18251 rtx subst = any_const;
18252 for (int bit = n_elts / 2; bit > 0; bit /= 2)
18253 {
18254 /* Look in the copied vector, as more elements are const. */
18255 rtx test = XVECEXP (copy, 0, i ^ bit);
18256 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
18257 {
18258 subst = test;
18259 break;
18260 }
18261 }
18262 XVECEXP (copy, 0, i) = subst;
18263 }
18264 aarch64_expand_vector_init (target, copy);
18265 }
18266
18267 /* Insert the variable lanes directly. */
18268 for (int i = 0; i < n_elts; i++)
18269 {
18270 rtx x = XVECEXP (vals, 0, i);
18271 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18272 continue;
18273 x = copy_to_mode_reg (inner_mode, x);
18274 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18275 }
18276 }
18277
18278 /* Emit RTL corresponding to:
18279 insr TARGET, ELEM. */
18280
18281 static void
18282 emit_insr (rtx target, rtx elem)
18283 {
18284 machine_mode mode = GET_MODE (target);
18285 scalar_mode elem_mode = GET_MODE_INNER (mode);
18286 elem = force_reg (elem_mode, elem);
18287
18288 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
18289 gcc_assert (icode != CODE_FOR_nothing);
18290 emit_insn (GEN_FCN (icode) (target, target, elem));
18291 }
18292
18293 /* Subroutine of aarch64_sve_expand_vector_init for handling
18294 trailing constants.
18295 This function works as follows:
18296 (a) Create a new vector consisting of trailing constants.
18297 (b) Initialize TARGET with the constant vector using emit_move_insn.
18298 (c) Insert remaining elements in TARGET using insr.
18299 NELTS is the total number of elements in the original vector, while
18300 NELTS_REQD is the number of elements that are actually
18301 significant.
18302
18303 ??? The heuristic used is to do the above only if the number of constants
18304 is at least half the total number of elements. May need fine-tuning. */
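/* For example, for { a, b, 1, 2 } (NELTS_REQD == 4) the two trailing
   constants meet the threshold: TARGET is first loaded with the
   constant vector built from { 1, 2, ... } and then b and a are
   shifted in with INSR, each insertion moving the vector up by one
   lane.  */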
18305
18306 static bool
18307 aarch64_sve_expand_vector_init_handle_trailing_constants
18308 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
18309 {
18310 machine_mode mode = GET_MODE (target);
18311 scalar_mode elem_mode = GET_MODE_INNER (mode);
18312 int n_trailing_constants = 0;
18313
18314 for (int i = nelts_reqd - 1;
18315 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
18316 i--)
18317 n_trailing_constants++;
18318
18319 if (n_trailing_constants >= nelts_reqd / 2)
18320 {
18321 rtx_vector_builder v (mode, 1, nelts);
18322 for (int i = 0; i < nelts; i++)
18323 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
18324 rtx const_vec = v.build ();
18325 emit_move_insn (target, const_vec);
18326
18327 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
18328 emit_insr (target, builder.elt (i));
18329
18330 return true;
18331 }
18332
18333 return false;
18334 }
18335
18336 /* Subroutine of aarch64_sve_expand_vector_init.
18337 Works as follows:
18338 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
18339 (b) Skip trailing elements from BUILDER, which are the same as
18340 element NELTS_REQD - 1.
18341 (c) Insert earlier elements in reverse order in TARGET using insr. */
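/* For example, for { a, b, c, c } and NELTS_REQD == 4, TARGET is set
   to a broadcast of c, the duplicated trailing c is skipped, and then
   INSR of b followed by INSR of a produces { a, b, c, c, ... }.  */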
18342
18343 static void
18344 aarch64_sve_expand_vector_init_insert_elems (rtx target,
18345 const rtx_vector_builder &builder,
18346 int nelts_reqd)
18347 {
18348 machine_mode mode = GET_MODE (target);
18349 scalar_mode elem_mode = GET_MODE_INNER (mode);
18350
18351 struct expand_operand ops[2];
18352 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
18353 gcc_assert (icode != CODE_FOR_nothing);
18354
18355 create_output_operand (&ops[0], target, mode);
18356 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
18357 expand_insn (icode, 2, ops);
18358
18359 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
18360 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
18361 emit_insr (target, builder.elt (i));
18362 }
18363
18364 /* Subroutine of aarch64_sve_expand_vector_init to handle case
18365 when all trailing elements of builder are same.
18366 This works as follows:
18367 (a) Use expand_insn interface to broadcast last vector element in TARGET.
18368 (b) Insert remaining elements in TARGET using insr.
18369
18370 ??? The heuristic used is to do the above if the number of identical trailing
18371 elements is at least 3/4 of the total number of elements, loosely based on
18372 the heuristic from mostly_zeros_p. May need fine-tuning. */
18373
18374 static bool
18375 aarch64_sve_expand_vector_init_handle_trailing_same_elem
18376 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
18377 {
18378 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
18379 if (ndups >= (3 * nelts_reqd) / 4)
18380 {
18381 aarch64_sve_expand_vector_init_insert_elems (target, builder,
18382 nelts_reqd - ndups + 1);
18383 return true;
18384 }
18385
18386 return false;
18387 }
18388
18389 /* Initialize register TARGET from BUILDER. NELTS is the constant number
18390 of elements in BUILDER.
18391
18392 The function tries to initialize TARGET from BUILDER if it fits one
18393 of the special cases outlined below.
18394
18395 Failing that, the function divides BUILDER into two sub-vectors:
18396 v_even = even elements of BUILDER;
18397 v_odd = odd elements of BUILDER;
18398
18399 and recursively calls itself with v_even and v_odd.
18400
18401 if (recursive call succeeded for v_even or v_odd)
18402 TARGET = zip (v_even, v_odd)
18403
18404 The function returns true if it managed to build TARGET from BUILDER
18405 with one of the special cases, false otherwise.
18406
18407 Example: {a, 1, b, 2, c, 3, d, 4}
18408
18409 The vector gets divided into:
18410 v_even = {a, b, c, d}
18411 v_odd = {1, 2, 3, 4}
18412
18413 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
18414 initializes tmp2 from the constant vector v_odd using emit_move_insn.
18415
18416 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
18417 4 elements, so we construct tmp1 from v_even using insr:
18418 tmp1 = dup(d)
18419 insr tmp1, c
18420 insr tmp1, b
18421 insr tmp1, a
18422
18423 And finally:
18424 TARGET = zip (tmp1, tmp2)
18425 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
18426
18427 static bool
18428 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
18429 int nelts, int nelts_reqd)
18430 {
18431 machine_mode mode = GET_MODE (target);
18432
18433 /* Case 1: Vector contains trailing constants. */
18434
18435 if (aarch64_sve_expand_vector_init_handle_trailing_constants
18436 (target, builder, nelts, nelts_reqd))
18437 return true;
18438
18439 /* Case 2: Vector contains leading constants. */
18440
18441 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
18442 for (int i = 0; i < nelts_reqd; i++)
18443 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
18444 rev_builder.finalize ();
18445
18446 if (aarch64_sve_expand_vector_init_handle_trailing_constants
18447 (target, rev_builder, nelts, nelts_reqd))
18448 {
18449 emit_insn (gen_aarch64_sve_rev (mode, target, target));
18450 return true;
18451 }
18452
18453 /* Case 3: Vector contains trailing same element. */
18454
18455 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
18456 (target, builder, nelts_reqd))
18457 return true;
18458
18459 /* Case 4: Vector contains leading same element. */
18460
18461 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
18462 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
18463 {
18464 emit_insn (gen_aarch64_sve_rev (mode, target, target));
18465 return true;
18466 }
18467
18468 /* Avoid recursing below 4 elements.
18469 ??? The threshold 4 may need fine-tuning. */
18470
18471 if (nelts_reqd <= 4)
18472 return false;
18473
18474 rtx_vector_builder v_even (mode, 1, nelts);
18475 rtx_vector_builder v_odd (mode, 1, nelts);
18476
18477 for (int i = 0; i < nelts * 2; i += 2)
18478 {
18479 v_even.quick_push (builder.elt (i));
18480 v_odd.quick_push (builder.elt (i + 1));
18481 }
18482
18483 v_even.finalize ();
18484 v_odd.finalize ();
18485
18486 rtx tmp1 = gen_reg_rtx (mode);
18487 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
18488 nelts, nelts_reqd / 2);
18489
18490 rtx tmp2 = gen_reg_rtx (mode);
18491 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
18492 nelts, nelts_reqd / 2);
18493
18494 if (!did_even_p && !did_odd_p)
18495 return false;
18496
18497 /* Initialize v_even and v_odd using INSR if they didn't match any of the
18498 special cases, then zip v_even and v_odd. */
18499
18500 if (!did_even_p)
18501 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
18502
18503 if (!did_odd_p)
18504 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
18505
18506 rtvec v = gen_rtvec (2, tmp1, tmp2);
18507 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
18508 return true;
18509 }
18510
18511 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
18512
18513 void
18514 aarch64_sve_expand_vector_init (rtx target, rtx vals)
18515 {
18516 machine_mode mode = GET_MODE (target);
18517 int nelts = XVECLEN (vals, 0);
18518
18519 rtx_vector_builder v (mode, 1, nelts);
18520 for (int i = 0; i < nelts; i++)
18521 v.quick_push (XVECEXP (vals, 0, i));
18522 v.finalize ();
18523
18524 /* If neither sub-vector of v could be initialized specially,
18525 then use INSR to insert all elements from v into TARGET.
18526 ??? This might not be optimal for vectors with large
18527 initializers like 16-element or above.
18528 For nelts < 4, it probably isn't useful to handle specially. */
18529
18530 if (nelts < 4
18531 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
18532 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
18533 }
18534
18535 /* Check whether VALUE is a vector constant in which every element
18536 is either a power of 2 or a negated power of 2. If so, return
18537 a constant vector of log2s, and flip CODE between PLUS and MINUS
18538 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
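/* For example, { 4, 4, 4, 4 } yields the shift vector { 2, 2, 2, 2 }
   with CODE left unchanged, while { -8, -8, -8, -8 } yields
   { 3, 3, 3, 3 } and flips CODE between PLUS and MINUS.  */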
18539
18540 static rtx
18541 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
18542 {
18543 if (GET_CODE (value) != CONST_VECTOR)
18544 return NULL_RTX;
18545
18546 rtx_vector_builder builder;
18547 if (!builder.new_unary_operation (GET_MODE (value), value, false))
18548 return NULL_RTX;
18549
18550 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
18551 /* 1 if the result of the multiplication must be negated,
18552 0 if it mustn't, or -1 if we don't yet care. */
18553 int negate = -1;
18554 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
18555 for (unsigned int i = 0; i < encoded_nelts; ++i)
18556 {
18557 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
18558 if (!CONST_SCALAR_INT_P (elt))
18559 return NULL_RTX;
18560 rtx_mode_t val (elt, int_mode);
18561 wide_int pow2 = wi::neg (val);
18562 if (val != pow2)
18563 {
18564 /* It matters whether we negate or not. Make that choice,
18565 and make sure that it's consistent with previous elements. */
18566 if (negate == !wi::neg_p (val))
18567 return NULL_RTX;
18568 negate = wi::neg_p (val);
18569 if (!negate)
18570 pow2 = val;
18571 }
18572 /* POW2 is now the value that we want to be a power of 2. */
18573 int shift = wi::exact_log2 (pow2);
18574 if (shift < 0)
18575 return NULL_RTX;
18576 builder.quick_push (gen_int_mode (shift, int_mode));
18577 }
18578 if (negate == -1)
18579 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
18580 code = PLUS;
18581 else if (negate == 1)
18582 code = code == PLUS ? MINUS : PLUS;
18583 return builder.build ();
18584 }
18585
18586 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
18587 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
18588 operands array, in the same order as for fma_optab. Return true if
18589 the function emitted all the necessary instructions, false if the caller
18590 should generate the pattern normally with the new OPERANDS array. */
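/* For example, a multiply-add whose multiplier operand is the constant
   vector { 4, 4, ... } is emitted here as a vector shift left by 2
   followed by an add, avoiding the multiplication.  */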
18591
18592 bool
18593 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
18594 {
18595 machine_mode mode = GET_MODE (operands[0]);
18596 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
18597 {
18598 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
18599 NULL_RTX, true, OPTAB_DIRECT);
18600 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
18601 operands[3], product, operands[0], true,
18602 OPTAB_DIRECT);
18603 return true;
18604 }
18605 operands[2] = force_reg (mode, operands[2]);
18606 return false;
18607 }
18608
18609 /* Likewise, but for a conditional pattern. */
18610
18611 bool
18612 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
18613 {
18614 machine_mode mode = GET_MODE (operands[0]);
18615 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
18616 {
18617 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
18618 NULL_RTX, true, OPTAB_DIRECT);
18619 emit_insn (gen_cond (code, mode, operands[0], operands[1],
18620 operands[4], product, operands[5]));
18621 return true;
18622 }
18623 operands[3] = force_reg (mode, operands[3]);
18624 return false;
18625 }
18626
18627 static unsigned HOST_WIDE_INT
18628 aarch64_shift_truncation_mask (machine_mode mode)
18629 {
18630 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
18631 return 0;
18632 return GET_MODE_UNIT_BITSIZE (mode) - 1;
18633 }
18634
18635 /* Select a format to encode pointers in exception handling data. */
18636 int
18637 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
18638 {
18639 int type;
18640 switch (aarch64_cmodel)
18641 {
18642 case AARCH64_CMODEL_TINY:
18643 case AARCH64_CMODEL_TINY_PIC:
18644 case AARCH64_CMODEL_SMALL:
18645 case AARCH64_CMODEL_SMALL_PIC:
18646 case AARCH64_CMODEL_SMALL_SPIC:
18647 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
18648 for everything. */
18649 type = DW_EH_PE_sdata4;
18650 break;
18651 default:
18652 /* No assumptions here. 8-byte relocs required. */
18653 type = DW_EH_PE_sdata8;
18654 break;
18655 }
18656 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
18657 }
18658
18659 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
18660
18661 static void
18662 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
18663 {
18664 if (TREE_CODE (decl) == FUNCTION_DECL)
18665 {
18666 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
18667 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
18668 {
18669 fprintf (stream, "\t.variant_pcs\t");
18670 assemble_name (stream, name);
18671 fprintf (stream, "\n");
18672 }
18673 }
18674 }
18675
18676 /* The last .arch and .tune assembly strings that we printed. */
18677 static std::string aarch64_last_printed_arch_string;
18678 static std::string aarch64_last_printed_tune_string;
18679
18680 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
18681 by the function fndecl. */
18682
18683 void
18684 aarch64_declare_function_name (FILE *stream, const char* name,
18685 tree fndecl)
18686 {
18687 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18688
18689 struct cl_target_option *targ_options;
18690 if (target_parts)
18691 targ_options = TREE_TARGET_OPTION (target_parts);
18692 else
18693 targ_options = TREE_TARGET_OPTION (target_option_current_node);
18694 gcc_assert (targ_options);
18695
18696 const struct processor *this_arch
18697 = aarch64_get_arch (targ_options->x_explicit_arch);
18698
18699 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
18700 std::string extension
18701 = aarch64_get_extension_string_for_isa_flags (isa_flags,
18702 this_arch->flags);
18703 /* Only update the assembler .arch string if it is distinct from the last
18704 such string we printed. */
18705 std::string to_print = this_arch->name + extension;
18706 if (to_print != aarch64_last_printed_arch_string)
18707 {
18708 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
18709 aarch64_last_printed_arch_string = to_print;
18710 }
18711
18712 /* Print the cpu name we're tuning for in the comments; it might be
18713 useful to readers of the generated asm. Do it only when it changes
18714 from function to function and verbose assembly is requested. */
18715 const struct processor *this_tune
18716 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
18717
18718 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
18719 {
18720 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
18721 this_tune->name);
18722 aarch64_last_printed_tune_string = this_tune->name;
18723 }
18724
18725 aarch64_asm_output_variant_pcs (stream, fndecl, name);
18726
18727 /* Don't forget the type directive for ELF. */
18728 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
18729 ASM_OUTPUT_LABEL (stream, name);
18730
18731 cfun->machine->label_is_assembled = true;
18732 }
18733
18734 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
18735 the function label and emit a BTI if necessary. */
18736
18737 void
18738 aarch64_print_patchable_function_entry (FILE *file,
18739 unsigned HOST_WIDE_INT patch_area_size,
18740 bool record_p)
18741 {
18742 if (cfun->machine->label_is_assembled
18743 && aarch64_bti_enabled ()
18744 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
18745 {
18746 /* Remove the BTI that follows the patch area and insert a new BTI
18747 before the patch area right after the function label. */
18748 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
18749 if (insn
18750 && INSN_P (insn)
18751 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
18752 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
18753 delete_insn (insn);
18754 asm_fprintf (file, "\thint\t34 // bti c\n");
18755 }
18756
18757 default_print_patchable_function_entry (file, patch_area_size, record_p);
18758 }
18759
18760 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
18761
18762 void
18763 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
18764 {
18765 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
18766 const char *value = IDENTIFIER_POINTER (target);
18767 aarch64_asm_output_variant_pcs (stream, decl, name);
18768 ASM_OUTPUT_DEF (stream, name, value);
18769 }
18770
18771 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
18772 function symbol references. */
18773
18774 void
18775 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
18776 {
18777 default_elf_asm_output_external (stream, decl, name);
18778 aarch64_asm_output_variant_pcs (stream, decl, name);
18779 }
18780
18781 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
18782 Used to output the .cfi_b_key_frame directive when signing the current
18783 function with the B key. */
18784
18785 void
18786 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
18787 {
18788 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
18789 && aarch64_ra_sign_key == AARCH64_KEY_B)
18790 asm_fprintf (f, "\t.cfi_b_key_frame\n");
18791 }
18792
18793 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
18794
18795 static void
18796 aarch64_start_file (void)
18797 {
18798 struct cl_target_option *default_options
18799 = TREE_TARGET_OPTION (target_option_default_node);
18800
18801 const struct processor *default_arch
18802 = aarch64_get_arch (default_options->x_explicit_arch);
18803 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
18804 std::string extension
18805 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
18806 default_arch->flags);
18807
18808 aarch64_last_printed_arch_string = default_arch->name + extension;
18809 aarch64_last_printed_tune_string = "";
18810 asm_fprintf (asm_out_file, "\t.arch %s\n",
18811 aarch64_last_printed_arch_string.c_str ());
18812
18813 default_file_start ();
18814 }
18815
18816 /* Emit load exclusive. */
18817
18818 static void
18819 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
18820 rtx mem, rtx model_rtx)
18821 {
18822 if (mode == TImode)
18823 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
18824 gen_highpart (DImode, rval),
18825 mem, model_rtx));
18826 else
18827 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
18828 }
18829
18830 /* Emit store exclusive. */
18831
18832 static void
18833 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
18834 rtx mem, rtx rval, rtx model_rtx)
18835 {
18836 if (mode == TImode)
18837 emit_insn (gen_aarch64_store_exclusive_pair
18838 (bval, mem, operand_subword (rval, 0, 0, TImode),
18839 operand_subword (rval, 1, 0, TImode), model_rtx));
18840 else
18841 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
18842 }
18843
18844 /* Mark the previous jump instruction as unlikely. */
18845
18846 static void
18847 aarch64_emit_unlikely_jump (rtx insn)
18848 {
18849 rtx_insn *jump = emit_jump_insn (insn);
18850 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
18851 }
18852
18853 /* We store the names of the various atomic helpers in a 5x4 array.
18854 Return the libcall function given MODE, MODEL and NAMES. */
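/* For example, an SImode operation with MEMMODEL_ACQUIRE selects
   mode_idx 2 and model_idx 1, so with the CAS names below the helper
   chosen is "__aarch64_cas4_acq".  */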
18855
18856 rtx
18857 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
18858 const atomic_ool_names *names)
18859 {
18860 memmodel model = memmodel_base (INTVAL (model_rtx));
18861 int mode_idx, model_idx;
18862
18863 switch (mode)
18864 {
18865 case E_QImode:
18866 mode_idx = 0;
18867 break;
18868 case E_HImode:
18869 mode_idx = 1;
18870 break;
18871 case E_SImode:
18872 mode_idx = 2;
18873 break;
18874 case E_DImode:
18875 mode_idx = 3;
18876 break;
18877 case E_TImode:
18878 mode_idx = 4;
18879 break;
18880 default:
18881 gcc_unreachable ();
18882 }
18883
18884 switch (model)
18885 {
18886 case MEMMODEL_RELAXED:
18887 model_idx = 0;
18888 break;
18889 case MEMMODEL_CONSUME:
18890 case MEMMODEL_ACQUIRE:
18891 model_idx = 1;
18892 break;
18893 case MEMMODEL_RELEASE:
18894 model_idx = 2;
18895 break;
18896 case MEMMODEL_ACQ_REL:
18897 case MEMMODEL_SEQ_CST:
18898 model_idx = 3;
18899 break;
18900 default:
18901 gcc_unreachable ();
18902 }
18903
18904 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
18905 VISIBILITY_HIDDEN);
18906 }
18907
18908 #define DEF0(B, N) \
18909 { "__aarch64_" #B #N "_relax", \
18910 "__aarch64_" #B #N "_acq", \
18911 "__aarch64_" #B #N "_rel", \
18912 "__aarch64_" #B #N "_acq_rel" }
18913
18914 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
18915 { NULL, NULL, NULL, NULL }
18916 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
18917
18918 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
18919 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
18920 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
18921 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
18922 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
18923 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
18924
18925 #undef DEF0
18926 #undef DEF4
18927 #undef DEF5
18928
18929 /* Expand a compare and swap pattern. */
18930
18931 void
18932 aarch64_expand_compare_and_swap (rtx operands[])
18933 {
18934 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
18935 machine_mode mode, r_mode;
18936
18937 bval = operands[0];
18938 rval = operands[1];
18939 mem = operands[2];
18940 oldval = operands[3];
18941 newval = operands[4];
18942 is_weak = operands[5];
18943 mod_s = operands[6];
18944 mod_f = operands[7];
18945 mode = GET_MODE (mem);
18946
18947 /* Normally the succ memory model must be stronger than fail, but in the
18948 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
18949 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
18950 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
18951 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
18952 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
18953
18954 r_mode = mode;
18955 if (mode == QImode || mode == HImode)
18956 {
18957 r_mode = SImode;
18958 rval = gen_reg_rtx (r_mode);
18959 }
18960
18961 if (TARGET_LSE)
18962 {
18963 /* The CAS insn requires that oldval and rval overlap, but we need to
18964 have a copy of oldval saved across the operation to tell if
18965 the operation is successful. */
18966 if (reg_overlap_mentioned_p (rval, oldval))
18967 rval = copy_to_mode_reg (r_mode, oldval);
18968 else
18969 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
18970
18971 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
18972 newval, mod_s));
18973 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18974 }
18975 else if (TARGET_OUTLINE_ATOMICS)
18976 {
18977 /* Oldval must satisfy compare afterward. */
18978 if (!aarch64_plus_operand (oldval, mode))
18979 oldval = force_reg (mode, oldval);
18980 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
18981 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
18982 oldval, mode, newval, mode,
18983 XEXP (mem, 0), Pmode);
18984 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18985 }
18986 else
18987 {
18988 /* The oldval predicate varies by mode. Test it and force to reg. */
18989 insn_code code = code_for_aarch64_compare_and_swap (mode);
18990 if (!insn_data[code].operand[2].predicate (oldval, mode))
18991 oldval = force_reg (mode, oldval);
18992
18993 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
18994 is_weak, mod_s, mod_f));
18995 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
18996 }
18997
18998 if (r_mode != mode)
18999 rval = gen_lowpart (mode, rval);
19000 emit_move_insn (operands[1], rval);
19001
19002 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
19003 emit_insn (gen_rtx_SET (bval, x));
19004 }
19005
19006 /* Emit a barrier that is appropriate for memory model MODEL at the end of a
19007 sequence implementing an atomic operation. */
19008
19009 static void
19010 aarch64_emit_post_barrier (enum memmodel model)
19011 {
19012 const enum memmodel base_model = memmodel_base (model);
19013
19014 if (is_mm_sync (model)
19015 && (base_model == MEMMODEL_ACQUIRE
19016 || base_model == MEMMODEL_ACQ_REL
19017 || base_model == MEMMODEL_SEQ_CST))
19018 {
19019 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
19020 }
19021 }
19022
19023 /* Split a compare and swap pattern. */
19024
19025 void
19026 aarch64_split_compare_and_swap (rtx operands[])
19027 {
19028 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19029 gcc_assert (epilogue_completed);
19030
19031 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
19032 machine_mode mode;
19033 bool is_weak;
19034 rtx_code_label *label1, *label2;
19035 enum memmodel model;
19036
19037 rval = operands[0];
19038 mem = operands[1];
19039 oldval = operands[2];
19040 newval = operands[3];
19041 is_weak = (operands[4] != const0_rtx);
19042 model_rtx = operands[5];
19043 scratch = operands[7];
19044 mode = GET_MODE (mem);
19045 model = memmodel_from_int (INTVAL (model_rtx));
19046
19047 /* When OLDVAL is zero and we want the strong version we can emit a tighter
19048 loop:
19049 .label1:
19050 LD[A]XR rval, [mem]
19051 CBNZ rval, .label2
19052 ST[L]XR scratch, newval, [mem]
19053 CBNZ scratch, .label1
19054 .label2:
19055 CMP rval, 0. */
19056 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
19057 oldval == const0_rtx && mode != TImode);
19058
19059 label1 = NULL;
19060 if (!is_weak)
19061 {
19062 label1 = gen_label_rtx ();
19063 emit_label (label1);
19064 }
19065 label2 = gen_label_rtx ();
19066
19067 /* The initial load can be relaxed for a __sync operation since a final
19068 barrier will be emitted to stop code hoisting. */
19069 if (is_mm_sync (model))
19070 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
19071 else
19072 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
19073
19074 if (strong_zero_p)
19075 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
19076 else
19077 {
19078 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19079 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
19080 }
19081 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19082 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
19083 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19084
19085 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
19086
19087 if (!is_weak)
19088 {
19089 if (aarch64_track_speculation)
19090 {
19091 /* Emit an explicit compare instruction, so that we can correctly
19092 track the condition codes. */
19093 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19094 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19095 }
19096 else
19097 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
19098
19099 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19100 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
19101 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19102 }
19103 else
19104 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19105
19106 emit_label (label2);
19107
19108 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
19109 to set the condition flags. If this is not used it will be removed by
19110 later passes. */
19111 if (strong_zero_p)
19112 aarch64_gen_compare_reg (NE, rval, const0_rtx);
19113
19114 /* Emit any final barrier needed for a __sync operation. */
19115 if (is_mm_sync (model))
19116 aarch64_emit_post_barrier (model);
19117 }
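
/* As a rough illustration, when OLDVAL is not known to be zero the split
   above produces a sequence along the lines of:

	.label1:
	  LD[A]XR	rval, [mem]
	  CMP		rval, oldval
	  B.NE		.label2
	  ST[L]XR	scratch, newval, [mem]
	  CBNZ		scratch, .label1
	.label2:

   where the CBNZ retry branch is present only for the strong version; the
   weak version simply reports the ST[L]XR result instead of retrying.  The
   exact instructions depend on the mode and memory model.  */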
19118
19119 /* Split an atomic operation. */
19120
19121 void
19122 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
19123 rtx value, rtx model_rtx, rtx cond)
19124 {
19125 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19126 gcc_assert (epilogue_completed);
19127
19128 machine_mode mode = GET_MODE (mem);
19129 machine_mode wmode = (mode == DImode ? DImode : SImode);
19130 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
19131 const bool is_sync = is_mm_sync (model);
19132 rtx_code_label *label;
19133 rtx x;
19134
19135 /* Split the atomic operation into a sequence. */
19136 label = gen_label_rtx ();
19137 emit_label (label);
19138
19139 if (new_out)
19140 new_out = gen_lowpart (wmode, new_out);
19141 if (old_out)
19142 old_out = gen_lowpart (wmode, old_out);
19143 else
19144 old_out = new_out;
19145 value = simplify_gen_subreg (wmode, value, mode, 0);
19146
19147 /* The initial load can be relaxed for a __sync operation since a final
19148 barrier will be emitted to stop code hoisting. */
19149 if (is_sync)
19150 aarch64_emit_load_exclusive (mode, old_out, mem,
19151 GEN_INT (MEMMODEL_RELAXED));
19152 else
19153 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
19154
19155 switch (code)
19156 {
19157 case SET:
19158 new_out = value;
19159 break;
19160
19161 case NOT:
19162 x = gen_rtx_AND (wmode, old_out, value);
19163 emit_insn (gen_rtx_SET (new_out, x));
19164 x = gen_rtx_NOT (wmode, new_out);
19165 emit_insn (gen_rtx_SET (new_out, x));
19166 break;
19167
19168 case MINUS:
19169 if (CONST_INT_P (value))
19170 {
19171 value = GEN_INT (-INTVAL (value));
19172 code = PLUS;
19173 }
19174 /* Fall through. */
19175
19176 default:
19177 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
19178 emit_insn (gen_rtx_SET (new_out, x));
19179 break;
19180 }
19181
19182 aarch64_emit_store_exclusive (mode, cond, mem,
19183 gen_lowpart (mode, new_out), model_rtx);
19184
19185 if (aarch64_track_speculation)
19186 {
19187 /* Emit an explicit compare instruction, so that we can correctly
19188 track the condition codes. */
19189 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
19190 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19191 }
19192 else
19193 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
19194
19195 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19196 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
19197 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19198
19199 /* Emit any final barrier needed for a __sync operation. */
19200 if (is_sync)
19201 aarch64_emit_post_barrier (model);
19202 }
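
/* As a rough illustration, splitting an atomic fetch-and-add (CODE == PLUS)
   produces a loop of this general shape:

	.loop:
	  LD[A]XR	old_out, [mem]
	  ADD		new_out, old_out, value
	  ST[L]XR	cond, new_out, [mem]
	  CBNZ		cond, .loop

   followed, for __sync operations, by the trailing barrier emitted in
   aarch64_emit_post_barrier.  */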
19203
19204 static void
19205 aarch64_init_libfuncs (void)
19206 {
19207 /* Half-precision float operations. The compiler handles all operations
19208 with NULL libfuncs by converting to SFmode. */
19209
19210 /* Conversions. */
19211 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
19212 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
19213
19214 /* Arithmetic. */
19215 set_optab_libfunc (add_optab, HFmode, NULL);
19216 set_optab_libfunc (sdiv_optab, HFmode, NULL);
19217 set_optab_libfunc (smul_optab, HFmode, NULL);
19218 set_optab_libfunc (neg_optab, HFmode, NULL);
19219 set_optab_libfunc (sub_optab, HFmode, NULL);
19220
19221 /* Comparisons. */
19222 set_optab_libfunc (eq_optab, HFmode, NULL);
19223 set_optab_libfunc (ne_optab, HFmode, NULL);
19224 set_optab_libfunc (lt_optab, HFmode, NULL);
19225 set_optab_libfunc (le_optab, HFmode, NULL);
19226 set_optab_libfunc (ge_optab, HFmode, NULL);
19227 set_optab_libfunc (gt_optab, HFmode, NULL);
19228 set_optab_libfunc (unord_optab, HFmode, NULL);
19229 }
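
/* For illustration, with the NULL libfuncs above a half-precision addition
   such as

	__fp16 add_hf (__fp16 a, __fp16 b) { return a + b; }

   is expected to widen both operands via __gnu_h2f_ieee, add in SFmode and
   narrow the result via __gnu_f2h_ieee, unless TARGET_FP_F16INST provides
   native HFmode instructions.  */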
19230
19231 /* Target hook for c_mode_for_suffix. */
19232 static machine_mode
19233 aarch64_c_mode_for_suffix (char suffix)
19234 {
19235 if (suffix == 'q')
19236 return TFmode;
19237
19238 return VOIDmode;
19239 }
19240
19241 /* We can only represent floating point constants which will fit in
19242 "quarter-precision" values. These values are characterised by
19243 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
19244 by:
19245
19246 (-1)^s * (n/16) * 2^r
19247
19248 Where:
19249 's' is the sign bit.
19250 'n' is an integer in the range 16 <= n <= 31.
19251 'r' is an integer in the range -3 <= r <= 4. */
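
/* For example, values that satisfy these constraints include:

	 0.25 = (+1) * (16/16) * 2^-2	(s = 0, n = 16, r = -2)
	 1.75 = (+1) * (28/16) * 2^0	(s = 0, n = 28, r =  0)
	-31.0 = (-1) * (31/16) * 2^4	(s = 1, n = 31, r =  4)

   whereas a value such as 0.1 has no exact (n, r) pair in these ranges and
   is therefore not representable.  */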
19252
19253 /* Return true iff X can be represented by a quarter-precision
19254 floating point immediate operand. Note that we cannot represent 0.0. */
19255 bool
19256 aarch64_float_const_representable_p (rtx x)
19257 {
19258 /* This represents our current view of how many bits
19259 make up the mantissa. */
19260 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
19261 int exponent;
19262 unsigned HOST_WIDE_INT mantissa, mask;
19263 REAL_VALUE_TYPE r, m;
19264 bool fail;
19265
19266 x = unwrap_const_vec_duplicate (x);
19267 if (!CONST_DOUBLE_P (x))
19268 return false;
19269
19270 if (GET_MODE (x) == VOIDmode
19271 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
19272 return false;
19273
19274 r = *CONST_DOUBLE_REAL_VALUE (x);
19275
19276 /* We cannot represent infinities, NaNs or +/-zero. We won't
19277 know if we have +zero until we analyse the mantissa, but we
19278 can reject the other invalid values. */
19279 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
19280 || REAL_VALUE_MINUS_ZERO (r))
19281 return false;
19282
19283 /* Extract exponent. */
19284 r = real_value_abs (&r);
19285 exponent = REAL_EXP (&r);
19286
19287 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
19288 highest (sign) bit, with a fixed binary point at bit point_pos.
19289 The low half of W holds the low part of the mantissa, the high half the high part.
19290 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
19291 bits for the mantissa, this can fail (low bits will be lost). */
19292 real_ldexp (&m, &r, point_pos - exponent);
19293 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
19294
19295 /* If the low part of the mantissa has bits set we cannot represent
19296 the value. */
19297 if (w.ulow () != 0)
19298 return false;
19299 /* We have rejected the lower HOST_WIDE_INT, so update our
19300 understanding of how many bits lie in the mantissa and
19301 look only at the high HOST_WIDE_INT. */
19302 mantissa = w.elt (1);
19303 point_pos -= HOST_BITS_PER_WIDE_INT;
19304
19305 /* We can only represent values with a mantissa of the form 1.xxxx. */
19306 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
19307 if ((mantissa & mask) != 0)
19308 return false;
19309
19310 /* Having filtered unrepresentable values, we may now remove all
19311 but the highest 5 bits. */
19312 mantissa >>= point_pos - 5;
19313
19314 /* We cannot represent the value 0.0, so reject it. This is handled
19315 elsewhere. */
19316 if (mantissa == 0)
19317 return false;
19318
19319 /* Then, as bit 4 is always set, we can mask it off, leaving
19320 the mantissa in the range [0, 15]. */
19321 mantissa &= ~(1 << 4);
19322 gcc_assert (mantissa <= 15);
19323
19324 /* GCC internally does not use IEEE754-like encoding (where normalized
19325 significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
19326 Our mantissa values are shifted 4 places to the left relative to
19327 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
19328 by 5 places to correct for GCC's representation. */
19329 exponent = 5 - exponent;
19330
19331 return (exponent >= 0 && exponent <= 7);
19332 }
19333
19334 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
19335 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
19336 output MOVI/MVNI, ORR or BIC immediate. */
19337 char*
19338 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
19339 enum simd_immediate_check which)
19340 {
19341 bool is_valid;
19342 static char templ[40];
19343 const char *mnemonic;
19344 const char *shift_op;
19345 unsigned int lane_count = 0;
19346 char element_char;
19347
19348 struct simd_immediate_info info;
19349
19350 /* This will return true to show const_vector is legal for use as either
19351 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
19352 It will also update INFO to show how the immediate should be generated.
19353 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
19354 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
19355 gcc_assert (is_valid);
19356
19357 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19358 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
19359
19360 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
19361 {
19362 gcc_assert (info.insn == simd_immediate_info::MOV
19363 && info.u.mov.shift == 0);
19364 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
19365 move immediate path. */
19366 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
19367 info.u.mov.value = GEN_INT (0);
19368 else
19369 {
19370 const unsigned int buf_size = 20;
19371 char float_buf[buf_size] = {'\0'};
19372 real_to_decimal_for_mode (float_buf,
19373 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
19374 buf_size, buf_size, 1, info.elt_mode);
19375
19376 if (lane_count == 1)
19377 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
19378 else
19379 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
19380 lane_count, element_char, float_buf);
19381 return templ;
19382 }
19383 }
19384
19385 gcc_assert (CONST_INT_P (info.u.mov.value));
19386
19387 if (which == AARCH64_CHECK_MOV)
19388 {
19389 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
19390 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
19391 ? "msl" : "lsl");
19392 if (lane_count == 1)
19393 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
19394 mnemonic, UINTVAL (info.u.mov.value));
19395 else if (info.u.mov.shift)
19396 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
19397 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
19398 element_char, UINTVAL (info.u.mov.value), shift_op,
19399 info.u.mov.shift);
19400 else
19401 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
19402 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
19403 element_char, UINTVAL (info.u.mov.value));
19404 }
19405 else
19406 {
19407 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
19408 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
19409 if (info.u.mov.shift)
19410 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
19411 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
19412 element_char, UINTVAL (info.u.mov.value), "lsl",
19413 info.u.mov.shift);
19414 else
19415 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
19416 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
19417 element_char, UINTVAL (info.u.mov.value));
19418 }
19419 return templ;
19420 }
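
/* For illustration, for a V16QI constant with every byte equal to 0x12 and
   WHICH == AARCH64_CHECK_MOV, assuming aarch64_simd_valid_immediate reports
   8-bit elements with no shift, the template produced above would be along
   the lines of "movi\t%0.16b, 0x12"; the actual mnemonic, element size and
   shift depend on how the immediate is classified.  */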
19421
19422 char*
19423 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
19424 {
19425
19426 /* If a floating point number was passed and we desire to use it in an
19427 integer mode, do the conversion to integer. */
19428 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
19429 {
19430 unsigned HOST_WIDE_INT ival;
19431 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
19432 gcc_unreachable ();
19433 immediate = gen_int_mode (ival, mode);
19434 }
19435
19436 machine_mode vmode;
19437 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
19438 a 128-bit vector mode. */
19439 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
19440
19441 vmode = aarch64_simd_container_mode (mode, width);
19442 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
19443 return aarch64_output_simd_mov_immediate (v_op, width);
19444 }
19445
19446 /* Return the output string to use for moving immediate CONST_VECTOR
19447 into an SVE register. */
19448
19449 char *
19450 aarch64_output_sve_mov_immediate (rtx const_vector)
19451 {
19452 static char templ[40];
19453 struct simd_immediate_info info;
19454 char element_char;
19455
19456 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
19457 gcc_assert (is_valid);
19458
19459 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19460
19461 machine_mode vec_mode = GET_MODE (const_vector);
19462 if (aarch64_sve_pred_mode_p (vec_mode))
19463 {
19464 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
19465 if (info.insn == simd_immediate_info::MOV)
19466 {
19467 gcc_assert (info.u.mov.value == const0_rtx);
19468 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
19469 }
19470 else
19471 {
19472 gcc_assert (info.insn == simd_immediate_info::PTRUE);
19473 unsigned int total_bytes;
19474 if (info.u.pattern == AARCH64_SV_ALL
19475 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
19476 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
19477 total_bytes / GET_MODE_SIZE (info.elt_mode));
19478 else
19479 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
19480 svpattern_token (info.u.pattern));
19481 }
19482 return buf;
19483 }
19484
19485 if (info.insn == simd_immediate_info::INDEX)
19486 {
19487 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
19488 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
19489 element_char, INTVAL (info.u.index.base),
19490 INTVAL (info.u.index.step));
19491 return templ;
19492 }
19493
19494 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
19495 {
19496 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
19497 info.u.mov.value = GEN_INT (0);
19498 else
19499 {
19500 const int buf_size = 20;
19501 char float_buf[buf_size] = {};
19502 real_to_decimal_for_mode (float_buf,
19503 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
19504 buf_size, buf_size, 1, info.elt_mode);
19505
19506 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
19507 element_char, float_buf);
19508 return templ;
19509 }
19510 }
19511
19512 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
19513 element_char, INTVAL (info.u.mov.value));
19514 return templ;
19515 }
19516
19517 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
19518 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
19519 pattern. */
19520
19521 char *
19522 aarch64_output_sve_ptrues (rtx const_unspec)
19523 {
19524 static char templ[40];
19525
19526 struct simd_immediate_info info;
19527 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
19528 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
19529
19530 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19531 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
19532 svpattern_token (info.u.pattern));
19533 return templ;
19534 }
19535
19536 /* Split operands into moves from op[1] + op[2] into op[0]. */
19537
19538 void
19539 aarch64_split_combinev16qi (rtx operands[3])
19540 {
19541 unsigned int dest = REGNO (operands[0]);
19542 unsigned int src1 = REGNO (operands[1]);
19543 unsigned int src2 = REGNO (operands[2]);
19544 machine_mode halfmode = GET_MODE (operands[1]);
19545 unsigned int halfregs = REG_NREGS (operands[1]);
19546 rtx destlo, desthi;
19547
19548 gcc_assert (halfmode == V16QImode);
19549
19550 if (src1 == dest && src2 == dest + halfregs)
19551 {
19552 /* No-op move. Can't split to nothing; emit something. */
19553 emit_note (NOTE_INSN_DELETED);
19554 return;
19555 }
19556
19557 /* Preserve register attributes for variable tracking. */
19558 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
19559 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
19560 GET_MODE_SIZE (halfmode));
19561
19562 /* Special case of reversed high/low parts: swap the inputs with three EORs, avoiding a scratch register. */
19563 if (reg_overlap_mentioned_p (operands[2], destlo)
19564 && reg_overlap_mentioned_p (operands[1], desthi))
19565 {
19566 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
19567 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
19568 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
19569 }
19570 else if (!reg_overlap_mentioned_p (operands[2], destlo))
19571 {
19572 /* Try to avoid unnecessary moves if part of the result
19573 is in the right place already. */
19574 if (src1 != dest)
19575 emit_move_insn (destlo, operands[1]);
19576 if (src2 != dest + halfregs)
19577 emit_move_insn (desthi, operands[2]);
19578 }
19579 else
19580 {
19581 if (src2 != dest + halfregs)
19582 emit_move_insn (desthi, operands[2]);
19583 if (src1 != dest)
19584 emit_move_insn (destlo, operands[1]);
19585 }
19586 }
19587
19588 /* vec_perm support. */
19589
19590 struct expand_vec_perm_d
19591 {
19592 rtx target, op0, op1;
19593 vec_perm_indices perm;
19594 machine_mode vmode;
19595 unsigned int vec_flags;
19596 bool one_vector_p;
19597 bool testing_p;
19598 };
19599
19600 /* Generate a variable permutation. */
19601
19602 static void
19603 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
19604 {
19605 machine_mode vmode = GET_MODE (target);
19606 bool one_vector_p = rtx_equal_p (op0, op1);
19607
19608 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
19609 gcc_checking_assert (GET_MODE (op0) == vmode);
19610 gcc_checking_assert (GET_MODE (op1) == vmode);
19611 gcc_checking_assert (GET_MODE (sel) == vmode);
19612 gcc_checking_assert (TARGET_SIMD);
19613
19614 if (one_vector_p)
19615 {
19616 if (vmode == V8QImode)
19617 {
19618 /* Expand the argument to a V16QI mode by duplicating it. */
19619 rtx pair = gen_reg_rtx (V16QImode);
19620 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
19621 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
19622 }
19623 else
19624 {
19625 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
19626 }
19627 }
19628 else
19629 {
19630 rtx pair;
19631
19632 if (vmode == V8QImode)
19633 {
19634 pair = gen_reg_rtx (V16QImode);
19635 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
19636 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
19637 }
19638 else
19639 {
19640 pair = gen_reg_rtx (OImode);
19641 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
19642 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
19643 }
19644 }
19645 }
19646
19647 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
19648 NELT is the number of elements in the vector. */
19649
19650 void
19651 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
19652 unsigned int nelt)
19653 {
19654 machine_mode vmode = GET_MODE (target);
19655 bool one_vector_p = rtx_equal_p (op0, op1);
19656 rtx mask;
19657
19658 /* The TBL instruction does not use a modulo index, so we must take care
19659 of that ourselves. */
19660 mask = aarch64_simd_gen_const_vector_dup (vmode,
19661 one_vector_p ? nelt - 1 : 2 * nelt - 1);
19662 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
19663
19664 /* For big-endian, we also need to reverse the index within the vector
19665 (but not which vector). */
19666 if (BYTES_BIG_ENDIAN)
19667 {
19668 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
19669 if (!one_vector_p)
19670 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
19671 sel = expand_simple_binop (vmode, XOR, sel, mask,
19672 NULL, 0, OPTAB_LIB_WIDEN);
19673 }
19674 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
19675 }
19676
19677 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
19678
19679 static void
19680 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
19681 {
19682 emit_insn (gen_rtx_SET (target,
19683 gen_rtx_UNSPEC (GET_MODE (target),
19684 gen_rtvec (2, op0, op1), code)));
19685 }
19686
19687 /* Expand an SVE vec_perm with the given operands. */
19688
19689 void
19690 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
19691 {
19692 machine_mode data_mode = GET_MODE (target);
19693 machine_mode sel_mode = GET_MODE (sel);
19694 /* Enforced by the pattern condition. */
19695 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
19696
19697 /* Note: vec_perm indices are supposed to wrap when they go beyond the
19698 size of the two value vectors, i.e. the upper bits of the indices
19699 are effectively ignored. SVE TBL instead produces 0 for any
19700 out-of-range indices, so we need to wrap the vec_perm indices
19701 explicitly to ensure they are all in range. */
19702 rtx sel_reg = force_reg (sel_mode, sel);
19703
19704 /* Check if the sel only references the first values vector. */
19705 if (GET_CODE (sel) == CONST_VECTOR
19706 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
19707 {
19708 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
19709 return;
19710 }
19711
19712 /* Check if the two values vectors are the same. */
19713 if (rtx_equal_p (op0, op1))
19714 {
19715 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
19716 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19717 NULL, 0, OPTAB_DIRECT);
19718 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
19719 return;
19720 }
19721
19722 /* Run a TBL on each value vector and combine the results. */
19723
19724 rtx res0 = gen_reg_rtx (data_mode);
19725 rtx res1 = gen_reg_rtx (data_mode);
19726 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
19727 if (GET_CODE (sel) != CONST_VECTOR
19728 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
19729 {
19730 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
19731 2 * nunits - 1);
19732 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19733 NULL, 0, OPTAB_DIRECT);
19734 }
19735 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
19736 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
19737 NULL, 0, OPTAB_DIRECT);
19738 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
19739 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
19740 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
19741 else
19742 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
19743 }
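
/* As a worked example, assume a 128-bit vector length, so that SEL_MODE has
   four elements (NUNITS == 4), and sel = { 0, 5, 2, 7 }.  Then:

	res0   = TBL (op0, sel)      = { op0[0], 0, op0[2], 0 }
	res1   = TBL (op1, sel - 4)  = { 0, op1[1], 0, op1[3] }
	target = res0 | res1	     = { op0[0], op1[1], op0[2], op1[3] }

   The subtraction wraps indices that referred to OP0 to large out-of-range
   values, which SVE TBL maps to zero, so the final OR merges the two
   results cleanly.  */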
19744
19745 /* Recognize patterns suitable for the TRN instructions. */
19746 static bool
19747 aarch64_evpc_trn (struct expand_vec_perm_d *d)
19748 {
19749 HOST_WIDE_INT odd;
19750 poly_uint64 nelt = d->perm.length ();
19751 rtx out, in0, in1, x;
19752 machine_mode vmode = d->vmode;
19753
19754 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19755 return false;
19756
19757 /* Note that these are little-endian tests.
19758 We correct for big-endian later. */
19759 if (!d->perm[0].is_constant (&odd)
19760 || (odd != 0 && odd != 1)
19761 || !d->perm.series_p (0, 2, odd, 2)
19762 || !d->perm.series_p (1, 2, nelt + odd, 2))
19763 return false;
19764
19765 /* Success! */
19766 if (d->testing_p)
19767 return true;
19768
19769 in0 = d->op0;
19770 in1 = d->op1;
19771 /* We don't need a big-endian lane correction for SVE; see the comment
19772 at the head of aarch64-sve.md for details. */
19773 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19774 {
19775 x = in0, in0 = in1, in1 = x;
19776 odd = !odd;
19777 }
19778 out = d->target;
19779
19780 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19781 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
19782 return true;
19783 }
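
/* For example, with two V4SI inputs the index vectors accepted above are
   { 0, 4, 2, 6 } for TRN1 (odd == 0) and { 1, 5, 3, 7 } for TRN2
   (odd == 1), i.e. the even-numbered or odd-numbered elements of the two
   inputs interleaved.  */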
19784
19785 /* Recognize patterns suitable for the UZP instructions. */
19786 static bool
19787 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
19788 {
19789 HOST_WIDE_INT odd;
19790 rtx out, in0, in1, x;
19791 machine_mode vmode = d->vmode;
19792
19793 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19794 return false;
19795
19796 /* Note that these are little-endian tests.
19797 We correct for big-endian later. */
19798 if (!d->perm[0].is_constant (&odd)
19799 || (odd != 0 && odd != 1)
19800 || !d->perm.series_p (0, 1, odd, 2))
19801 return false;
19802
19803 /* Success! */
19804 if (d->testing_p)
19805 return true;
19806
19807 in0 = d->op0;
19808 in1 = d->op1;
19809 /* We don't need a big-endian lane correction for SVE; see the comment
19810 at the head of aarch64-sve.md for details. */
19811 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19812 {
19813 x = in0, in0 = in1, in1 = x;
19814 odd = !odd;
19815 }
19816 out = d->target;
19817
19818 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19819 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
19820 return true;
19821 }
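
/* For example, with two V4SI inputs the index vectors accepted above are
   { 0, 2, 4, 6 } for UZP1 (odd == 0) and { 1, 3, 5, 7 } for UZP2
   (odd == 1), i.e. the even-numbered or odd-numbered elements of the
   concatenation of the two inputs.  */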
19822
19823 /* Recognize patterns suitable for the ZIP instructions. */
19824 static bool
19825 aarch64_evpc_zip (struct expand_vec_perm_d *d)
19826 {
19827 unsigned int high;
19828 poly_uint64 nelt = d->perm.length ();
19829 rtx out, in0, in1, x;
19830 machine_mode vmode = d->vmode;
19831
19832 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19833 return false;
19834
19835 /* Note that these are little-endian tests.
19836 We correct for big-endian later. */
19837 poly_uint64 first = d->perm[0];
19838 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
19839 || !d->perm.series_p (0, 2, first, 1)
19840 || !d->perm.series_p (1, 2, first + nelt, 1))
19841 return false;
19842 high = maybe_ne (first, 0U);
19843
19844 /* Success! */
19845 if (d->testing_p)
19846 return true;
19847
19848 in0 = d->op0;
19849 in1 = d->op1;
19850 /* We don't need a big-endian lane correction for SVE; see the comment
19851 at the head of aarch64-sve.md for details. */
19852 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19853 {
19854 x = in0, in0 = in1, in1 = x;
19855 high = !high;
19856 }
19857 out = d->target;
19858
19859 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19860 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
19861 return true;
19862 }
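
/* For example, with two V4SI inputs the index vectors accepted above are
   { 0, 4, 1, 5 } for ZIP1 (first == 0) and { 2, 6, 3, 7 } for ZIP2
   (first == nelt / 2), i.e. the low or high halves of the two inputs
   interleaved element by element.  */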
19863
19864 /* Recognize patterns for the EXT insn. */
19865
19866 static bool
19867 aarch64_evpc_ext (struct expand_vec_perm_d *d)
19868 {
19869 HOST_WIDE_INT location;
19870 rtx offset;
19871
19872 /* The first element always refers to the first vector.
19873 Check if the extracted indices are increasing by one. */
19874 if (d->vec_flags == VEC_SVE_PRED
19875 || !d->perm[0].is_constant (&location)
19876 || !d->perm.series_p (0, 1, location, 1))
19877 return false;
19878
19879 /* Success! */
19880 if (d->testing_p)
19881 return true;
19882
19883 /* The case where (location == 0) is a no-op for both big- and little-endian,
19884 and is removed by the mid-end at optimization levels -O1 and higher.
19885
19886 We don't need a big-endian lane correction for SVE; see the comment
19887 at the head of aarch64-sve.md for details. */
19888 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
19889 {
19890 /* After setup, we want the high elements of the first vector (stored
19891 at the LSB end of the register), and the low elements of the second
19892 vector (stored at the MSB end of the register). So swap. */
19893 std::swap (d->op0, d->op1);
19894 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
19895 to_constant () is safe since this is restricted to Advanced SIMD
19896 vectors. */
19897 location = d->perm.length ().to_constant () - location;
19898 }
19899
19900 offset = GEN_INT (location);
19901 emit_set_insn (d->target,
19902 gen_rtx_UNSPEC (d->vmode,
19903 gen_rtvec (3, d->op0, d->op1, offset),
19904 UNSPEC_EXT));
19905 return true;
19906 }
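
/* For example, with two V4SI inputs a permutation of { 1, 2, 3, 4 } has
   LOCATION == 1, so the EXT above extracts elements 1..3 of the first
   vector followed by element 0 of the second, i.e. the concatenation
   shifted down by one element (four bytes).  */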
19907
19908 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
19909 within each 64-bit, 32-bit or 16-bit granule. */
19910
19911 static bool
19912 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
19913 {
19914 HOST_WIDE_INT diff;
19915 unsigned int i, size, unspec;
19916 machine_mode pred_mode;
19917
19918 if (d->vec_flags == VEC_SVE_PRED
19919 || !d->one_vector_p
19920 || !d->perm[0].is_constant (&diff))
19921 return false;
19922
19923 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
19924 if (size == 8)
19925 {
19926 unspec = UNSPEC_REV64;
19927 pred_mode = VNx2BImode;
19928 }
19929 else if (size == 4)
19930 {
19931 unspec = UNSPEC_REV32;
19932 pred_mode = VNx4BImode;
19933 }
19934 else if (size == 2)
19935 {
19936 unspec = UNSPEC_REV16;
19937 pred_mode = VNx8BImode;
19938 }
19939 else
19940 return false;
19941
19942 unsigned int step = diff + 1;
19943 for (i = 0; i < step; ++i)
19944 if (!d->perm.series_p (i, step, diff - i, step))
19945 return false;
19946
19947 /* Success! */
19948 if (d->testing_p)
19949 return true;
19950
19951 if (d->vec_flags == VEC_SVE_DATA)
19952 {
19953 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
19954 rtx target = gen_reg_rtx (int_mode);
19955 if (BYTES_BIG_ENDIAN)
19956 /* The act of taking a subreg between INT_MODE and d->vmode
19957 is itself a reversing operation on big-endian targets;
19958 see the comment at the head of aarch64-sve.md for details.
19959 First reinterpret OP0 as INT_MODE without using a subreg
19960 and without changing the contents. */
19961 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
19962 else
19963 {
19964 /* For SVE we use REV[BHW] unspecs derived from the element size
19965 of d->vmode and vector modes whose elements have SIZE bytes.
19966 This ensures that the vector modes match the predicate modes. */
19967 int unspec = aarch64_sve_rev_unspec (d->vmode);
19968 rtx pred = aarch64_ptrue_reg (pred_mode);
19969 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
19970 gen_lowpart (int_mode, d->op0)));
19971 }
19972 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19973 return true;
19974 }
19975 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
19976 emit_set_insn (d->target, src);
19977 return true;
19978 }
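
/* For example, a single V16QI input with a permutation of
   { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, ..., 8 } has DIFF == 7, giving
   SIZE == 8, so the code above emits a REV64 that reverses the bytes
   within each 64-bit granule.  */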
19979
19980 /* Recognize patterns for the REV insn, which reverses elements within
19981 a full vector. */
19982
19983 static bool
19984 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
19985 {
19986 poly_uint64 nelt = d->perm.length ();
19987
19988 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
19989 return false;
19990
19991 if (!d->perm.series_p (0, 1, nelt - 1, -1))
19992 return false;
19993
19994 /* Success! */
19995 if (d->testing_p)
19996 return true;
19997
19998 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
19999 emit_set_insn (d->target, src);
20000 return true;
20001 }
20002
20003 static bool
20004 aarch64_evpc_dup (struct expand_vec_perm_d *d)
20005 {
20006 rtx out = d->target;
20007 rtx in0;
20008 HOST_WIDE_INT elt;
20009 machine_mode vmode = d->vmode;
20010 rtx lane;
20011
20012 if (d->vec_flags == VEC_SVE_PRED
20013 || d->perm.encoding ().encoded_nelts () != 1
20014 || !d->perm[0].is_constant (&elt))
20015 return false;
20016
20017 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
20018 return false;
20019
20020 /* Success! */
20021 if (d->testing_p)
20022 return true;
20023
20024 /* The generic preparation in aarch64_expand_vec_perm_const_1
20025 swaps the operand order and the permute indices if it finds
20026 d->perm[0] to be in the second operand. Thus, we can always
20027 use d->op0 and need not do any extra arithmetic to get the
20028 correct lane number. */
20029 in0 = d->op0;
20030 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
20031
20032 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
20033 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
20034 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
20035 return true;
20036 }
20037
20038 static bool
20039 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
20040 {
20041 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
20042 machine_mode vmode = d->vmode;
20043
20044 /* Make sure that the indices are constant. */
20045 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
20046 for (unsigned int i = 0; i < encoded_nelts; ++i)
20047 if (!d->perm[i].is_constant ())
20048 return false;
20049
20050 if (d->testing_p)
20051 return true;
20052
20053 /* Generic code will try constant permutation twice: once with the
20054 original mode and again with the elements lowered to QImode.
20055 So wait and don't do the selector expansion ourselves. */
20056 if (vmode != V8QImode && vmode != V16QImode)
20057 return false;
20058
20059 /* to_constant is safe since this routine is specific to Advanced SIMD
20060 vectors. */
20061 unsigned int nelt = d->perm.length ().to_constant ();
20062 for (unsigned int i = 0; i < nelt; ++i)
20063 /* If big-endian and two vectors we end up with a weird mixed-endian
20064 mode on NEON. Reverse the index within each word but not the word
20065 itself. to_constant is safe because we checked is_constant above. */
20066 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
20067 ? d->perm[i].to_constant () ^ (nelt - 1)
20068 : d->perm[i].to_constant ());
20069
20070 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
20071 sel = force_reg (vmode, sel);
20072
20073 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
20074 return true;
20075 }
20076
20077 /* Try to implement D using an SVE TBL instruction. */
20078
20079 static bool
20080 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
20081 {
20082 unsigned HOST_WIDE_INT nelt;
20083
20084 /* Permuting two variable-length vectors could overflow the
20085 index range. */
20086 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
20087 return false;
20088
20089 if (d->testing_p)
20090 return true;
20091
20092 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
20093 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
20094 if (d->one_vector_p)
20095 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
20096 else
20097 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
20098 return true;
20099 }
20100
20101 /* Try to implement D using SVE SEL instruction. */
20102
20103 static bool
20104 aarch64_evpc_sel (struct expand_vec_perm_d *d)
20105 {
20106 machine_mode vmode = d->vmode;
20107 int unit_size = GET_MODE_UNIT_SIZE (vmode);
20108
20109 if (d->vec_flags != VEC_SVE_DATA
20110 || unit_size > 8)
20111 return false;
20112
20113 int n_patterns = d->perm.encoding ().npatterns ();
20114 poly_int64 vec_len = d->perm.length ();
20115
20116 for (int i = 0; i < n_patterns; ++i)
20117 if (!known_eq (d->perm[i], i)
20118 && !known_eq (d->perm[i], vec_len + i))
20119 return false;
20120
20121 for (int i = n_patterns; i < n_patterns * 2; i++)
20122 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
20123 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
20124 return false;
20125
20126 if (d->testing_p)
20127 return true;
20128
20129 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
20130
20131 /* Build a predicate that is true when op0 elements should be used. */
20132 rtx_vector_builder builder (pred_mode, n_patterns, 2);
20133 for (int i = 0; i < n_patterns * 2; i++)
20134 {
20135 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
20136 : CONST0_RTX (BImode);
20137 builder.quick_push (elem);
20138 }
20139
20140 rtx const_vec = builder.build ();
20141 rtx pred = force_reg (pred_mode, const_vec);
20142 /* TARGET = PRED ? OP0 : OP1. */
20143 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
20144 return true;
20145 }
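
/* For example, a permutation whose encoding expands to
   { 0, VL + 1, 2, VL + 3, ... } (N_PATTERNS == 2) yields the predicate
   { 1, 0, 1, 0, ... }, so the permutation becomes a single SEL that takes
   even elements from OP0 and odd elements from OP1.  */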
20146
20147 static bool
20148 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
20149 {
20150 /* The pattern matching functions above are written to look for a small
20151 number to begin the sequence (0, 1, N/2). If we begin with an index
20152 from the second operand, we can swap the operands. */
20153 poly_int64 nelt = d->perm.length ();
20154 if (known_ge (d->perm[0], nelt))
20155 {
20156 d->perm.rotate_inputs (1);
20157 std::swap (d->op0, d->op1);
20158 }
20159
20160 if ((d->vec_flags == VEC_ADVSIMD
20161 || d->vec_flags == VEC_SVE_DATA
20162 || d->vec_flags == VEC_SVE_PRED)
20163 && known_gt (nelt, 1))
20164 {
20165 if (aarch64_evpc_rev_local (d))
20166 return true;
20167 else if (aarch64_evpc_rev_global (d))
20168 return true;
20169 else if (aarch64_evpc_ext (d))
20170 return true;
20171 else if (aarch64_evpc_dup (d))
20172 return true;
20173 else if (aarch64_evpc_zip (d))
20174 return true;
20175 else if (aarch64_evpc_uzp (d))
20176 return true;
20177 else if (aarch64_evpc_trn (d))
20178 return true;
20179 else if (aarch64_evpc_sel (d))
20180 return true;
20181 if (d->vec_flags == VEC_SVE_DATA)
20182 return aarch64_evpc_sve_tbl (d);
20183 else if (d->vec_flags == VEC_ADVSIMD)
20184 return aarch64_evpc_tbl (d);
20185 }
20186 return false;
20187 }
20188
20189 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
20190
20191 static bool
20192 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
20193 rtx op1, const vec_perm_indices &sel)
20194 {
20195 struct expand_vec_perm_d d;
20196
20197 /* Check whether the mask can be applied to a single vector. */
20198 if (sel.ninputs () == 1
20199 || (op0 && rtx_equal_p (op0, op1)))
20200 d.one_vector_p = true;
20201 else if (sel.all_from_input_p (0))
20202 {
20203 d.one_vector_p = true;
20204 op1 = op0;
20205 }
20206 else if (sel.all_from_input_p (1))
20207 {
20208 d.one_vector_p = true;
20209 op0 = op1;
20210 }
20211 else
20212 d.one_vector_p = false;
20213
20214 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
20215 sel.nelts_per_input ());
20216 d.vmode = vmode;
20217 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
20218 d.target = target;
20219 d.op0 = op0;
20220 d.op1 = op1;
20221 d.testing_p = !target;
20222
20223 if (!d.testing_p)
20224 return aarch64_expand_vec_perm_const_1 (&d);
20225
20226 rtx_insn *last = get_last_insn ();
20227 bool ret = aarch64_expand_vec_perm_const_1 (&d);
20228 gcc_assert (last == get_last_insn ());
20229
20230 return ret;
20231 }
20232
20233 /* Generate a byte permute mask for a register of mode MODE,
20234 which has NUNITS units. */
20235
20236 rtx
20237 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
20238 {
20239 /* We have to reverse each vector because we don't have
20240 a permuted load that can reverse-load according to ABI rules. */
20241 rtx mask;
20242 rtvec v = rtvec_alloc (16);
20243 unsigned int i, j;
20244 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
20245
20246 gcc_assert (BYTES_BIG_ENDIAN);
20247 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
20248
20249 for (i = 0; i < nunits; i++)
20250 for (j = 0; j < usize; j++)
20251 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
20252 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
20253 return force_reg (V16QImode, mask);
20254 }
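
/* For example, for MODE == V4SImode (NUNITS == 4, unit size 4) the byte
   permute mask built above is

	{ 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }

   i.e. the bytes within each 32-bit element are reversed while the order
   of the elements themselves is preserved.  */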
20255
20256 /* Expand an SVE integer comparison using the SVE equivalent of:
20257
20258 (set TARGET (CODE OP0 OP1)). */
20259
20260 void
20261 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
20262 {
20263 machine_mode pred_mode = GET_MODE (target);
20264 machine_mode data_mode = GET_MODE (op0);
20265 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
20266 op0, op1);
20267 if (!rtx_equal_p (target, res))
20268 emit_move_insn (target, res);
20269 }
20270
20271 /* Return the UNSPEC_COND_* code for comparison CODE. */
20272
20273 static unsigned int
20274 aarch64_unspec_cond_code (rtx_code code)
20275 {
20276 switch (code)
20277 {
20278 case NE:
20279 return UNSPEC_COND_FCMNE;
20280 case EQ:
20281 return UNSPEC_COND_FCMEQ;
20282 case LT:
20283 return UNSPEC_COND_FCMLT;
20284 case GT:
20285 return UNSPEC_COND_FCMGT;
20286 case LE:
20287 return UNSPEC_COND_FCMLE;
20288 case GE:
20289 return UNSPEC_COND_FCMGE;
20290 case UNORDERED:
20291 return UNSPEC_COND_FCMUO;
20292 default:
20293 gcc_unreachable ();
20294 }
20295 }
20296
20297 /* Emit:
20298
20299 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
20300
20301 where <X> is the operation associated with comparison CODE.
20302 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
20303
20304 static void
20305 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
20306 bool known_ptrue_p, rtx op0, rtx op1)
20307 {
20308 rtx flag = gen_int_mode (known_ptrue_p, SImode);
20309 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
20310 gen_rtvec (4, pred, flag, op0, op1),
20311 aarch64_unspec_cond_code (code));
20312 emit_set_insn (target, unspec);
20313 }
20314
20315 /* Emit the SVE equivalent of:
20316
20317 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
20318 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
20319 (set TARGET (ior:PRED_MODE TMP1 TMP2))
20320
20321 where <Xi> is the operation associated with comparison CODEi.
20322 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
20323
20324 static void
20325 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
20326 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
20327 {
20328 machine_mode pred_mode = GET_MODE (pred);
20329 rtx tmp1 = gen_reg_rtx (pred_mode);
20330 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
20331 rtx tmp2 = gen_reg_rtx (pred_mode);
20332 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
20333 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
20334 }
20335
20336 /* Emit the SVE equivalent of:
20337
20338 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
20339 (set TARGET (not TMP))
20340
20341 where <X> is the operation associated with comparison CODE.
20342 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
20343
20344 static void
20345 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
20346 bool known_ptrue_p, rtx op0, rtx op1)
20347 {
20348 machine_mode pred_mode = GET_MODE (pred);
20349 rtx tmp = gen_reg_rtx (pred_mode);
20350 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
20351 aarch64_emit_unop (target, one_cmpl_optab, tmp);
20352 }
20353
20354 /* Expand an SVE floating-point comparison using the SVE equivalent of:
20355
20356 (set TARGET (CODE OP0 OP1))
20357
20358 If CAN_INVERT_P is true, the caller can also handle inverted results;
20359 return true if the result is in fact inverted. */
20360
20361 bool
20362 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
20363 rtx op0, rtx op1, bool can_invert_p)
20364 {
20365 machine_mode pred_mode = GET_MODE (target);
20366 machine_mode data_mode = GET_MODE (op0);
20367
20368 rtx ptrue = aarch64_ptrue_reg (pred_mode);
20369 switch (code)
20370 {
20371 case UNORDERED:
20372 /* UNORDERED has no immediate form. */
20373 op1 = force_reg (data_mode, op1);
20374 /* fall through */
20375 case LT:
20376 case LE:
20377 case GT:
20378 case GE:
20379 case EQ:
20380 case NE:
20381 {
20382 /* There is native support for the comparison. */
20383 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
20384 return false;
20385 }
20386
20387 case LTGT:
20388 /* This is a trapping operation (LT or GT). */
20389 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
20390 return false;
20391
20392 case UNEQ:
20393 if (!flag_trapping_math)
20394 {
20395 /* This would trap for signaling NaNs. */
20396 op1 = force_reg (data_mode, op1);
20397 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
20398 ptrue, true, op0, op1);
20399 return false;
20400 }
20401 /* fall through */
20402 case UNLT:
20403 case UNLE:
20404 case UNGT:
20405 case UNGE:
20406 if (flag_trapping_math)
20407 {
20408 /* Work out which elements are ordered. */
20409 rtx ordered = gen_reg_rtx (pred_mode);
20410 op1 = force_reg (data_mode, op1);
20411 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
20412 ptrue, true, op0, op1);
20413
20414 /* Test the opposite condition for the ordered elements,
20415 then invert the result. */
20416 if (code == UNEQ)
20417 code = NE;
20418 else
20419 code = reverse_condition_maybe_unordered (code);
20420 if (can_invert_p)
20421 {
20422 aarch64_emit_sve_fp_cond (target, code,
20423 ordered, false, op0, op1);
20424 return true;
20425 }
20426 aarch64_emit_sve_invert_fp_cond (target, code,
20427 ordered, false, op0, op1);
20428 return false;
20429 }
20430 break;
20431
20432 case ORDERED:
20433 /* ORDERED has no immediate form. */
20434 op1 = force_reg (data_mode, op1);
20435 break;
20436
20437 default:
20438 gcc_unreachable ();
20439 }
20440
20441 /* There is native support for the inverse comparison. */
20442 code = reverse_condition_maybe_unordered (code);
20443 if (can_invert_p)
20444 {
20445 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
20446 return true;
20447 }
20448 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
20449 return false;
20450 }
20451
20452 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
20453 of the data being selected and CMP_MODE is the mode of the values being
20454 compared. */
20455
20456 void
20457 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
20458 rtx *ops)
20459 {
20460 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
20461 rtx pred = gen_reg_rtx (pred_mode);
20462 if (FLOAT_MODE_P (cmp_mode))
20463 {
20464 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
20465 ops[4], ops[5], true))
20466 std::swap (ops[1], ops[2]);
20467 }
20468 else
20469 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
20470
20471 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
20472 ops[1] = force_reg (data_mode, ops[1]);
20473 /* The "false" value can only be zero if the "true" value is a constant. */
20474 if (register_operand (ops[1], data_mode)
20475 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
20476 ops[2] = force_reg (data_mode, ops[2]);
20477
20478 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
20479 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
20480 }
20481
20482 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
20483 true. However, due to issues with register allocation it is preferable
20484 to avoid tying integer scalar and FP scalar modes. Executing integer
20485 operations in general registers is better than treating them as scalar
20486 vector operations. This reduces latency and avoids redundant int<->FP
20487 moves. So tie modes if they are either the same class, or vector modes
20488 with other vector modes, vector structs or any scalar mode. */
20489
20490 static bool
20491 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
20492 {
20493 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
20494 return true;
20495
20496 /* We specifically want to allow elements of "structure" modes to
20497 be tieable to the structure. This more general condition allows
20498 other rarer situations too. The reason we don't extend this to
20499 predicate modes is that there are no predicate structure modes
20500 nor any specific instructions for extracting part of a predicate
20501 register. */
20502 if (aarch64_vector_data_mode_p (mode1)
20503 && aarch64_vector_data_mode_p (mode2))
20504 return true;
20505
20506 /* Also allow any scalar modes with vectors. */
20507 if (aarch64_vector_mode_supported_p (mode1)
20508 || aarch64_vector_mode_supported_p (mode2))
20509 return true;
20510
20511 return false;
20512 }
20513
20514 /* Return a new RTX holding the result of moving POINTER forward by
20515 AMOUNT bytes. */
20516
20517 static rtx
20518 aarch64_move_pointer (rtx pointer, poly_int64 amount)
20519 {
20520 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
20521
20522 return adjust_automodify_address (pointer, GET_MODE (pointer),
20523 next, amount);
20524 }
20525
20526 /* Return a new RTX holding the result of moving POINTER forward by the
20527 size of the mode it points to. */
20528
20529 static rtx
20530 aarch64_progress_pointer (rtx pointer)
20531 {
20532 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
20533 }
20534
20535 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
20536 MODE bytes. */
20537
20538 static void
20539 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
20540 machine_mode mode)
20541 {
20542 rtx reg = gen_reg_rtx (mode);
20543
20544 /* "Cast" the pointers to the correct mode. */
20545 *src = adjust_address (*src, mode, 0);
20546 *dst = adjust_address (*dst, mode, 0);
20547 /* Emit the memcpy. */
20548 emit_move_insn (reg, *src);
20549 emit_move_insn (*dst, reg);
20550 /* Move the pointers forward. */
20551 *src = aarch64_progress_pointer (*src);
20552 *dst = aarch64_progress_pointer (*dst);
20553 }
20554
20555 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
20556 we succeed, otherwise return false. */
20557
20558 bool
20559 aarch64_expand_cpymem (rtx *operands)
20560 {
20561 int n, mode_bits;
20562 rtx dst = operands[0];
20563 rtx src = operands[1];
20564 rtx base;
20565 machine_mode cur_mode = BLKmode, next_mode;
20566 bool speed_p = !optimize_function_for_size_p (cfun);
20567
20568 /* When optimizing for size, give a better estimate of the length of a
20569 memcpy call, but use the default otherwise. Moves larger than 8 bytes
20570 will always require an even number of instructions, and each
20571 operation requires both a load and a store, so divide the max number by 2. */
20572 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
20573
20574 /* We can't do anything smart if the amount to copy is not constant. */
20575 if (!CONST_INT_P (operands[2]))
20576 return false;
20577
20578 n = INTVAL (operands[2]);
20579
20580 /* Try to keep the number of instructions low. For all cases we will do at
20581 most two moves for the residual amount, since we'll always overlap the
20582 remainder. */
20583 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
20584 return false;
20585
20586 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
20587 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
20588
20589 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
20590 src = adjust_automodify_address (src, VOIDmode, base, 0);
20591
20592 /* Convert n to bits to make the rest of the code simpler. */
20593 n = n * BITS_PER_UNIT;
20594
20595 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
20596 larger than TImode, but we should not use them for loads/stores here. */
20597 const int copy_limit = GET_MODE_BITSIZE (TImode);
20598
20599 while (n > 0)
20600 {
20601 /* Find the largest mode in which to do the copy without over-reading
20602 or over-writing. */
20603 opt_scalar_int_mode mode_iter;
20604 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
20605 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
20606 cur_mode = mode_iter.require ();
20607
20608 gcc_assert (cur_mode != BLKmode);
20609
20610 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
20611 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
20612
20613 n -= mode_bits;
20614
20615 /* Do certain trailing copies as overlapping if it's going to be
20616 cheaper, i.e. fewer instructions. For instance, for a 15-byte
20617 copy it's more efficient to do two overlapping 8-byte copies than
20618 copies of 8 + 6 + 1 bytes. */
20619 if (n > 0 && n <= 8 * BITS_PER_UNIT)
20620 {
20621 next_mode = smallest_mode_for_size (n, MODE_INT);
20622 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
20623 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
20624 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
20625 n = n_bits;
20626 }
20627 }
20628
20629 return true;
20630 }
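
/* As a worked example, a constant copy of 15 bytes (120 bits) first picks
   DImode (the widest integer mode not exceeding MIN (120, copy_limit))
   and copies bytes 0-7.  The remaining 7 bytes take the trailing-copy
   path: next_mode is DImode again, both pointers are moved back by one
   byte and bytes 7-14 are copied, so the whole copy needs only two
   overlapping 8-byte load/store pairs.  */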
20631
20632 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
20633 SImode stores. Handle the case when the constant has identical
20634 bottom and top halves. This is beneficial when the two stores can be
20635 merged into an STP and we avoid synthesising potentially expensive
20636 immediates twice. Return true if such a split is possible. */
20637
20638 bool
20639 aarch64_split_dimode_const_store (rtx dst, rtx src)
20640 {
20641 rtx lo = gen_lowpart (SImode, src);
20642 rtx hi = gen_highpart_mode (SImode, DImode, src);
20643
20644 bool size_p = optimize_function_for_size_p (cfun);
20645
20646 if (!rtx_equal_p (lo, hi))
20647 return false;
20648
20649 unsigned int orig_cost
20650 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
20651 unsigned int lo_cost
20652 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
20653
20654 /* We want to transform:
20655 MOV x1, 49370
20656 MOVK x1, 0x140, lsl 16
20657 MOVK x1, 0xc0da, lsl 32
20658 MOVK x1, 0x140, lsl 48
20659 STR x1, [x0]
20660 into:
20661 MOV w1, 49370
20662 MOVK w1, 0x140, lsl 16
20663 STP w1, w1, [x0]
20664 So we want to perform this only when we save two instructions
20665 or more. When optimizing for size, however, accept any code size
20666 savings we can. */
20667 if (size_p && orig_cost <= lo_cost)
20668 return false;
20669
20670 if (!size_p
20671 && (orig_cost <= lo_cost + 1))
20672 return false;
20673
20674 rtx mem_lo = adjust_address (dst, SImode, 0);
20675 if (!aarch64_mem_pair_operand (mem_lo, SImode))
20676 return false;
20677
20678 rtx tmp_reg = gen_reg_rtx (SImode);
20679 aarch64_expand_mov_immediate (tmp_reg, lo);
20680 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
20681 /* Don't emit an explicit store pair as this may not always be profitable.
20682 Let the sched-fusion logic decide whether to merge them. */
20683 emit_move_insn (mem_lo, tmp_reg);
20684 emit_move_insn (mem_hi, tmp_reg);
20685
20686 return true;
20687 }
20688
20689 /* Generate RTL for a conditional branch with rtx comparison CODE in
20690 mode CC_MODE. The destination of the unlikely conditional branch
20691 is LABEL_REF. */
20692
20693 void
20694 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
20695 rtx label_ref)
20696 {
20697 rtx x;
20698 x = gen_rtx_fmt_ee (code, VOIDmode,
20699 gen_rtx_REG (cc_mode, CC_REGNUM),
20700 const0_rtx);
20701
20702 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
20703 gen_rtx_LABEL_REF (VOIDmode, label_ref),
20704 pc_rtx);
20705 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
20706 }
20707
20708 /* Generate DImode scratch registers for 128-bit (TImode) addition.
20709
20710 OP1 represents the TImode destination operand 1
20711 OP2 represents the TImode destination operand 2
20712 LOW_DEST represents the low half (DImode) of TImode operand 0
20713 LOW_IN1 represents the low half (DImode) of TImode operand 1
20714 LOW_IN2 represents the low half (DImode) of TImode operand 2
20715 HIGH_DEST represents the high half (DImode) of TImode operand 0
20716 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20717 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20718
20719 void
20720 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20721 rtx *low_in1, rtx *low_in2,
20722 rtx *high_dest, rtx *high_in1,
20723 rtx *high_in2)
20724 {
20725 *low_dest = gen_reg_rtx (DImode);
20726 *low_in1 = gen_lowpart (DImode, op1);
20727 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20728 subreg_lowpart_offset (DImode, TImode));
20729 *high_dest = gen_reg_rtx (DImode);
20730 *high_in1 = gen_highpart (DImode, op1);
20731 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20732 subreg_highpart_offset (DImode, TImode));
20733 }
20734
20735 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
20736
20737 This function differs from 'aarch64_addti_scratch_regs' in that
20738 OP1 can be an immediate constant (zero). We must call
20739 subreg_highpart_offset with DImode and TImode arguments, otherwise
20740 VOIDmode will be used for the const_int, which generates an internal
20741 error from subreg_size_highpart_offset, which does not expect a size of zero.
20742
20743 OP1 represents the TImode destination operand 1
20744 OP2 represents the TImode destination operand 2
20745 LOW_DEST represents the low half (DImode) of TImode operand 0
20746 LOW_IN1 represents the low half (DImode) of TImode operand 1
20747 LOW_IN2 represents the low half (DImode) of TImode operand 2
20748 HIGH_DEST represents the high half (DImode) of TImode operand 0
20749 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20750 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20751
20752
20753 void
20754 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20755 rtx *low_in1, rtx *low_in2,
20756 rtx *high_dest, rtx *high_in1,
20757 rtx *high_in2)
20758 {
20759 *low_dest = gen_reg_rtx (DImode);
20760 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
20761 subreg_lowpart_offset (DImode, TImode));
20762
20763 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20764 subreg_lowpart_offset (DImode, TImode));
20765 *high_dest = gen_reg_rtx (DImode);
20766
20767 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
20768 subreg_highpart_offset (DImode, TImode));
20769 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20770 subreg_highpart_offset (DImode, TImode));
20771 }
20772
20773 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
20774
20775 OP0 represents the TImode destination operand 0
20776 LOW_DEST represents the low half (DImode) of TImode operand 0
20777 LOW_IN1 represents the low half (DImode) of TImode operand 1
20778 LOW_IN2 represents the low half (DImode) of TImode operand 2
20779 HIGH_DEST represents the high half (DImode) of TImode operand 0
20780 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20781 HIGH_IN2 represents the high half (DImode) of TImode operand 2
20782 UNSIGNED_P is true if the operation is being performed on unsigned
20783 values. */
20784 void
20785 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
20786 rtx low_in2, rtx high_dest, rtx high_in1,
20787 rtx high_in2, bool unsigned_p)
20788 {
20789 if (low_in2 == const0_rtx)
20790 {
20791 low_dest = low_in1;
20792 high_in2 = force_reg (DImode, high_in2);
20793 if (unsigned_p)
20794 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
20795 else
20796 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
20797 }
20798 else
20799 {
20800 if (aarch64_plus_immediate (low_in2, DImode))
20801 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
20802 GEN_INT (-INTVAL (low_in2))));
20803 else
20804 {
20805 low_in2 = force_reg (DImode, low_in2);
20806 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
20807 }
20808 high_in2 = force_reg (DImode, high_in2);
20809
20810 if (unsigned_p)
20811 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
20812 else
20813 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
20814 }
20815
20816 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
20817 emit_move_insn (gen_highpart (DImode, op0), high_dest);
20818
20819 }
20820
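/* As a rough sketch of the expected output (instruction selection may
   differ slightly): with all inputs in registers, the expansion above
   corresponds to something like

     subs  x_lo_dest, x_lo_in1, x_lo_in2   // low half, sets the borrow
     sbcs  x_hi_dest, x_hi_in1, x_hi_in2   // high half, consumes the borrow

   where the caller then tests C (unsigned) or V (signed) on the second
   instruction for overflow.  */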
20821 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
20822
20823 static unsigned HOST_WIDE_INT
20824 aarch64_asan_shadow_offset (void)
20825 {
20826 if (TARGET_ILP32)
20827 return (HOST_WIDE_INT_1 << 29);
20828 else
20829 return (HOST_WIDE_INT_1 << 36);
20830 }
20831
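/* Background sketch rather than a definition: AddressSanitizer maps an
   address to its shadow byte roughly as

     shadow = (addr >> ASAN_SHADOW_SHIFT) + aarch64_asan_shadow_offset ()

   (with ASAN_SHADOW_SHIFT normally 3 on this target), so the constants
   above place the shadow region at 1<<29 for ILP32 and 1<<36 for LP64.  */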
20832 static rtx
20833 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
20834 int code, tree treeop0, tree treeop1)
20835 {
20836 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20837 rtx op0, op1;
20838 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20839 insn_code icode;
20840 struct expand_operand ops[4];
20841
20842 start_sequence ();
20843 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20844
20845 op_mode = GET_MODE (op0);
20846 if (op_mode == VOIDmode)
20847 op_mode = GET_MODE (op1);
20848
20849 switch (op_mode)
20850 {
20851 case E_QImode:
20852 case E_HImode:
20853 case E_SImode:
20854 cmp_mode = SImode;
20855 icode = CODE_FOR_cmpsi;
20856 break;
20857
20858 case E_DImode:
20859 cmp_mode = DImode;
20860 icode = CODE_FOR_cmpdi;
20861 break;
20862
20863 case E_SFmode:
20864 cmp_mode = SFmode;
20865 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20866 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
20867 break;
20868
20869 case E_DFmode:
20870 cmp_mode = DFmode;
20871 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20872 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
20873 break;
20874
20875 default:
20876 end_sequence ();
20877 return NULL_RTX;
20878 }
20879
20880 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
20881 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
20882 if (!op0 || !op1)
20883 {
20884 end_sequence ();
20885 return NULL_RTX;
20886 }
20887 *prep_seq = get_insns ();
20888 end_sequence ();
20889
20890 create_fixed_operand (&ops[0], op0);
20891 create_fixed_operand (&ops[1], op1);
20892
20893 start_sequence ();
20894 if (!maybe_expand_insn (icode, 2, ops))
20895 {
20896 end_sequence ();
20897 return NULL_RTX;
20898 }
20899 *gen_seq = get_insns ();
20900 end_sequence ();
20901
20902 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
20903 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
20904 }
20905
20906 static rtx
20907 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
20908 int cmp_code, tree treeop0, tree treeop1, int bit_code)
20909 {
20910 rtx op0, op1, target;
20911 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20912 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20913 insn_code icode;
20914 struct expand_operand ops[6];
20915 int aarch64_cond;
20916
20917 push_to_sequence (*prep_seq);
20918 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20919
20920 op_mode = GET_MODE (op0);
20921 if (op_mode == VOIDmode)
20922 op_mode = GET_MODE (op1);
20923
20924 switch (op_mode)
20925 {
20926 case E_QImode:
20927 case E_HImode:
20928 case E_SImode:
20929 cmp_mode = SImode;
20930 break;
20931
20932 case E_DImode:
20933 cmp_mode = DImode;
20934 break;
20935
20936 case E_SFmode:
20937 cmp_mode = SFmode;
20938 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20939 break;
20940
20941 case E_DFmode:
20942 cmp_mode = DFmode;
20943 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20944 break;
20945
20946 default:
20947 end_sequence ();
20948 return NULL_RTX;
20949 }
20950
20951 icode = code_for_ccmp (cc_mode, cmp_mode);
20952
20953 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
20954 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
20955 if (!op0 || !op1)
20956 {
20957 end_sequence ();
20958 return NULL_RTX;
20959 }
20960 *prep_seq = get_insns ();
20961 end_sequence ();
20962
20963 target = gen_rtx_REG (cc_mode, CC_REGNUM);
20964 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
20965
20966 if (bit_code != AND)
20967 {
20968 /* Treat the ccmp patterns as canonical and use them where possible,
20969 but fall back to ccmp_rev patterns if there's no other option. */
20970 rtx_code prev_code = GET_CODE (prev);
20971 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
20972 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
20973 && !(prev_code == EQ
20974 || prev_code == NE
20975 || prev_code == ORDERED
20976 || prev_code == UNORDERED))
20977 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
20978 else
20979 {
20980 rtx_code code = reverse_condition (prev_code);
20981 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
20982 }
20983 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
20984 }
20985
20986 create_fixed_operand (&ops[0], XEXP (prev, 0));
20987 create_fixed_operand (&ops[1], target);
20988 create_fixed_operand (&ops[2], op0);
20989 create_fixed_operand (&ops[3], op1);
20990 create_fixed_operand (&ops[4], prev);
20991 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
20992
20993 push_to_sequence (*gen_seq);
20994 if (!maybe_expand_insn (icode, 6, ops))
20995 {
20996 end_sequence ();
20997 return NULL_RTX;
20998 }
20999
21000 *gen_seq = get_insns ();
21001 end_sequence ();
21002
21003 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
21004 }
21005
21006 #undef TARGET_GEN_CCMP_FIRST
21007 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
21008
21009 #undef TARGET_GEN_CCMP_NEXT
21010 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
21011
21012 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
21013 instruction fusion of some sort. */
21014
21015 static bool
21016 aarch64_macro_fusion_p (void)
21017 {
21018 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
21019 }
21020
21021
21022 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
21023 should be kept together during scheduling. */
21024
21025 static bool
21026 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
21027 {
21028 rtx set_dest;
21029 rtx prev_set = single_set (prev);
21030 rtx curr_set = single_set (curr);
21031 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
21032 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
21033
21034 if (!aarch64_macro_fusion_p ())
21035 return false;
21036
21037 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
21038 {
21039 /* We are trying to match:
21040 prev (mov) == (set (reg r0) (const_int imm16))
21041 curr (movk) == (set (zero_extract (reg r0)
21042 (const_int 16)
21043 (const_int 16))
21044 (const_int imm16_1)) */
21045
21046 set_dest = SET_DEST (curr_set);
21047
21048 if (GET_CODE (set_dest) == ZERO_EXTRACT
21049 && CONST_INT_P (SET_SRC (curr_set))
21050 && CONST_INT_P (SET_SRC (prev_set))
21051 && CONST_INT_P (XEXP (set_dest, 2))
21052 && INTVAL (XEXP (set_dest, 2)) == 16
21053 && REG_P (XEXP (set_dest, 0))
21054 && REG_P (SET_DEST (prev_set))
21055 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
21056 {
21057 return true;
21058 }
21059 }
21060
21061 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
21062 {
21063
21064 /* We're trying to match:
21065 prev (adrp) == (set (reg r1)
21066 (high (symbol_ref ("SYM"))))
21067 curr (add) == (set (reg r0)
21068 (lo_sum (reg r1)
21069 (symbol_ref ("SYM"))))
21070 Note that r0 need not necessarily be the same as r1, especially
21071 during pre-regalloc scheduling. */
21072
21073 if (satisfies_constraint_Ush (SET_SRC (prev_set))
21074 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
21075 {
21076 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
21077 && REG_P (XEXP (SET_SRC (curr_set), 0))
21078 && REGNO (XEXP (SET_SRC (curr_set), 0))
21079 == REGNO (SET_DEST (prev_set))
21080 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
21081 XEXP (SET_SRC (curr_set), 1)))
21082 return true;
21083 }
21084 }
21085
21086 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
21087 {
21088
21089 /* We're trying to match:
21090 prev (movk) == (set (zero_extract (reg r0)
21091 (const_int 16)
21092 (const_int 32))
21093 (const_int imm16_1))
21094 curr (movk) == (set (zero_extract (reg r0)
21095 (const_int 16)
21096 (const_int 48))
21097 (const_int imm16_2)) */
21098
21099 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
21100 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
21101 && REG_P (XEXP (SET_DEST (prev_set), 0))
21102 && REG_P (XEXP (SET_DEST (curr_set), 0))
21103 && REGNO (XEXP (SET_DEST (prev_set), 0))
21104 == REGNO (XEXP (SET_DEST (curr_set), 0))
21105 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
21106 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
21107 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
21108 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
21109 && CONST_INT_P (SET_SRC (prev_set))
21110 && CONST_INT_P (SET_SRC (curr_set)))
21111 return true;
21112
21113 }
21114 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
21115 {
21116 /* We're trying to match:
21117 prev (adrp) == (set (reg r0)
21118 (high (symbol_ref ("SYM"))))
21119 curr (ldr) == (set (reg r1)
21120 (mem (lo_sum (reg r0)
21121 (symbol_ref ("SYM")))))
21122 or
21123 curr (ldr) == (set (reg r1)
21124 (zero_extend (mem
21125 (lo_sum (reg r0)
21126 (symbol_ref ("SYM")))))) */
21127 if (satisfies_constraint_Ush (SET_SRC (prev_set))
21128 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
21129 {
21130 rtx curr_src = SET_SRC (curr_set);
21131
21132 if (GET_CODE (curr_src) == ZERO_EXTEND)
21133 curr_src = XEXP (curr_src, 0);
21134
21135 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
21136 && REG_P (XEXP (XEXP (curr_src, 0), 0))
21137 && REGNO (XEXP (XEXP (curr_src, 0), 0))
21138 == REGNO (SET_DEST (prev_set))
21139 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
21140 XEXP (SET_SRC (prev_set), 0)))
21141 return true;
21142 }
21143 }
21144
21145 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
21146 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
21147 && prev_set && curr_set && any_condjump_p (curr)
21148 && GET_CODE (SET_SRC (prev_set)) == COMPARE
21149 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
21150 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
21151 return true;
21152
21153 /* Fuse flag-setting ALU instructions and conditional branch. */
21154 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
21155 && any_condjump_p (curr))
21156 {
21157 unsigned int condreg1, condreg2;
21158 rtx cc_reg_1;
21159 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
21160 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
21161
21162 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
21163 && prev
21164 && modified_in_p (cc_reg_1, prev))
21165 {
21166 enum attr_type prev_type = get_attr_type (prev);
21167
21168 /* FIXME: this misses some instructions which are considered simple
21169 arithmetic for ThunderX. Simple shifts are missed here. */
21170 if (prev_type == TYPE_ALUS_SREG
21171 || prev_type == TYPE_ALUS_IMM
21172 || prev_type == TYPE_LOGICS_REG
21173 || prev_type == TYPE_LOGICS_IMM)
21174 return true;
21175 }
21176 }
21177
21178 /* Fuse ALU instructions and CBZ/CBNZ. */
21179 if (prev_set
21180 && curr_set
21181 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
21182 && any_condjump_p (curr))
21183 {
21184 /* We're trying to match:
21185 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
21186 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
21187 (const_int 0))
21188 (label_ref ("SYM"))
21189 (pc)) */
21190 if (SET_DEST (curr_set) == (pc_rtx)
21191 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
21192 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
21193 && REG_P (SET_DEST (prev_set))
21194 && REGNO (SET_DEST (prev_set))
21195 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
21196 {
21197 /* Fuse ALU operations followed by conditional branch instruction. */
21198 switch (get_attr_type (prev))
21199 {
21200 case TYPE_ALU_IMM:
21201 case TYPE_ALU_SREG:
21202 case TYPE_ADC_REG:
21203 case TYPE_ADC_IMM:
21204 case TYPE_ADCS_REG:
21205 case TYPE_ADCS_IMM:
21206 case TYPE_LOGIC_REG:
21207 case TYPE_LOGIC_IMM:
21208 case TYPE_CSEL:
21209 case TYPE_ADR:
21210 case TYPE_MOV_IMM:
21211 case TYPE_SHIFT_REG:
21212 case TYPE_SHIFT_IMM:
21213 case TYPE_BFM:
21214 case TYPE_RBIT:
21215 case TYPE_REV:
21216 case TYPE_EXTEND:
21217 return true;
21218
21219 default:;
21220 }
21221 }
21222 }
21223
21224 return false;
21225 }
21226
21227 /* Return true iff the instruction fusion described by OP is enabled. */
21228
21229 bool
21230 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
21231 {
21232 return (aarch64_tune_params.fusible_ops & op) != 0;
21233 }
21234
21235 /* If MEM's address is in the form of [base+offset], extract the two
21236 parts into BASE and OFFSET and return true; otherwise clear BASE and
21237 OFFSET and return false. */
21238
21239 bool
21240 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
21241 {
21242 rtx addr;
21243
21244 gcc_assert (MEM_P (mem));
21245
21246 addr = XEXP (mem, 0);
21247
21248 if (REG_P (addr))
21249 {
21250 *base = addr;
21251 *offset = const0_rtx;
21252 return true;
21253 }
21254
21255 if (GET_CODE (addr) == PLUS
21256 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
21257 {
21258 *base = XEXP (addr, 0);
21259 *offset = XEXP (addr, 1);
21260 return true;
21261 }
21262
21263 *base = NULL_RTX;
21264 *offset = NULL_RTX;
21265
21266 return false;
21267 }
21268
21269 /* Types for scheduling fusion. */
21270 enum sched_fusion_type
21271 {
21272 SCHED_FUSION_NONE = 0,
21273 SCHED_FUSION_LD_SIGN_EXTEND,
21274 SCHED_FUSION_LD_ZERO_EXTEND,
21275 SCHED_FUSION_LD,
21276 SCHED_FUSION_ST,
21277 SCHED_FUSION_NUM
21278 };
21279
21280 /* If INSN is a load or store whose address is in the form of
21281 [base+offset], extract the two parts into BASE and OFFSET. Return
21282 the scheduling fusion type of this INSN. */
21283
21284 static enum sched_fusion_type
21285 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
21286 {
21287 rtx x, dest, src;
21288 enum sched_fusion_type fusion = SCHED_FUSION_LD;
21289
21290 gcc_assert (INSN_P (insn));
21291 x = PATTERN (insn);
21292 if (GET_CODE (x) != SET)
21293 return SCHED_FUSION_NONE;
21294
21295 src = SET_SRC (x);
21296 dest = SET_DEST (x);
21297
21298 machine_mode dest_mode = GET_MODE (dest);
21299
21300 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
21301 return SCHED_FUSION_NONE;
21302
21303 if (GET_CODE (src) == SIGN_EXTEND)
21304 {
21305 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
21306 src = XEXP (src, 0);
21307 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
21308 return SCHED_FUSION_NONE;
21309 }
21310 else if (GET_CODE (src) == ZERO_EXTEND)
21311 {
21312 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
21313 src = XEXP (src, 0);
21314 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
21315 return SCHED_FUSION_NONE;
21316 }
21317
21318 if (GET_CODE (src) == MEM && REG_P (dest))
21319 extract_base_offset_in_addr (src, base, offset);
21320 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
21321 {
21322 fusion = SCHED_FUSION_ST;
21323 extract_base_offset_in_addr (dest, base, offset);
21324 }
21325 else
21326 return SCHED_FUSION_NONE;
21327
21328 if (*base == NULL_RTX || *offset == NULL_RTX)
21329 fusion = SCHED_FUSION_NONE;
21330
21331 return fusion;
21332 }
21333
21334 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
21335
21336 Currently we only support fusing ldr or str instructions, so FUSION_PRI
21337 and PRI are only calculated for these instructions. For other instructions,
21338 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
21339 types of instruction fusion can be added by returning different priorities.
21340
21341 It's important that irrelevant instructions get the largest FUSION_PRI. */
21342
21343 static void
21344 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
21345 int *fusion_pri, int *pri)
21346 {
21347 int tmp, off_val;
21348 rtx base, offset;
21349 enum sched_fusion_type fusion;
21350
21351 gcc_assert (INSN_P (insn));
21352
21353 tmp = max_pri - 1;
21354 fusion = fusion_load_store (insn, &base, &offset);
21355 if (fusion == SCHED_FUSION_NONE)
21356 {
21357 *pri = tmp;
21358 *fusion_pri = tmp;
21359 return;
21360 }
21361
21362 /* Set FUSION_PRI according to fusion type and base register. */
21363 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
21364
21365 /* Calculate PRI. */
21366 tmp /= 2;
21367
21368 /* INSN with smaller offset goes first. */
21369 off_val = (int)(INTVAL (offset));
21370 if (off_val >= 0)
21371 tmp -= (off_val & 0xfffff);
21372 else
21373 tmp += ((- off_val) & 0xfffff);
21374
21375 *pri = tmp;
21376 return;
21377 }
21378
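/* Worked example: for the two stores

     str  w1, [x2, #4]
     str  w1, [x2, #8]

   both get the same FUSION_PRI (same fusion type, same base register),
   while PRI is larger for the #4 store because its offset is smaller,
   so it is preferred first and the pair stays adjacent for fusion.  */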
21379 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
21380 Adjust priority of sha1h instructions so they are scheduled before
21381 other SHA1 instructions. */
21382
21383 static int
21384 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
21385 {
21386 rtx x = PATTERN (insn);
21387
21388 if (GET_CODE (x) == SET)
21389 {
21390 x = SET_SRC (x);
21391
21392 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
21393 return priority + 10;
21394 }
21395
21396 return priority;
21397 }
21398
21399 /* Given OPERANDS of consecutive load/store, check if we can merge
21400 them into ldp/stp. LOAD is true if they are load instructions.
21401 MODE is the mode of memory operands. */
21402
21403 bool
21404 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
21405 machine_mode mode)
21406 {
21407 HOST_WIDE_INT offval_1, offval_2, msize;
21408 enum reg_class rclass_1, rclass_2;
21409 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
21410
21411 if (load)
21412 {
21413 mem_1 = operands[1];
21414 mem_2 = operands[3];
21415 reg_1 = operands[0];
21416 reg_2 = operands[2];
21417 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
21418 if (REGNO (reg_1) == REGNO (reg_2))
21419 return false;
21420 }
21421 else
21422 {
21423 mem_1 = operands[0];
21424 mem_2 = operands[2];
21425 reg_1 = operands[1];
21426 reg_2 = operands[3];
21427 }
21428
21429 /* The mems cannot be volatile. */
21430 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
21431 return false;
21432
21433 /* If we have SImode and slow unaligned ldp,
21434 check that the alignment is at least 8 bytes. */
21435 if (mode == SImode
21436 && (aarch64_tune_params.extra_tuning_flags
21437 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
21438 && !optimize_size
21439 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
21440 return false;
21441
21442 /* Check if the addresses are in the form of [base+offset]. */
21443 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
21444 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
21445 return false;
21446 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
21447 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
21448 return false;
21449
21450 /* Check if the bases are the same. */
21451 if (!rtx_equal_p (base_1, base_2))
21452 return false;
21453
21454 /* The operands must be of the same size. */
21455 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
21456 GET_MODE_SIZE (GET_MODE (mem_2))));
21457
21458 offval_1 = INTVAL (offset_1);
21459 offval_2 = INTVAL (offset_2);
21460 /* We should only be trying this for fixed-sized modes. There is no
21461 SVE LDP/STP instruction. */
21462 msize = GET_MODE_SIZE (mode).to_constant ();
21463 /* Check if the offsets are consecutive. */
21464 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
21465 return false;
21466
21467 /* Check if the addresses are clobbered by load. */
21468 if (load)
21469 {
21470 if (reg_mentioned_p (reg_1, mem_1))
21471 return false;
21472
21473 /* In increasing order, the last load can clobber the address. */
21474 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
21475 return false;
21476 }
21477
21478 /* One of the memory accesses must be a mempair operand.
21479 If it is not the first one, they need to be swapped by the
21480 peephole. */
21481 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
21482 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
21483 return false;
21484
21485 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
21486 rclass_1 = FP_REGS;
21487 else
21488 rclass_1 = GENERAL_REGS;
21489
21490 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
21491 rclass_2 = FP_REGS;
21492 else
21493 rclass_2 = GENERAL_REGS;
21494
21495 /* Check if the registers are of the same class. */
21496 if (rclass_1 != rclass_2)
21497 return false;
21498
21499 return true;
21500 }
21501
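/* Example of a pair that passes the checks above (SImode loads from the
   same base with consecutive offsets and distinct destinations of the
   same register class):

     ldr  w0, [x2, #4]
     ldr  w1, [x2, #8]

   which the ldp/stp peepholes can then merge into

     ldp  w0, w1, [x2, #4]  */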
21502 /* Given OPERANDS of consecutive load/store that can be merged,
21503 swap them if they are not in ascending order. */
21504 void
21505 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
21506 {
21507 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
21508 HOST_WIDE_INT offval_1, offval_2;
21509
21510 if (load)
21511 {
21512 mem_1 = operands[1];
21513 mem_2 = operands[3];
21514 }
21515 else
21516 {
21517 mem_1 = operands[0];
21518 mem_2 = operands[2];
21519 }
21520
21521 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
21522 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
21523
21524 offval_1 = INTVAL (offset_1);
21525 offval_2 = INTVAL (offset_2);
21526
21527 if (offval_1 > offval_2)
21528 {
21529 /* Irrespective of whether this is a load or a store,
21530 we do the same swap. */
21531 std::swap (operands[0], operands[2]);
21532 std::swap (operands[1], operands[3]);
21533 }
21534 }
21535
21536 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
21537 comparison between the two. */
21538 int
21539 aarch64_host_wide_int_compare (const void *x, const void *y)
21540 {
21541 return wi::cmps (* ((const HOST_WIDE_INT *) x),
21542 * ((const HOST_WIDE_INT *) y));
21543 }
21544
21545 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
21546 other pointing to a REG rtx containing an offset, compare the offsets
21547 of the two pairs.
21548
21549 Return:
21550
21551 1 iff offset (X) > offset (Y)
21552 0 iff offset (X) == offset (Y)
21553 -1 iff offset (X) < offset (Y) */
21554 int
21555 aarch64_ldrstr_offset_compare (const void *x, const void *y)
21556 {
21557 const rtx * operands_1 = (const rtx *) x;
21558 const rtx * operands_2 = (const rtx *) y;
21559 rtx mem_1, mem_2, base, offset_1, offset_2;
21560
21561 if (MEM_P (operands_1[0]))
21562 mem_1 = operands_1[0];
21563 else
21564 mem_1 = operands_1[1];
21565
21566 if (MEM_P (operands_2[0]))
21567 mem_2 = operands_2[0];
21568 else
21569 mem_2 = operands_2[1];
21570
21571 /* Extract the offsets. */
21572 extract_base_offset_in_addr (mem_1, &base, &offset_1);
21573 extract_base_offset_in_addr (mem_2, &base, &offset_2);
21574
21575 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
21576
21577 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
21578 }
21579
21580 /* Given OPERANDS of consecutive load/store, check if we can merge
21581 them into ldp/stp by adjusting the offset. LOAD is true if they
21582 are load instructions. MODE is the mode of memory operands.
21583
21584 Given below consecutive stores:
21585
21586 str w1, [xb, 0x100]
21587 str w1, [xb, 0x104]
21588 str w1, [xb, 0x108]
21589 str w1, [xb, 0x10c]
21590
21591 Though the offsets are out of the range supported by stp, we can
21592 still pair them after adjusting the offset, like:
21593
21594 add scratch, xb, 0x100
21595 stp w1, w1, [scratch]
21596 stp w1, w1, [scratch, 0x8]
21597
21598 The peephole patterns detecting this opportunity should guarantee
21599 the scratch register is available. */
21600
21601 bool
21602 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
21603 scalar_mode mode)
21604 {
21605 const int num_insns = 4;
21606 enum reg_class rclass;
21607 HOST_WIDE_INT offvals[num_insns], msize;
21608 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
21609
21610 if (load)
21611 {
21612 for (int i = 0; i < num_insns; i++)
21613 {
21614 reg[i] = operands[2 * i];
21615 mem[i] = operands[2 * i + 1];
21616
21617 gcc_assert (REG_P (reg[i]));
21618 }
21619
21620 /* Do not attempt to merge the loads if the loads clobber each other. */
21621 for (int i = 0; i < 8; i += 2)
21622 for (int j = i + 2; j < 8; j += 2)
21623 if (reg_overlap_mentioned_p (operands[i], operands[j]))
21624 return false;
21625 }
21626 else
21627 for (int i = 0; i < num_insns; i++)
21628 {
21629 mem[i] = operands[2 * i];
21630 reg[i] = operands[2 * i + 1];
21631 }
21632
21633 /* Skip if memory operand is by itself valid for ldp/stp. */
21634 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
21635 return false;
21636
21637 for (int i = 0; i < num_insns; i++)
21638 {
21639 /* The mems cannot be volatile. */
21640 if (MEM_VOLATILE_P (mem[i]))
21641 return false;
21642
21643 /* Check if the addresses are in the form of [base+offset]. */
21644 extract_base_offset_in_addr (mem[i], base + i, offset + i);
21645 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
21646 return false;
21647 }
21648
21649 /* Check if the registers are of the same class. */
21650 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
21651 ? FP_REGS : GENERAL_REGS;
21652
21653 for (int i = 1; i < num_insns; i++)
21654 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
21655 {
21656 if (rclass != FP_REGS)
21657 return false;
21658 }
21659 else
21660 {
21661 if (rclass != GENERAL_REGS)
21662 return false;
21663 }
21664
21665 /* Only the last register in the order in which they occur
21666 may be clobbered by the load. */
21667 if (rclass == GENERAL_REGS && load)
21668 for (int i = 0; i < num_insns - 1; i++)
21669 if (reg_mentioned_p (reg[i], mem[i]))
21670 return false;
21671
21672 /* Check if the bases are the same. */
21673 for (int i = 0; i < num_insns - 1; i++)
21674 if (!rtx_equal_p (base[i], base[i + 1]))
21675 return false;
21676
21677 for (int i = 0; i < num_insns; i++)
21678 offvals[i] = INTVAL (offset[i]);
21679
21680 msize = GET_MODE_SIZE (mode);
21681
21682 /* Check if the offsets can be put in the right order to do a ldp/stp. */
21683 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
21684 aarch64_host_wide_int_compare);
21685
21686 if (!(offvals[1] == offvals[0] + msize
21687 && offvals[3] == offvals[2] + msize))
21688 return false;
21689
21690 /* Check that offsets are within range of each other. The ldp/stp
21691 instructions have 7-bit immediate offsets, so use 0x80. */
21692 if (offvals[2] - offvals[0] >= msize * 0x80)
21693 return false;
21694
21695 /* The offsets must be aligned with respect to each other. */
21696 if (offvals[0] % msize != offvals[2] % msize)
21697 return false;
21698
21699 /* If we have SImode and slow unaligned ldp,
21700 check that the alignment is at least 8 bytes. */
21701 if (mode == SImode
21702 && (aarch64_tune_params.extra_tuning_flags
21703 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
21704 && !optimize_size
21705 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
21706 return false;
21707
21708 return true;
21709 }
21710
21711 /* Given OPERANDS of consecutive load/store, this function pairs them
21712 into LDP/STP after adjusting the offset. It depends on the fact
21713 that the operands can be sorted so the offsets are correct for STP.
21714 MODE is the mode of memory operands. CODE is the rtl operator
21715 which should be applied to all memory operands, it's SIGN_EXTEND,
21716 ZERO_EXTEND or UNKNOWN. */
21717
21718 bool
21719 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
21720 scalar_mode mode, RTX_CODE code)
21721 {
21722 rtx base, offset_1, offset_3, t1, t2;
21723 rtx mem_1, mem_2, mem_3, mem_4;
21724 rtx temp_operands[8];
21725 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
21726 stp_off_upper_limit, stp_off_lower_limit, msize;
21727
21728 /* We make changes on a copy as we may still bail out. */
21729 for (int i = 0; i < 8; i ++)
21730 temp_operands[i] = operands[i];
21731
21732 /* Sort the operands. */
21733 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
21734
21735 /* Copy the memory operands so that if we have to bail for some
21736 reason the original addresses are unchanged. */
21737 if (load)
21738 {
21739 mem_1 = copy_rtx (temp_operands[1]);
21740 mem_2 = copy_rtx (temp_operands[3]);
21741 mem_3 = copy_rtx (temp_operands[5]);
21742 mem_4 = copy_rtx (temp_operands[7]);
21743 }
21744 else
21745 {
21746 mem_1 = copy_rtx (temp_operands[0]);
21747 mem_2 = copy_rtx (temp_operands[2]);
21748 mem_3 = copy_rtx (temp_operands[4]);
21749 mem_4 = copy_rtx (temp_operands[6]);
21750 gcc_assert (code == UNKNOWN);
21751 }
21752
21753 extract_base_offset_in_addr (mem_1, &base, &offset_1);
21754 extract_base_offset_in_addr (mem_3, &base, &offset_3);
21755 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
21756 && offset_3 != NULL_RTX);
21757
21758 /* Adjust offset so it can fit in LDP/STP instruction. */
21759 msize = GET_MODE_SIZE (mode);
21760 stp_off_upper_limit = msize * (0x40 - 1);
21761 stp_off_lower_limit = - msize * 0x40;
21762
21763 off_val_1 = INTVAL (offset_1);
21764 off_val_3 = INTVAL (offset_3);
21765
21766 /* The base offset is optimally halfway between the two STP/LDP offsets. */
21767 if (msize <= 4)
21768 base_off = (off_val_1 + off_val_3) / 2;
21769 else
21770 /* However, due to issues with negative LDP/STP offset generation for
21771 larger modes (DF, DI and vector modes), we must not use negative
21772 addresses smaller than 9 signed unadjusted bits can store. This
21773 provides the most range in this case. */
21774 base_off = off_val_1;
21775
21776 /* Adjust the base so that it is aligned with the addresses but still
21777 optimal. */
21778 if (base_off % msize != off_val_1 % msize)
21779 /* Fix the offset, bearing in mind we want to make it bigger not
21780 smaller. */
21781 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21782 else if (msize <= 4)
21783 /* The negative range of LDP/STP is one larger than the positive range. */
21784 base_off += msize;
21785
21786 /* Check if base offset is too big or too small. We can attempt to resolve
21787 this issue by setting it to the maximum value and seeing if the offsets
21788 still fit. */
21789 if (base_off >= 0x1000)
21790 {
21791 base_off = 0x1000 - 1;
21792 /* We must still make sure that the base offset is aligned with respect
21793 to the address. But it may not be made any bigger. */
21794 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21795 }
21796
21797 /* Likewise for the case where the base is too small. */
21798 if (base_off <= -0x1000)
21799 {
21800 base_off = -0x1000 + 1;
21801 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21802 }
21803
21804 /* Offset of the first STP/LDP. */
21805 new_off_1 = off_val_1 - base_off;
21806
21807 /* Offset of the second STP/LDP. */
21808 new_off_3 = off_val_3 - base_off;
21809
21810 /* The offsets must be within the range of the LDP/STP instructions. */
21811 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
21812 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
21813 return false;
21814
21815 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
21816 new_off_1), true);
21817 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
21818 new_off_1 + msize), true);
21819 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
21820 new_off_3), true);
21821 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
21822 new_off_3 + msize), true);
21823
21824 if (!aarch64_mem_pair_operand (mem_1, mode)
21825 || !aarch64_mem_pair_operand (mem_3, mode))
21826 return false;
21827
21828 if (code == ZERO_EXTEND)
21829 {
21830 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
21831 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
21832 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
21833 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
21834 }
21835 else if (code == SIGN_EXTEND)
21836 {
21837 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
21838 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
21839 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
21840 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
21841 }
21842
21843 if (load)
21844 {
21845 operands[0] = temp_operands[0];
21846 operands[1] = mem_1;
21847 operands[2] = temp_operands[2];
21848 operands[3] = mem_2;
21849 operands[4] = temp_operands[4];
21850 operands[5] = mem_3;
21851 operands[6] = temp_operands[6];
21852 operands[7] = mem_4;
21853 }
21854 else
21855 {
21856 operands[0] = mem_1;
21857 operands[1] = temp_operands[1];
21858 operands[2] = mem_2;
21859 operands[3] = temp_operands[3];
21860 operands[4] = mem_3;
21861 operands[5] = temp_operands[5];
21862 operands[6] = mem_4;
21863 operands[7] = temp_operands[7];
21864 }
21865
21866 /* Emit adjusting instruction. */
21867 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
21868 /* Emit ldp/stp instructions. */
21869 t1 = gen_rtx_SET (operands[0], operands[1]);
21870 t2 = gen_rtx_SET (operands[2], operands[3]);
21871 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21872 t1 = gen_rtx_SET (operands[4], operands[5]);
21873 t2 = gen_rtx_SET (operands[6], operands[7]);
21874 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21875 return true;
21876 }
21877
21878 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
21879 it isn't worth branching around empty masked ops (including masked
21880 stores). */
21881
21882 static bool
21883 aarch64_empty_mask_is_expensive (unsigned)
21884 {
21885 return false;
21886 }
21887
21888 /* Return 1 if pseudo register should be created and used to hold
21889 GOT address for PIC code. */
21890
21891 bool
21892 aarch64_use_pseudo_pic_reg (void)
21893 {
21894 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
21895 }
21896
21897 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
21898
21899 static int
21900 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
21901 {
21902 switch (XINT (x, 1))
21903 {
21904 case UNSPEC_GOTSMALLPIC:
21905 case UNSPEC_GOTSMALLPIC28K:
21906 case UNSPEC_GOTTINYPIC:
21907 return 0;
21908 default:
21909 break;
21910 }
21911
21912 return default_unspec_may_trap_p (x, flags);
21913 }
21914
21915
21916 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
21917 return the log2 of that value. Otherwise return -1. */
21918
21919 int
21920 aarch64_fpconst_pow_of_2 (rtx x)
21921 {
21922 const REAL_VALUE_TYPE *r;
21923
21924 if (!CONST_DOUBLE_P (x))
21925 return -1;
21926
21927 r = CONST_DOUBLE_REAL_VALUE (x);
21928
21929 if (REAL_VALUE_NEGATIVE (*r)
21930 || REAL_VALUE_ISNAN (*r)
21931 || REAL_VALUE_ISINF (*r)
21932 || !real_isinteger (r, DFmode))
21933 return -1;
21934
21935 return exact_log2 (real_to_integer (r));
21936 }
21937
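/* For instance, X == 8.0 yields 3 and X == 1.0 yields 0, while X == 3.0,
   X == 0.5 or any negative or non-CONST_DOUBLE value yields -1.  */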
21938 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of
21939 a power of 2 (i.e. 1/2^n for some positive integer n), return n.
21940 Otherwise return -1. */
21941
21942 int
21943 aarch64_fpconst_pow2_recip (rtx x)
21944 {
21945 REAL_VALUE_TYPE r0;
21946
21947 if (!CONST_DOUBLE_P (x))
21948 return -1;
21949
21950 r0 = *CONST_DOUBLE_REAL_VALUE (x);
21951 if (exact_real_inverse (DFmode, &r0)
21952 && !REAL_VALUE_NEGATIVE (r0))
21953 {
21954 int ret = exact_log2 (real_to_integer (&r0));
21955 if (ret >= 1 && ret <= 32)
21956 return ret;
21957 }
21958 return -1;
21959 }
21960
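/* For instance, X == 0.25 (1/2^2) yields 2 and X == 0.5 yields 1,
   whereas X == 1.0 yields -1 because the result must be at least 1.  */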
21961 /* If X is a vector of equal CONST_DOUBLE values and that value is
21962 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
21963
21964 int
21965 aarch64_vec_fpconst_pow_of_2 (rtx x)
21966 {
21967 int nelts;
21968 if (GET_CODE (x) != CONST_VECTOR
21969 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
21970 return -1;
21971
21972 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
21973 return -1;
21974
21975 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
21976 if (firstval <= 0)
21977 return -1;
21978
21979 for (int i = 1; i < nelts; i++)
21980 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
21981 return -1;
21982
21983 return firstval;
21984 }
21985
21986 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
21987 to float.
21988
21989 __fp16 always promotes through this hook.
21990 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
21991 through the generic excess precision logic rather than here. */
21992
21993 static tree
21994 aarch64_promoted_type (const_tree t)
21995 {
21996 if (SCALAR_FLOAT_TYPE_P (t)
21997 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
21998 return float_type_node;
21999
22000 return NULL_TREE;
22001 }
22002
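/* Example of the effect on C code (a sketch):

     __fp16 a, b;
     __fp16 c = a + b;   // computed in float, then narrowed back to __fp16

   because __fp16 always promotes through this hook, whereas _Float16
   goes through the excess-precision machinery instead.  */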
22003 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
22004
22005 static bool
22006 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
22007 optimization_type opt_type)
22008 {
22009 switch (op)
22010 {
22011 case rsqrt_optab:
22012 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
22013
22014 default:
22015 return true;
22016 }
22017 }
22018
22019 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
22020
22021 static unsigned int
22022 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
22023 int *offset)
22024 {
22025 /* Polynomial invariant 1 == (VG / 2) - 1. */
22026 gcc_assert (i == 1);
22027 *factor = 2;
22028 *offset = 1;
22029 return AARCH64_DWARF_VG;
22030 }
22031
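/* Worked example: on a 256-bit SVE implementation the DWARF VG register
   holds 4 (the vector length in 64-bit granules), so a debugger recovers
   the indeterminate as VG / factor - offset = 4 / 2 - 1 = 1, i.e. one
   128-bit chunk beyond the minimum vector length.  */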
22032 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
22033 if MODE is HFmode, and punt to the generic implementation otherwise. */
22034
22035 static bool
22036 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
22037 {
22038 return (mode == HFmode
22039 ? true
22040 : default_libgcc_floating_mode_supported_p (mode));
22041 }
22042
22043 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
22044 if MODE is HFmode, and punt to the generic implementation otherwise. */
22045
22046 static bool
22047 aarch64_scalar_mode_supported_p (scalar_mode mode)
22048 {
22049 return (mode == HFmode
22050 ? true
22051 : default_scalar_mode_supported_p (mode));
22052 }
22053
22054 /* Set the value of FLT_EVAL_METHOD.
22055 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
22056
22057 0: evaluate all operations and constants, whose semantic type has at
22058 most the range and precision of type float, to the range and
22059 precision of float; evaluate all other operations and constants to
22060 the range and precision of the semantic type;
22061
22062 N, where _FloatN is a supported interchange floating type
22063 evaluate all operations and constants, whose semantic type has at
22064 most the range and precision of _FloatN type, to the range and
22065 precision of the _FloatN type; evaluate all other operations and
22066 constants to the range and precision of the semantic type;
22067
22068 If we have the ARMv8.2-A extensions then we support _Float16 in native
22069 precision, so we should set this to 16. Otherwise, we support the type,
22070 but want to evaluate expressions in float precision, so set this to
22071 0. */
22072
22073 static enum flt_eval_method
22074 aarch64_excess_precision (enum excess_precision_type type)
22075 {
22076 switch (type)
22077 {
22078 case EXCESS_PRECISION_TYPE_FAST:
22079 case EXCESS_PRECISION_TYPE_STANDARD:
22080 /* We can calculate either in 16-bit range and precision or
22081 32-bit range and precision. Make that decision based on whether
22082 we have native support for the ARMv8.2-A 16-bit floating-point
22083 instructions or not. */
22084 return (TARGET_FP_F16INST
22085 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
22086 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
22087 case EXCESS_PRECISION_TYPE_IMPLICIT:
22088 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
22089 default:
22090 gcc_unreachable ();
22091 }
22092 return FLT_EVAL_METHOD_UNPREDICTABLE;
22093 }
22094
22095 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
22096 scheduled for speculative execution. Reject the long-running division
22097 and square-root instructions. */
22098
22099 static bool
22100 aarch64_sched_can_speculate_insn (rtx_insn *insn)
22101 {
22102 switch (get_attr_type (insn))
22103 {
22104 case TYPE_SDIV:
22105 case TYPE_UDIV:
22106 case TYPE_FDIVS:
22107 case TYPE_FDIVD:
22108 case TYPE_FSQRTS:
22109 case TYPE_FSQRTD:
22110 case TYPE_NEON_FP_SQRT_S:
22111 case TYPE_NEON_FP_SQRT_D:
22112 case TYPE_NEON_FP_SQRT_S_Q:
22113 case TYPE_NEON_FP_SQRT_D_Q:
22114 case TYPE_NEON_FP_DIV_S:
22115 case TYPE_NEON_FP_DIV_D:
22116 case TYPE_NEON_FP_DIV_S_Q:
22117 case TYPE_NEON_FP_DIV_D_Q:
22118 return false;
22119 default:
22120 return true;
22121 }
22122 }
22123
22124 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
22125
22126 static int
22127 aarch64_compute_pressure_classes (reg_class *classes)
22128 {
22129 int i = 0;
22130 classes[i++] = GENERAL_REGS;
22131 classes[i++] = FP_REGS;
22132 /* PR_REGS isn't a useful pressure class because many predicate pseudo
22133 registers need to go in PR_LO_REGS at some point during their
22134 lifetime. Splitting it into two halves has the effect of making
22135 all predicates count against PR_LO_REGS, so that we try whenever
22136 possible to restrict the number of live predicates to 8. This
22137 greatly reduces the amount of spilling in certain loops. */
22138 classes[i++] = PR_LO_REGS;
22139 classes[i++] = PR_HI_REGS;
22140 return i;
22141 }
22142
22143 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
22144
22145 static bool
22146 aarch64_can_change_mode_class (machine_mode from,
22147 machine_mode to, reg_class_t)
22148 {
22149 unsigned int from_flags = aarch64_classify_vector_mode (from);
22150 unsigned int to_flags = aarch64_classify_vector_mode (to);
22151
22152 bool from_sve_p = (from_flags & VEC_ANY_SVE);
22153 bool to_sve_p = (to_flags & VEC_ANY_SVE);
22154
22155 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
22156 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
22157
22158 bool from_pred_p = (from_flags & VEC_SVE_PRED);
22159 bool to_pred_p = (to_flags & VEC_SVE_PRED);
22160
22161 /* Don't allow changes between predicate modes and other modes.
22162 Only predicate registers can hold predicate modes and only
22163 non-predicate registers can hold non-predicate modes, so any
22164 attempt to mix them would require a round trip through memory. */
22165 if (from_pred_p != to_pred_p)
22166 return false;
22167
22168 /* Don't allow changes between partial SVE modes and other modes.
22169 The contents of partial SVE modes are distributed evenly across
22170 the register, whereas GCC expects them to be clustered together. */
22171 if (from_partial_sve_p != to_partial_sve_p)
22172 return false;
22173
22174 /* Similarly reject changes between partial SVE modes that have
22175 different patterns of significant and insignificant bits. */
22176 if (from_partial_sve_p
22177 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
22178 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
22179 return false;
22180
22181 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
22182 {
22183 /* Don't allow changes between SVE modes and other modes that might
22184 be bigger than 128 bits. In particular, OImode, CImode and XImode
22185 divide into 128-bit quantities while SVE modes divide into
22186 BITS_PER_SVE_VECTOR quantities. */
22187 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
22188 return false;
22189 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
22190 return false;
22191 }
22192
22193 if (BYTES_BIG_ENDIAN)
22194 {
22195 /* Don't allow changes between SVE data modes and non-SVE modes.
22196 See the comment at the head of aarch64-sve.md for details. */
22197 if (from_sve_p != to_sve_p)
22198 return false;
22199
22200 /* Don't allow changes in element size: lane 0 of the new vector
22201 would not then be lane 0 of the old vector. See the comment
22202 above aarch64_maybe_expand_sve_subreg_move for a more detailed
22203 description.
22204
22205 In the worst case, this forces a register to be spilled in
22206 one mode and reloaded in the other, which handles the
22207 endianness correctly. */
22208 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
22209 return false;
22210 }
22211 return true;
22212 }
22213
22214 /* Implement TARGET_EARLY_REMAT_MODES. */
22215
22216 static void
22217 aarch64_select_early_remat_modes (sbitmap modes)
22218 {
22219 /* SVE values are not normally live across a call, so it should be
22220 worth doing early rematerialization even in VL-specific mode. */
22221 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
22222 if (aarch64_sve_mode_p ((machine_mode) i))
22223 bitmap_set_bit (modes, i);
22224 }
22225
22226 /* Override the default target speculation_safe_value. */
22227 static rtx
22228 aarch64_speculation_safe_value (machine_mode mode,
22229 rtx result, rtx val, rtx failval)
22230 {
22231 /* Maybe we should warn if falling back to hard barriers. They are
22232 likely to be noticeably more expensive than the alternative below. */
22233 if (!aarch64_track_speculation)
22234 return default_speculation_safe_value (mode, result, val, failval);
22235
22236 if (!REG_P (val))
22237 val = copy_to_mode_reg (mode, val);
22238
22239 if (!aarch64_reg_or_zero (failval, mode))
22240 failval = copy_to_mode_reg (mode, failval);
22241
22242 emit_insn (gen_despeculate_copy (mode, result, val, failval));
22243 return result;
22244 }
22245
22246 /* Implement TARGET_ESTIMATED_POLY_VALUE.
22247 Look into the tuning structure for an estimate.
22248 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
22249 Advanced SIMD 128 bits. */
22250
22251 static HOST_WIDE_INT
22252 aarch64_estimated_poly_value (poly_int64 val)
22253 {
22254 enum aarch64_sve_vector_bits_enum width_source
22255 = aarch64_tune_params.sve_width;
22256
22257 /* If we still don't have an estimate, use the default. */
22258 if (width_source == SVE_SCALABLE)
22259 return default_estimated_poly_value (val);
22260
22261 HOST_WIDE_INT over_128 = width_source - 128;
22262 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
22263 }
22264
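/* Worked example, assuming the tuning structure reports SVE_256:
   over_128 = 256 - 128 = 128, so a poly_int64 of 16 + 16x (the byte
   size of an SVE vector) is estimated as 16 + 16 * 128 / 128 = 32,
   matching a 256-bit vector.  */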
22265
22266 /* Return true for types that could be supported as SIMD return or
22267 argument types. */
22268
22269 static bool
22270 supported_simd_type (tree t)
22271 {
22272 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
22273 {
22274 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
22275 return s == 1 || s == 2 || s == 4 || s == 8;
22276 }
22277 return false;
22278 }
22279
22280 /* Return true for types that currently are supported as SIMD return
22281 or argument types. */
22282
22283 static bool
22284 currently_supported_simd_type (tree t, tree b)
22285 {
22286 if (COMPLEX_FLOAT_TYPE_P (t))
22287 return false;
22288
22289 if (TYPE_SIZE (t) != TYPE_SIZE (b))
22290 return false;
22291
22292 return supported_simd_type (t);
22293 }
22294
22295 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
22296
22297 static int
22298 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
22299 struct cgraph_simd_clone *clonei,
22300 tree base_type, int num)
22301 {
22302 tree t, ret_type, arg_type;
22303 unsigned int elt_bits, vec_bits, count;
22304
22305 if (!TARGET_SIMD)
22306 return 0;
22307
22308 if (clonei->simdlen
22309 && (clonei->simdlen < 2
22310 || clonei->simdlen > 1024
22311 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
22312 {
22313 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22314 "unsupported simdlen %d", clonei->simdlen);
22315 return 0;
22316 }
22317
22318 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
22319 if (TREE_CODE (ret_type) != VOID_TYPE
22320 && !currently_supported_simd_type (ret_type, base_type))
22321 {
22322 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
22323 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22324 "GCC does not currently support mixed size types "
22325 "for %<simd%> functions");
22326 else if (supported_simd_type (ret_type))
22327 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22328 "GCC does not currently support return type %qT "
22329 "for %<simd%> functions", ret_type);
22330 else
22331 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22332 "unsupported return type %qT for %<simd%> functions",
22333 ret_type);
22334 return 0;
22335 }
22336
22337 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
22338 {
22339 arg_type = TREE_TYPE (t);
22340
22341 if (!currently_supported_simd_type (arg_type, base_type))
22342 {
22343 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
22344 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22345 "GCC does not currently support mixed size types "
22346 "for %<simd%> functions");
22347 else
22348 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22349 "GCC does not currently support argument type %qT "
22350 "for %<simd%> functions", arg_type);
22351 return 0;
22352 }
22353 }
22354
22355 clonei->vecsize_mangle = 'n';
22356 clonei->mask_mode = VOIDmode;
22357 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
22358 if (clonei->simdlen == 0)
22359 {
22360 count = 2;
22361 vec_bits = (num == 0 ? 64 : 128);
22362 clonei->simdlen = vec_bits / elt_bits;
22363 }
22364 else
22365 {
22366 count = 1;
22367 vec_bits = clonei->simdlen * elt_bits;
22368 if (vec_bits != 64 && vec_bits != 128)
22369 {
22370 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22371 "GCC does not currently support simdlen %d for type %qT",
22372 clonei->simdlen, base_type);
22373 return 0;
22374 }
22375 }
22376 clonei->vecsize_int = vec_bits;
22377 clonei->vecsize_float = vec_bits;
22378 return count;
22379 }
22380
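/* Worked example: for a "declare simd" function operating on 32-bit
   elements with no explicit simdlen, the code above creates two clones,
   one with simdlen 64/32 = 2 (64-bit vectors) and one with
   simdlen 128/32 = 4 (128-bit vectors), both using the 'n' mangling.  */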
22381 /* Implement TARGET_SIMD_CLONE_ADJUST. */
22382
22383 static void
22384 aarch64_simd_clone_adjust (struct cgraph_node *node)
22385 {
22386 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
22387 use the correct ABI. */
22388
22389 tree t = TREE_TYPE (node->decl);
22390 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
22391 TYPE_ATTRIBUTES (t));
22392 }
22393
22394 /* Implement TARGET_SIMD_CLONE_USABLE. */
22395
22396 static int
22397 aarch64_simd_clone_usable (struct cgraph_node *node)
22398 {
22399 switch (node->simdclone->vecsize_mangle)
22400 {
22401 case 'n':
22402 if (!TARGET_SIMD)
22403 return -1;
22404 return 0;
22405 default:
22406 gcc_unreachable ();
22407 }
22408 }
22409
22410 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
22411
22412 static int
22413 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
22414 {
22415 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
22416 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
22417 return 0;
22418 return 1;
22419 }
22420
22421 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
22422
22423 static const char *
22424 aarch64_get_multilib_abi_name (void)
22425 {
22426 if (TARGET_BIG_END)
22427 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
22428 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
22429 }
22430
22431 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
22432 global-variable-based guard, use the default; otherwise
22433 return a null tree. */
22434 static tree
22435 aarch64_stack_protect_guard (void)
22436 {
22437 if (aarch64_stack_protector_guard == SSP_GLOBAL)
22438 return default_stack_protect_guard ();
22439
22440 return NULL_TREE;
22441 }
22442
22443 /* Return the diagnostic message string if conversion from FROMTYPE to
22444 TOTYPE is not allowed, NULL otherwise. */
22445
22446 static const char *
22447 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
22448 {
22449 if (element_mode (fromtype) != element_mode (totype))
22450 {
22451 /* Do not allow conversions to/from BFmode scalar types. */
22452 if (TYPE_MODE (fromtype) == BFmode)
22453 return N_("invalid conversion from type %<bfloat16_t%>");
22454 if (TYPE_MODE (totype) == BFmode)
22455 return N_("invalid conversion to type %<bfloat16_t%>");
22456 }
22457
22458 /* Conversion allowed. */
22459 return NULL;
22460 }
22461
22462 /* Return the diagnostic message string if the unary operation OP is
22463 not permitted on TYPE, NULL otherwise. */
22464
22465 static const char *
22466 aarch64_invalid_unary_op (int op, const_tree type)
22467 {
22468 /* Reject all single-operand operations on BFmode except for &. */
22469 if (element_mode (type) == BFmode && op != ADDR_EXPR)
22470 return N_("operation not permitted on type %<bfloat16_t%>");
22471
22472 /* Operation allowed. */
22473 return NULL;
22474 }
22475
22476 /* Return the diagnostic message string if the binary operation OP is
22477 not permitted on TYPE1 and TYPE2, NULL otherwise. */
22478
22479 static const char *
22480 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
22481 const_tree type2)
22482 {
22483 /* Reject all 2-operand operations on BFmode. */
22484 if (element_mode (type1) == BFmode
22485 || element_mode (type2) == BFmode)
22486 return N_("operation not permitted on type %<bfloat16_t%>");
22487
22488 if (VECTOR_TYPE_P (type1)
22489 && VECTOR_TYPE_P (type2)
22490 && !TYPE_INDIVISIBLE_P (type1)
22491 && !TYPE_INDIVISIBLE_P (type2)
22492 && (aarch64_sve::builtin_type_p (type1)
22493 != aarch64_sve::builtin_type_p (type2)))
22494 return N_("cannot combine GNU and SVE vectors in a binary operation");
22495
22496 /* Operation allowed. */
22497 return NULL;
22498 }
22499
22500 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
22501 section at the end if needed. */
22502 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
22503 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
22504 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
22505 void
22506 aarch64_file_end_indicate_exec_stack ()
22507 {
22508 file_end_indicate_exec_stack ();
22509
22510 unsigned feature_1_and = 0;
22511 if (aarch64_bti_enabled ())
22512 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
22513
22514 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
22515 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
22516
22517 if (feature_1_and)
22518 {
22519 /* Generate .note.gnu.property section. */
22520 switch_to_section (get_section (".note.gnu.property",
22521 SECTION_NOTYPE, NULL));
22522
22523 /* PT_NOTE header: namesz, descsz, type.
22524 namesz = 4 ("GNU\0")
22525 descsz = 16 (Size of the program property array)
22526 [(12 + padding) * Number of array elements]
22527 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
22528 assemble_align (POINTER_SIZE);
22529 assemble_integer (GEN_INT (4), 4, 32, 1);
22530 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
22531 assemble_integer (GEN_INT (5), 4, 32, 1);
22532
22533 /* PT_NOTE name. */
22534 assemble_string ("GNU", 4);
22535
22536 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
22537 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
22538 datasz = 4
22539 data = feature_1_and. */
22540 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
22541 assemble_integer (GEN_INT (4), 4, 32, 1);
22542 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
22543
22544 /* Pad the size of the note to the required alignment. */
22545 assemble_align (POINTER_SIZE);
22546 }
22547 }
22548 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
22549 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
22550 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
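
/* For reference, a rough sketch of the directives the function above emits
   when both BTI and return-address signing are enabled (feature_1_and == 3);
   the exact spelling depends on the assembler dialect and target macros:

	.section  .note.gnu.property
	.align    3
	.word     4               // namesz ("GNU\0")
	.word     16              // descsz (12 rounded up to POINTER_BYTES)
	.word     5               // type   (NT_GNU_PROPERTY_TYPE_0)
	.string   "GNU"
	.word     0xc0000000      // GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word     4               // datasz
	.word     3               // BTI | PAC
	.align    3
*/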
22551
22552 /* Target-specific selftests. */
22553
22554 #if CHECKING_P
22555
22556 namespace selftest {
22557
22558 /* Selftest for the RTL loader.
22559 Verify that the RTL loader copes with a dump from
22560 print_rtx_function. This is essentially just a test that class
22561 function_reader can handle a real dump, but it also verifies
22562 that lookup_reg_by_dump_name correctly handles hard regs.
22563 The presence of hard reg names in the dump means that the test is
22564 target-specific, hence it is in this file. */
22565
22566 static void
22567 aarch64_test_loading_full_dump ()
22568 {
22569 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
22570
22571 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
22572
22573 rtx_insn *insn_1 = get_insn_by_uid (1);
22574 ASSERT_EQ (NOTE, GET_CODE (insn_1));
22575
22576 rtx_insn *insn_15 = get_insn_by_uid (15);
22577 ASSERT_EQ (INSN, GET_CODE (insn_15));
22578 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
22579
22580 /* Verify crtl->return_rtx. */
22581 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
22582 ASSERT_EQ (0, REGNO (crtl->return_rtx));
22583 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
22584 }
22585
22586 /* Run all target-specific selftests. */
22587
22588 static void
22589 aarch64_run_selftests (void)
22590 {
22591 aarch64_test_loading_full_dump ();
22592 }
22593
22594 } // namespace selftest
22595
22596 #endif /* #if CHECKING_P */
22597
22598 #undef TARGET_STACK_PROTECT_GUARD
22599 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
22600
22601 #undef TARGET_ADDRESS_COST
22602 #define TARGET_ADDRESS_COST aarch64_address_cost
22603
22604 /* This hook determines whether unnamed bitfields affect the alignment
22605 of the containing structure. The hook returns true if the structure
22606 should inherit the alignment requirements of an unnamed bitfield's
22607 type. */
22608 #undef TARGET_ALIGN_ANON_BITFIELD
22609 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
22610
22611 #undef TARGET_ASM_ALIGNED_DI_OP
22612 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
22613
22614 #undef TARGET_ASM_ALIGNED_HI_OP
22615 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
22616
22617 #undef TARGET_ASM_ALIGNED_SI_OP
22618 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
22619
22620 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
22621 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
22622 hook_bool_const_tree_hwi_hwi_const_tree_true
22623
22624 #undef TARGET_ASM_FILE_START
22625 #define TARGET_ASM_FILE_START aarch64_start_file
22626
22627 #undef TARGET_ASM_OUTPUT_MI_THUNK
22628 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
22629
22630 #undef TARGET_ASM_SELECT_RTX_SECTION
22631 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
22632
22633 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
22634 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
22635
22636 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
22637 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
22638
22639 #undef TARGET_BUILD_BUILTIN_VA_LIST
22640 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
22641
22642 #undef TARGET_CALLEE_COPIES
22643 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
22644
22645 #undef TARGET_CAN_ELIMINATE
22646 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
22647
22648 #undef TARGET_CAN_INLINE_P
22649 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
22650
22651 #undef TARGET_CANNOT_FORCE_CONST_MEM
22652 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
22653
22654 #undef TARGET_CASE_VALUES_THRESHOLD
22655 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
22656
22657 #undef TARGET_CONDITIONAL_REGISTER_USAGE
22658 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
22659
22660 #undef TARGET_MEMBER_TYPE_FORCES_BLK
22661 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
22662
22663 /* Only the least significant bit is used for initialization guard
22664 variables. */
22665 #undef TARGET_CXX_GUARD_MASK_BIT
22666 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
22667
22668 #undef TARGET_C_MODE_FOR_SUFFIX
22669 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
22670
22671 #ifdef TARGET_BIG_ENDIAN_DEFAULT
22672 #undef TARGET_DEFAULT_TARGET_FLAGS
22673 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
22674 #endif
22675
22676 #undef TARGET_CLASS_MAX_NREGS
22677 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
22678
22679 #undef TARGET_BUILTIN_DECL
22680 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
22681
22682 #undef TARGET_BUILTIN_RECIPROCAL
22683 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
22684
22685 #undef TARGET_C_EXCESS_PRECISION
22686 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
22687
22688 #undef TARGET_EXPAND_BUILTIN
22689 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
22690
22691 #undef TARGET_EXPAND_BUILTIN_VA_START
22692 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
22693
22694 #undef TARGET_FOLD_BUILTIN
22695 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
22696
22697 #undef TARGET_FUNCTION_ARG
22698 #define TARGET_FUNCTION_ARG aarch64_function_arg
22699
22700 #undef TARGET_FUNCTION_ARG_ADVANCE
22701 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
22702
22703 #undef TARGET_FUNCTION_ARG_BOUNDARY
22704 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
22705
22706 #undef TARGET_FUNCTION_ARG_PADDING
22707 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
22708
22709 #undef TARGET_GET_RAW_RESULT_MODE
22710 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
22711 #undef TARGET_GET_RAW_ARG_MODE
22712 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
22713
22714 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
22715 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
22716
22717 #undef TARGET_FUNCTION_VALUE
22718 #define TARGET_FUNCTION_VALUE aarch64_function_value
22719
22720 #undef TARGET_FUNCTION_VALUE_REGNO_P
22721 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
22722
22723 #undef TARGET_GIMPLE_FOLD_BUILTIN
22724 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
22725
22726 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
22727 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
22728
22729 #undef TARGET_INIT_BUILTINS
22730 #define TARGET_INIT_BUILTINS aarch64_init_builtins
22731
22732 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
22733 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
22734 aarch64_ira_change_pseudo_allocno_class
22735
22736 #undef TARGET_LEGITIMATE_ADDRESS_P
22737 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
22738
22739 #undef TARGET_LEGITIMATE_CONSTANT_P
22740 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
22741
22742 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
22743 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
22744 aarch64_legitimize_address_displacement
22745
22746 #undef TARGET_LIBGCC_CMP_RETURN_MODE
22747 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
22748
22749 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
22750 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
22751 aarch64_libgcc_floating_mode_supported_p
22752
22753 #undef TARGET_MANGLE_TYPE
22754 #define TARGET_MANGLE_TYPE aarch64_mangle_type
22755
22756 #undef TARGET_INVALID_CONVERSION
22757 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
22758
22759 #undef TARGET_INVALID_UNARY_OP
22760 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
22761
22762 #undef TARGET_INVALID_BINARY_OP
22763 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
22764
22765 #undef TARGET_VERIFY_TYPE_CONTEXT
22766 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
22767
22768 #undef TARGET_MEMORY_MOVE_COST
22769 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
22770
22771 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
22772 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
22773
22774 #undef TARGET_MUST_PASS_IN_STACK
22775 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
22776
22777 /* This target hook should return true if accesses to volatile bitfields
22778 should use the narrowest mode possible. It should return false if these
22779 accesses should use the bitfield container type. */
22780 #undef TARGET_NARROW_VOLATILE_BITFIELD
22781 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
22782
22783 #undef TARGET_OPTION_OVERRIDE
22784 #define TARGET_OPTION_OVERRIDE aarch64_override_options
22785
22786 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
22787 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
22788 aarch64_override_options_after_change
22789
22790 #undef TARGET_OPTION_SAVE
22791 #define TARGET_OPTION_SAVE aarch64_option_save
22792
22793 #undef TARGET_OPTION_RESTORE
22794 #define TARGET_OPTION_RESTORE aarch64_option_restore
22795
22796 #undef TARGET_OPTION_PRINT
22797 #define TARGET_OPTION_PRINT aarch64_option_print
22798
22799 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
22800 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
22801
22802 #undef TARGET_SET_CURRENT_FUNCTION
22803 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
22804
22805 #undef TARGET_PASS_BY_REFERENCE
22806 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
22807
22808 #undef TARGET_PREFERRED_RELOAD_CLASS
22809 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
22810
22811 #undef TARGET_SCHED_REASSOCIATION_WIDTH
22812 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
22813
22814 #undef TARGET_PROMOTED_TYPE
22815 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
22816
22817 #undef TARGET_SECONDARY_RELOAD
22818 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
22819
22820 #undef TARGET_SHIFT_TRUNCATION_MASK
22821 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
22822
22823 #undef TARGET_SETUP_INCOMING_VARARGS
22824 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
22825
22826 #undef TARGET_STRUCT_VALUE_RTX
22827 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
22828
22829 #undef TARGET_REGISTER_MOVE_COST
22830 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
22831
22832 #undef TARGET_RETURN_IN_MEMORY
22833 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
22834
22835 #undef TARGET_RETURN_IN_MSB
22836 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
22837
22838 #undef TARGET_RTX_COSTS
22839 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
22840
22841 #undef TARGET_SCALAR_MODE_SUPPORTED_P
22842 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
22843
22844 #undef TARGET_SCHED_ISSUE_RATE
22845 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
22846
22847 #undef TARGET_SCHED_VARIABLE_ISSUE
22848 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
22849
22850 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
22851 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
22852 aarch64_sched_first_cycle_multipass_dfa_lookahead
22853
22854 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
22855 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
22856 aarch64_first_cycle_multipass_dfa_lookahead_guard
22857
22858 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
22859 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
22860 aarch64_get_separate_components
22861
22862 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
22863 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
22864 aarch64_components_for_bb
22865
22866 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
22867 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
22868 aarch64_disqualify_components
22869
22870 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
22871 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
22872 aarch64_emit_prologue_components
22873
22874 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
22875 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
22876 aarch64_emit_epilogue_components
22877
22878 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
22879 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
22880 aarch64_set_handled_components
22881
22882 #undef TARGET_TRAMPOLINE_INIT
22883 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
22884
22885 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
22886 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
22887
22888 #undef TARGET_VECTOR_MODE_SUPPORTED_P
22889 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
22890
22891 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
22892 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
22893
22894 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
22895 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
22896 aarch64_builtin_support_vector_misalignment
22897
22898 #undef TARGET_ARRAY_MODE
22899 #define TARGET_ARRAY_MODE aarch64_array_mode
22900
22901 #undef TARGET_ARRAY_MODE_SUPPORTED_P
22902 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
22903
22904 #undef TARGET_VECTORIZE_ADD_STMT_COST
22905 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
22906
22907 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
22908 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
22909 aarch64_builtin_vectorization_cost
22910
22911 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
22912 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
22913
22914 #undef TARGET_VECTORIZE_BUILTINS
22915 #define TARGET_VECTORIZE_BUILTINS
22916
22917 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
22918 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
22919 aarch64_builtin_vectorized_function
22920
22921 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
22922 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
22923 aarch64_autovectorize_vector_modes
22924
22925 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
22926 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
22927 aarch64_atomic_assign_expand_fenv
22928
22929 /* Section anchor support. */
22930
22931 #undef TARGET_MIN_ANCHOR_OFFSET
22932 #define TARGET_MIN_ANCHOR_OFFSET -256
22933
22934 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
22935 byte offset; we can do much more for larger data types, but have no way
22936 to determine the size of the access. We assume accesses are aligned. */
22937 #undef TARGET_MAX_ANCHOR_OFFSET
22938 #define TARGET_MAX_ANCHOR_OFFSET 4095
22939
22940 #undef TARGET_VECTOR_ALIGNMENT
22941 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
22942
22943 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
22944 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
22945 aarch64_vectorize_preferred_vector_alignment
22946 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
22947 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
22948 aarch64_simd_vector_alignment_reachable
22949
22950 /* vec_perm support. */
22951
22952 #undef TARGET_VECTORIZE_VEC_PERM_CONST
22953 #define TARGET_VECTORIZE_VEC_PERM_CONST \
22954 aarch64_vectorize_vec_perm_const
22955
22956 #undef TARGET_VECTORIZE_RELATED_MODE
22957 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
22958 #undef TARGET_VECTORIZE_GET_MASK_MODE
22959 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
22960 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
22961 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
22962 aarch64_empty_mask_is_expensive
22963 #undef TARGET_PREFERRED_ELSE_VALUE
22964 #define TARGET_PREFERRED_ELSE_VALUE \
22965 aarch64_preferred_else_value
22966
22967 #undef TARGET_INIT_LIBFUNCS
22968 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
22969
22970 #undef TARGET_FIXED_CONDITION_CODE_REGS
22971 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
22972
22973 #undef TARGET_FLAGS_REGNUM
22974 #define TARGET_FLAGS_REGNUM CC_REGNUM
22975
22976 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
22977 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
22978
22979 #undef TARGET_ASAN_SHADOW_OFFSET
22980 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
22981
22982 #undef TARGET_LEGITIMIZE_ADDRESS
22983 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
22984
22985 #undef TARGET_SCHED_CAN_SPECULATE_INSN
22986 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
22987
22988 #undef TARGET_CAN_USE_DOLOOP_P
22989 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
22990
22991 #undef TARGET_SCHED_ADJUST_PRIORITY
22992 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
22993
22994 #undef TARGET_SCHED_MACRO_FUSION_P
22995 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
22996
22997 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
22998 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
22999
23000 #undef TARGET_SCHED_FUSION_PRIORITY
23001 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
23002
23003 #undef TARGET_UNSPEC_MAY_TRAP_P
23004 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
23005
23006 #undef TARGET_USE_PSEUDO_PIC_REG
23007 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
23008
23009 #undef TARGET_PRINT_OPERAND
23010 #define TARGET_PRINT_OPERAND aarch64_print_operand
23011
23012 #undef TARGET_PRINT_OPERAND_ADDRESS
23013 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
23014
23015 #undef TARGET_OPTAB_SUPPORTED_P
23016 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
23017
23018 #undef TARGET_OMIT_STRUCT_RETURN_REG
23019 #define TARGET_OMIT_STRUCT_RETURN_REG true
23020
23021 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
23022 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
23023 aarch64_dwarf_poly_indeterminate_value
23024
23025 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
23026 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
23027 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
23028
23029 #undef TARGET_HARD_REGNO_NREGS
23030 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
23031 #undef TARGET_HARD_REGNO_MODE_OK
23032 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
23033
23034 #undef TARGET_MODES_TIEABLE_P
23035 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
23036
23037 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
23038 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
23039 aarch64_hard_regno_call_part_clobbered
23040
23041 #undef TARGET_INSN_CALLEE_ABI
23042 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
23043
23044 #undef TARGET_CONSTANT_ALIGNMENT
23045 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
23046
23047 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
23048 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
23049 aarch64_stack_clash_protection_alloca_probe_range
23050
23051 #undef TARGET_COMPUTE_PRESSURE_CLASSES
23052 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
23053
23054 #undef TARGET_CAN_CHANGE_MODE_CLASS
23055 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
23056
23057 #undef TARGET_SELECT_EARLY_REMAT_MODES
23058 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
23059
23060 #undef TARGET_SPECULATION_SAFE_VALUE
23061 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
23062
23063 #undef TARGET_ESTIMATED_POLY_VALUE
23064 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
23065
23066 #undef TARGET_ATTRIBUTE_TABLE
23067 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
23068
23069 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
23070 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
23071 aarch64_simd_clone_compute_vecsize_and_simdlen
23072
23073 #undef TARGET_SIMD_CLONE_ADJUST
23074 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
23075
23076 #undef TARGET_SIMD_CLONE_USABLE
23077 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
23078
23079 #undef TARGET_COMP_TYPE_ATTRIBUTES
23080 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
23081
23082 #undef TARGET_GET_MULTILIB_ABI_NAME
23083 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
23084
23085 #undef TARGET_FNTYPE_ABI
23086 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
23087
23088 #if CHECKING_P
23089 #undef TARGET_RUN_TARGET_SELFTESTS
23090 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
23091 #endif /* #if CHECKING_P */
23092
23093 #undef TARGET_ASM_POST_CFI_STARTPROC
23094 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
23095
23096 #undef TARGET_STRICT_ARGUMENT_NAMING
23097 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
23098
23099 #undef TARGET_MD_ASM_ADJUST
23100 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
23101
23102 struct gcc_target targetm = TARGET_INITIALIZER;
23103
23104 #include "gt-aarch64.h"