]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/config/aarch64/aarch64.c
PR other/16615 [1/5]
[thirdparty/gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80
81 /* Information about a legitimate vector immediate operand. */
82 struct simd_immediate_info
83 {
84 enum insn_type { MOV, MVN };
85 enum modifier_type { LSL, MSL };
86
87 simd_immediate_info () {}
88 simd_immediate_info (scalar_float_mode, rtx);
89 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
90 insn_type = MOV, modifier_type = LSL,
91 unsigned int = 0);
92 simd_immediate_info (scalar_mode, rtx, rtx);
93
94 /* The mode of the elements. */
95 scalar_mode elt_mode;
96
97 /* The value of each element if all elements are the same, or the
98 first value if the constant is a series. */
99 rtx value;
100
101 /* The value of the step if the constant is a series, null otherwise. */
102 rtx step;
103
104 /* The instruction to use to move the immediate into a vector. */
105 insn_type insn;
106
107 /* The kind of shift modifier to use, and the number of bits to shift.
108 This is (LSL, 0) if no shift is needed. */
109 modifier_type modifier;
110 unsigned int shift;
111 };
112
113 /* Construct a floating-point immediate in which each element has mode
114 ELT_MODE_IN and value VALUE_IN. */
115 inline simd_immediate_info
116 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
117 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
118 modifier (LSL), shift (0)
119 {}
120
121 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
122 and value VALUE_IN. The other parameters are as for the structure
123 fields. */
124 inline simd_immediate_info
125 ::simd_immediate_info (scalar_int_mode elt_mode_in,
126 unsigned HOST_WIDE_INT value_in,
127 insn_type insn_in, modifier_type modifier_in,
128 unsigned int shift_in)
129 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
130 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
131 {}
132
133 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
134 and where element I is equal to VALUE_IN + I * STEP_IN. */
135 inline simd_immediate_info
136 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
137 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
138 modifier (LSL), shift (0)
139 {}
140
141 /* The current code model. */
142 enum aarch64_code_model aarch64_cmodel;
143
144 /* The number of 64-bit elements in an SVE vector. */
145 poly_uint16 aarch64_sve_vg;
146
147 #ifdef HAVE_AS_TLS
148 #undef TARGET_HAVE_TLS
149 #define TARGET_HAVE_TLS 1
150 #endif
151
152 static bool aarch64_composite_type_p (const_tree, machine_mode);
153 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
154 const_tree,
155 machine_mode *, int *,
156 bool *);
157 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
158 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
159 static void aarch64_override_options_after_change (void);
160 static bool aarch64_vector_mode_supported_p (machine_mode);
161 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
162 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
163 const_tree type,
164 int misalignment,
165 bool is_packed);
166 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
167 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
168 aarch64_addr_query_type);
169 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
170
171 /* Major revision number of the ARM Architecture implemented by the target. */
172 unsigned aarch64_architecture_version;
173
174 /* The processor for which instructions should be scheduled. */
175 enum aarch64_processor aarch64_tune = cortexa53;
176
177 /* Mask to specify which instruction scheduling options should be used. */
178 unsigned long aarch64_tune_flags = 0;
179
180 /* Global flag for PC relative loads. */
181 bool aarch64_pcrelative_literal_loads;
182
183 /* Global flag for whether frame pointer is enabled. */
184 bool aarch64_use_frame_pointer;
185
186 #define BRANCH_PROTECT_STR_MAX 255
187 char *accepted_branch_protection_string = NULL;
188
189 static enum aarch64_parse_opt_result
190 aarch64_parse_branch_protection (const char*, char**);
191
192 /* Support for command line parsing of boolean flags in the tuning
193 structures. */
194 struct aarch64_flag_desc
195 {
196 const char* name;
197 unsigned int flag;
198 };
199
200 #define AARCH64_FUSION_PAIR(name, internal_name) \
201 { name, AARCH64_FUSE_##internal_name },
202 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
203 {
204 { "none", AARCH64_FUSE_NOTHING },
205 #include "aarch64-fusion-pairs.def"
206 { "all", AARCH64_FUSE_ALL },
207 { NULL, AARCH64_FUSE_NOTHING }
208 };
209
210 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
211 { name, AARCH64_EXTRA_TUNE_##internal_name },
212 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
213 {
214 { "none", AARCH64_EXTRA_TUNE_NONE },
215 #include "aarch64-tuning-flags.def"
216 { "all", AARCH64_EXTRA_TUNE_ALL },
217 { NULL, AARCH64_EXTRA_TUNE_NONE }
218 };
219
220 /* Tuning parameters. */
221
222 static const struct cpu_addrcost_table generic_addrcost_table =
223 {
224 {
225 1, /* hi */
226 0, /* si */
227 0, /* di */
228 1, /* ti */
229 },
230 0, /* pre_modify */
231 0, /* post_modify */
232 0, /* register_offset */
233 0, /* register_sextend */
234 0, /* register_zextend */
235 0 /* imm_offset */
236 };
237
238 static const struct cpu_addrcost_table exynosm1_addrcost_table =
239 {
240 {
241 0, /* hi */
242 0, /* si */
243 0, /* di */
244 2, /* ti */
245 },
246 0, /* pre_modify */
247 0, /* post_modify */
248 1, /* register_offset */
249 1, /* register_sextend */
250 2, /* register_zextend */
251 0, /* imm_offset */
252 };
253
254 static const struct cpu_addrcost_table xgene1_addrcost_table =
255 {
256 {
257 1, /* hi */
258 0, /* si */
259 0, /* di */
260 1, /* ti */
261 },
262 1, /* pre_modify */
263 1, /* post_modify */
264 0, /* register_offset */
265 1, /* register_sextend */
266 1, /* register_zextend */
267 0, /* imm_offset */
268 };
269
270 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
271 {
272 {
273 1, /* hi */
274 1, /* si */
275 1, /* di */
276 2, /* ti */
277 },
278 0, /* pre_modify */
279 0, /* post_modify */
280 2, /* register_offset */
281 3, /* register_sextend */
282 3, /* register_zextend */
283 0, /* imm_offset */
284 };
285
286 static const struct cpu_addrcost_table tsv110_addrcost_table =
287 {
288 {
289 1, /* hi */
290 0, /* si */
291 0, /* di */
292 1, /* ti */
293 },
294 0, /* pre_modify */
295 0, /* post_modify */
296 0, /* register_offset */
297 1, /* register_sextend */
298 1, /* register_zextend */
299 0, /* imm_offset */
300 };
301
302 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
303 {
304 {
305 1, /* hi */
306 1, /* si */
307 1, /* di */
308 2, /* ti */
309 },
310 1, /* pre_modify */
311 1, /* post_modify */
312 3, /* register_offset */
313 3, /* register_sextend */
314 3, /* register_zextend */
315 2, /* imm_offset */
316 };
317
318 static const struct cpu_regmove_cost generic_regmove_cost =
319 {
320 1, /* GP2GP */
321 /* Avoid the use of slow int<->fp moves for spilling by setting
322 their cost higher than memmov_cost. */
323 5, /* GP2FP */
324 5, /* FP2GP */
325 2 /* FP2FP */
326 };
327
328 static const struct cpu_regmove_cost cortexa57_regmove_cost =
329 {
330 1, /* GP2GP */
331 /* Avoid the use of slow int<->fp moves for spilling by setting
332 their cost higher than memmov_cost. */
333 5, /* GP2FP */
334 5, /* FP2GP */
335 2 /* FP2FP */
336 };
337
338 static const struct cpu_regmove_cost cortexa53_regmove_cost =
339 {
340 1, /* GP2GP */
341 /* Avoid the use of slow int<->fp moves for spilling by setting
342 their cost higher than memmov_cost. */
343 5, /* GP2FP */
344 5, /* FP2GP */
345 2 /* FP2FP */
346 };
347
348 static const struct cpu_regmove_cost exynosm1_regmove_cost =
349 {
350 1, /* GP2GP */
351 /* Avoid the use of slow int<->fp moves for spilling by setting
352 their cost higher than memmov_cost (actual, 4 and 9). */
353 9, /* GP2FP */
354 9, /* FP2GP */
355 1 /* FP2FP */
356 };
357
358 static const struct cpu_regmove_cost thunderx_regmove_cost =
359 {
360 2, /* GP2GP */
361 2, /* GP2FP */
362 6, /* FP2GP */
363 4 /* FP2FP */
364 };
365
366 static const struct cpu_regmove_cost xgene1_regmove_cost =
367 {
368 1, /* GP2GP */
369 /* Avoid the use of slow int<->fp moves for spilling by setting
370 their cost higher than memmov_cost. */
371 8, /* GP2FP */
372 8, /* FP2GP */
373 2 /* FP2FP */
374 };
375
376 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
377 {
378 2, /* GP2GP */
379 /* Avoid the use of int<->fp moves for spilling. */
380 6, /* GP2FP */
381 6, /* FP2GP */
382 4 /* FP2FP */
383 };
384
385 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
386 {
387 1, /* GP2GP */
388 /* Avoid the use of int<->fp moves for spilling. */
389 8, /* GP2FP */
390 8, /* FP2GP */
391 4 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost tsv110_regmove_cost =
395 {
396 1, /* GP2GP */
397 /* Avoid the use of slow int<->fp moves for spilling by setting
398 their cost higher than memmov_cost. */
399 2, /* GP2FP */
400 3, /* FP2GP */
401 2 /* FP2FP */
402 };
403
404 /* Generic costs for vector insn classes. */
405 static const struct cpu_vector_cost generic_vector_cost =
406 {
407 1, /* scalar_int_stmt_cost */
408 1, /* scalar_fp_stmt_cost */
409 1, /* scalar_load_cost */
410 1, /* scalar_store_cost */
411 1, /* vec_int_stmt_cost */
412 1, /* vec_fp_stmt_cost */
413 2, /* vec_permute_cost */
414 1, /* vec_to_scalar_cost */
415 1, /* scalar_to_vec_cost */
416 1, /* vec_align_load_cost */
417 1, /* vec_unalign_load_cost */
418 1, /* vec_unalign_store_cost */
419 1, /* vec_store_cost */
420 3, /* cond_taken_branch_cost */
421 1 /* cond_not_taken_branch_cost */
422 };
423
424 /* QDF24XX costs for vector insn classes. */
425 static const struct cpu_vector_cost qdf24xx_vector_cost =
426 {
427 1, /* scalar_int_stmt_cost */
428 1, /* scalar_fp_stmt_cost */
429 1, /* scalar_load_cost */
430 1, /* scalar_store_cost */
431 1, /* vec_int_stmt_cost */
432 3, /* vec_fp_stmt_cost */
433 2, /* vec_permute_cost */
434 1, /* vec_to_scalar_cost */
435 1, /* scalar_to_vec_cost */
436 1, /* vec_align_load_cost */
437 1, /* vec_unalign_load_cost */
438 1, /* vec_unalign_store_cost */
439 1, /* vec_store_cost */
440 3, /* cond_taken_branch_cost */
441 1 /* cond_not_taken_branch_cost */
442 };
443
444 /* ThunderX costs for vector insn classes. */
445 static const struct cpu_vector_cost thunderx_vector_cost =
446 {
447 1, /* scalar_int_stmt_cost */
448 1, /* scalar_fp_stmt_cost */
449 3, /* scalar_load_cost */
450 1, /* scalar_store_cost */
451 4, /* vec_int_stmt_cost */
452 1, /* vec_fp_stmt_cost */
453 4, /* vec_permute_cost */
454 2, /* vec_to_scalar_cost */
455 2, /* scalar_to_vec_cost */
456 3, /* vec_align_load_cost */
457 5, /* vec_unalign_load_cost */
458 5, /* vec_unalign_store_cost */
459 1, /* vec_store_cost */
460 3, /* cond_taken_branch_cost */
461 3 /* cond_not_taken_branch_cost */
462 };
463
464 static const struct cpu_vector_cost tsv110_vector_cost =
465 {
466 1, /* scalar_int_stmt_cost */
467 1, /* scalar_fp_stmt_cost */
468 5, /* scalar_load_cost */
469 1, /* scalar_store_cost */
470 2, /* vec_int_stmt_cost */
471 2, /* vec_fp_stmt_cost */
472 2, /* vec_permute_cost */
473 3, /* vec_to_scalar_cost */
474 2, /* scalar_to_vec_cost */
475 5, /* vec_align_load_cost */
476 5, /* vec_unalign_load_cost */
477 1, /* vec_unalign_store_cost */
478 1, /* vec_store_cost */
479 1, /* cond_taken_branch_cost */
480 1 /* cond_not_taken_branch_cost */
481 };
482
483 /* Generic costs for vector insn classes. */
484 static const struct cpu_vector_cost cortexa57_vector_cost =
485 {
486 1, /* scalar_int_stmt_cost */
487 1, /* scalar_fp_stmt_cost */
488 4, /* scalar_load_cost */
489 1, /* scalar_store_cost */
490 2, /* vec_int_stmt_cost */
491 2, /* vec_fp_stmt_cost */
492 3, /* vec_permute_cost */
493 8, /* vec_to_scalar_cost */
494 8, /* scalar_to_vec_cost */
495 4, /* vec_align_load_cost */
496 4, /* vec_unalign_load_cost */
497 1, /* vec_unalign_store_cost */
498 1, /* vec_store_cost */
499 1, /* cond_taken_branch_cost */
500 1 /* cond_not_taken_branch_cost */
501 };
502
503 static const struct cpu_vector_cost exynosm1_vector_cost =
504 {
505 1, /* scalar_int_stmt_cost */
506 1, /* scalar_fp_stmt_cost */
507 5, /* scalar_load_cost */
508 1, /* scalar_store_cost */
509 3, /* vec_int_stmt_cost */
510 3, /* vec_fp_stmt_cost */
511 3, /* vec_permute_cost */
512 3, /* vec_to_scalar_cost */
513 3, /* scalar_to_vec_cost */
514 5, /* vec_align_load_cost */
515 5, /* vec_unalign_load_cost */
516 1, /* vec_unalign_store_cost */
517 1, /* vec_store_cost */
518 1, /* cond_taken_branch_cost */
519 1 /* cond_not_taken_branch_cost */
520 };
521
522 /* Generic costs for vector insn classes. */
523 static const struct cpu_vector_cost xgene1_vector_cost =
524 {
525 1, /* scalar_int_stmt_cost */
526 1, /* scalar_fp_stmt_cost */
527 5, /* scalar_load_cost */
528 1, /* scalar_store_cost */
529 2, /* vec_int_stmt_cost */
530 2, /* vec_fp_stmt_cost */
531 2, /* vec_permute_cost */
532 4, /* vec_to_scalar_cost */
533 4, /* scalar_to_vec_cost */
534 10, /* vec_align_load_cost */
535 10, /* vec_unalign_load_cost */
536 2, /* vec_unalign_store_cost */
537 2, /* vec_store_cost */
538 2, /* cond_taken_branch_cost */
539 1 /* cond_not_taken_branch_cost */
540 };
541
542 /* Costs for vector insn classes for Vulcan. */
543 static const struct cpu_vector_cost thunderx2t99_vector_cost =
544 {
545 1, /* scalar_int_stmt_cost */
546 6, /* scalar_fp_stmt_cost */
547 4, /* scalar_load_cost */
548 1, /* scalar_store_cost */
549 5, /* vec_int_stmt_cost */
550 6, /* vec_fp_stmt_cost */
551 3, /* vec_permute_cost */
552 6, /* vec_to_scalar_cost */
553 5, /* scalar_to_vec_cost */
554 8, /* vec_align_load_cost */
555 8, /* vec_unalign_load_cost */
556 4, /* vec_unalign_store_cost */
557 4, /* vec_store_cost */
558 2, /* cond_taken_branch_cost */
559 1 /* cond_not_taken_branch_cost */
560 };
561
562 /* Generic costs for branch instructions. */
563 static const struct cpu_branch_cost generic_branch_cost =
564 {
565 1, /* Predictable. */
566 3 /* Unpredictable. */
567 };
568
569 /* Generic approximation modes. */
570 static const cpu_approx_modes generic_approx_modes =
571 {
572 AARCH64_APPROX_NONE, /* division */
573 AARCH64_APPROX_NONE, /* sqrt */
574 AARCH64_APPROX_NONE /* recip_sqrt */
575 };
576
577 /* Approximation modes for Exynos M1. */
578 static const cpu_approx_modes exynosm1_approx_modes =
579 {
580 AARCH64_APPROX_NONE, /* division */
581 AARCH64_APPROX_ALL, /* sqrt */
582 AARCH64_APPROX_ALL /* recip_sqrt */
583 };
584
585 /* Approximation modes for X-Gene 1. */
586 static const cpu_approx_modes xgene1_approx_modes =
587 {
588 AARCH64_APPROX_NONE, /* division */
589 AARCH64_APPROX_NONE, /* sqrt */
590 AARCH64_APPROX_ALL /* recip_sqrt */
591 };
592
593 /* Generic prefetch settings (which disable prefetch). */
594 static const cpu_prefetch_tune generic_prefetch_tune =
595 {
596 0, /* num_slots */
597 -1, /* l1_cache_size */
598 -1, /* l1_cache_line_size */
599 -1, /* l2_cache_size */
600 true, /* prefetch_dynamic_strides */
601 -1, /* minimum_stride */
602 -1 /* default_opt_level */
603 };
604
605 static const cpu_prefetch_tune exynosm1_prefetch_tune =
606 {
607 0, /* num_slots */
608 -1, /* l1_cache_size */
609 64, /* l1_cache_line_size */
610 -1, /* l2_cache_size */
611 true, /* prefetch_dynamic_strides */
612 -1, /* minimum_stride */
613 -1 /* default_opt_level */
614 };
615
616 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
617 {
618 4, /* num_slots */
619 32, /* l1_cache_size */
620 64, /* l1_cache_line_size */
621 512, /* l2_cache_size */
622 false, /* prefetch_dynamic_strides */
623 2048, /* minimum_stride */
624 3 /* default_opt_level */
625 };
626
627 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
628 {
629 8, /* num_slots */
630 32, /* l1_cache_size */
631 128, /* l1_cache_line_size */
632 16*1024, /* l2_cache_size */
633 true, /* prefetch_dynamic_strides */
634 -1, /* minimum_stride */
635 3 /* default_opt_level */
636 };
637
638 static const cpu_prefetch_tune thunderx_prefetch_tune =
639 {
640 8, /* num_slots */
641 32, /* l1_cache_size */
642 128, /* l1_cache_line_size */
643 -1, /* l2_cache_size */
644 true, /* prefetch_dynamic_strides */
645 -1, /* minimum_stride */
646 -1 /* default_opt_level */
647 };
648
649 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
650 {
651 8, /* num_slots */
652 32, /* l1_cache_size */
653 64, /* l1_cache_line_size */
654 256, /* l2_cache_size */
655 true, /* prefetch_dynamic_strides */
656 -1, /* minimum_stride */
657 -1 /* default_opt_level */
658 };
659
660 static const cpu_prefetch_tune tsv110_prefetch_tune =
661 {
662 0, /* num_slots */
663 64, /* l1_cache_size */
664 64, /* l1_cache_line_size */
665 512, /* l2_cache_size */
666 true, /* prefetch_dynamic_strides */
667 -1, /* minimum_stride */
668 -1 /* default_opt_level */
669 };
670
671 static const cpu_prefetch_tune xgene1_prefetch_tune =
672 {
673 8, /* num_slots */
674 32, /* l1_cache_size */
675 64, /* l1_cache_line_size */
676 256, /* l2_cache_size */
677 true, /* prefetch_dynamic_strides */
678 -1, /* minimum_stride */
679 -1 /* default_opt_level */
680 };
681
682 static const struct tune_params generic_tunings =
683 {
684 &cortexa57_extra_costs,
685 &generic_addrcost_table,
686 &generic_regmove_cost,
687 &generic_vector_cost,
688 &generic_branch_cost,
689 &generic_approx_modes,
690 SVE_NOT_IMPLEMENTED, /* sve_width */
691 4, /* memmov_cost */
692 2, /* issue_rate */
693 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
694 "8", /* function_align. */
695 "4", /* jump_align. */
696 "8", /* loop_align. */
697 2, /* int_reassoc_width. */
698 4, /* fp_reassoc_width. */
699 1, /* vec_reassoc_width. */
700 2, /* min_div_recip_mul_sf. */
701 2, /* min_div_recip_mul_df. */
702 0, /* max_case_values. */
703 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
704 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
705 &generic_prefetch_tune
706 };
707
708 static const struct tune_params cortexa35_tunings =
709 {
710 &cortexa53_extra_costs,
711 &generic_addrcost_table,
712 &cortexa53_regmove_cost,
713 &generic_vector_cost,
714 &generic_branch_cost,
715 &generic_approx_modes,
716 SVE_NOT_IMPLEMENTED, /* sve_width */
717 4, /* memmov_cost */
718 1, /* issue_rate */
719 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
720 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
721 "16", /* function_align. */
722 "4", /* jump_align. */
723 "8", /* loop_align. */
724 2, /* int_reassoc_width. */
725 4, /* fp_reassoc_width. */
726 1, /* vec_reassoc_width. */
727 2, /* min_div_recip_mul_sf. */
728 2, /* min_div_recip_mul_df. */
729 0, /* max_case_values. */
730 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
731 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
732 &generic_prefetch_tune
733 };
734
735 static const struct tune_params cortexa53_tunings =
736 {
737 &cortexa53_extra_costs,
738 &generic_addrcost_table,
739 &cortexa53_regmove_cost,
740 &generic_vector_cost,
741 &generic_branch_cost,
742 &generic_approx_modes,
743 SVE_NOT_IMPLEMENTED, /* sve_width */
744 4, /* memmov_cost */
745 2, /* issue_rate */
746 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
747 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
748 "16", /* function_align. */
749 "4", /* jump_align. */
750 "8", /* loop_align. */
751 2, /* int_reassoc_width. */
752 4, /* fp_reassoc_width. */
753 1, /* vec_reassoc_width. */
754 2, /* min_div_recip_mul_sf. */
755 2, /* min_div_recip_mul_df. */
756 0, /* max_case_values. */
757 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
758 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
759 &generic_prefetch_tune
760 };
761
762 static const struct tune_params cortexa57_tunings =
763 {
764 &cortexa57_extra_costs,
765 &generic_addrcost_table,
766 &cortexa57_regmove_cost,
767 &cortexa57_vector_cost,
768 &generic_branch_cost,
769 &generic_approx_modes,
770 SVE_NOT_IMPLEMENTED, /* sve_width */
771 4, /* memmov_cost */
772 3, /* issue_rate */
773 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
774 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
775 "16", /* function_align. */
776 "4", /* jump_align. */
777 "8", /* loop_align. */
778 2, /* int_reassoc_width. */
779 4, /* fp_reassoc_width. */
780 1, /* vec_reassoc_width. */
781 2, /* min_div_recip_mul_sf. */
782 2, /* min_div_recip_mul_df. */
783 0, /* max_case_values. */
784 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
785 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
786 &generic_prefetch_tune
787 };
788
789 static const struct tune_params cortexa72_tunings =
790 {
791 &cortexa57_extra_costs,
792 &generic_addrcost_table,
793 &cortexa57_regmove_cost,
794 &cortexa57_vector_cost,
795 &generic_branch_cost,
796 &generic_approx_modes,
797 SVE_NOT_IMPLEMENTED, /* sve_width */
798 4, /* memmov_cost */
799 3, /* issue_rate */
800 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
801 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
802 "16", /* function_align. */
803 "4", /* jump_align. */
804 "8", /* loop_align. */
805 2, /* int_reassoc_width. */
806 4, /* fp_reassoc_width. */
807 1, /* vec_reassoc_width. */
808 2, /* min_div_recip_mul_sf. */
809 2, /* min_div_recip_mul_df. */
810 0, /* max_case_values. */
811 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
812 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
813 &generic_prefetch_tune
814 };
815
816 static const struct tune_params cortexa73_tunings =
817 {
818 &cortexa57_extra_costs,
819 &generic_addrcost_table,
820 &cortexa57_regmove_cost,
821 &cortexa57_vector_cost,
822 &generic_branch_cost,
823 &generic_approx_modes,
824 SVE_NOT_IMPLEMENTED, /* sve_width */
825 4, /* memmov_cost. */
826 2, /* issue_rate. */
827 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
828 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
829 "16", /* function_align. */
830 "4", /* jump_align. */
831 "8", /* loop_align. */
832 2, /* int_reassoc_width. */
833 4, /* fp_reassoc_width. */
834 1, /* vec_reassoc_width. */
835 2, /* min_div_recip_mul_sf. */
836 2, /* min_div_recip_mul_df. */
837 0, /* max_case_values. */
838 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
839 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
840 &generic_prefetch_tune
841 };
842
843
844
845 static const struct tune_params exynosm1_tunings =
846 {
847 &exynosm1_extra_costs,
848 &exynosm1_addrcost_table,
849 &exynosm1_regmove_cost,
850 &exynosm1_vector_cost,
851 &generic_branch_cost,
852 &exynosm1_approx_modes,
853 SVE_NOT_IMPLEMENTED, /* sve_width */
854 4, /* memmov_cost */
855 3, /* issue_rate */
856 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
857 "4", /* function_align. */
858 "4", /* jump_align. */
859 "4", /* loop_align. */
860 2, /* int_reassoc_width. */
861 4, /* fp_reassoc_width. */
862 1, /* vec_reassoc_width. */
863 2, /* min_div_recip_mul_sf. */
864 2, /* min_div_recip_mul_df. */
865 48, /* max_case_values. */
866 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
867 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
868 &exynosm1_prefetch_tune
869 };
870
871 static const struct tune_params thunderxt88_tunings =
872 {
873 &thunderx_extra_costs,
874 &generic_addrcost_table,
875 &thunderx_regmove_cost,
876 &thunderx_vector_cost,
877 &generic_branch_cost,
878 &generic_approx_modes,
879 SVE_NOT_IMPLEMENTED, /* sve_width */
880 6, /* memmov_cost */
881 2, /* issue_rate */
882 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
883 "8", /* function_align. */
884 "8", /* jump_align. */
885 "8", /* loop_align. */
886 2, /* int_reassoc_width. */
887 4, /* fp_reassoc_width. */
888 1, /* vec_reassoc_width. */
889 2, /* min_div_recip_mul_sf. */
890 2, /* min_div_recip_mul_df. */
891 0, /* max_case_values. */
892 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
893 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
894 &thunderxt88_prefetch_tune
895 };
896
897 static const struct tune_params thunderx_tunings =
898 {
899 &thunderx_extra_costs,
900 &generic_addrcost_table,
901 &thunderx_regmove_cost,
902 &thunderx_vector_cost,
903 &generic_branch_cost,
904 &generic_approx_modes,
905 SVE_NOT_IMPLEMENTED, /* sve_width */
906 6, /* memmov_cost */
907 2, /* issue_rate */
908 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
909 "8", /* function_align. */
910 "8", /* jump_align. */
911 "8", /* loop_align. */
912 2, /* int_reassoc_width. */
913 4, /* fp_reassoc_width. */
914 1, /* vec_reassoc_width. */
915 2, /* min_div_recip_mul_sf. */
916 2, /* min_div_recip_mul_df. */
917 0, /* max_case_values. */
918 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
919 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
920 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
921 &thunderx_prefetch_tune
922 };
923
924 static const struct tune_params tsv110_tunings =
925 {
926 &tsv110_extra_costs,
927 &tsv110_addrcost_table,
928 &tsv110_regmove_cost,
929 &tsv110_vector_cost,
930 &generic_branch_cost,
931 &generic_approx_modes,
932 SVE_NOT_IMPLEMENTED, /* sve_width */
933 4, /* memmov_cost */
934 4, /* issue_rate */
935 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
936 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
937 "16", /* function_align. */
938 "4", /* jump_align. */
939 "8", /* loop_align. */
940 2, /* int_reassoc_width. */
941 4, /* fp_reassoc_width. */
942 1, /* vec_reassoc_width. */
943 2, /* min_div_recip_mul_sf. */
944 2, /* min_div_recip_mul_df. */
945 0, /* max_case_values. */
946 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
947 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
948 &tsv110_prefetch_tune
949 };
950
951 static const struct tune_params xgene1_tunings =
952 {
953 &xgene1_extra_costs,
954 &xgene1_addrcost_table,
955 &xgene1_regmove_cost,
956 &xgene1_vector_cost,
957 &generic_branch_cost,
958 &xgene1_approx_modes,
959 SVE_NOT_IMPLEMENTED, /* sve_width */
960 6, /* memmov_cost */
961 4, /* issue_rate */
962 AARCH64_FUSE_NOTHING, /* fusible_ops */
963 "16", /* function_align. */
964 "16", /* jump_align. */
965 "16", /* loop_align. */
966 2, /* int_reassoc_width. */
967 4, /* fp_reassoc_width. */
968 1, /* vec_reassoc_width. */
969 2, /* min_div_recip_mul_sf. */
970 2, /* min_div_recip_mul_df. */
971 17, /* max_case_values. */
972 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
973 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
974 &xgene1_prefetch_tune
975 };
976
977 static const struct tune_params emag_tunings =
978 {
979 &xgene1_extra_costs,
980 &xgene1_addrcost_table,
981 &xgene1_regmove_cost,
982 &xgene1_vector_cost,
983 &generic_branch_cost,
984 &xgene1_approx_modes,
985 SVE_NOT_IMPLEMENTED,
986 6, /* memmov_cost */
987 4, /* issue_rate */
988 AARCH64_FUSE_NOTHING, /* fusible_ops */
989 "16", /* function_align. */
990 "16", /* jump_align. */
991 "16", /* loop_align. */
992 2, /* int_reassoc_width. */
993 4, /* fp_reassoc_width. */
994 1, /* vec_reassoc_width. */
995 2, /* min_div_recip_mul_sf. */
996 2, /* min_div_recip_mul_df. */
997 17, /* max_case_values. */
998 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
999 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1000 &xgene1_prefetch_tune
1001 };
1002
1003 static const struct tune_params qdf24xx_tunings =
1004 {
1005 &qdf24xx_extra_costs,
1006 &qdf24xx_addrcost_table,
1007 &qdf24xx_regmove_cost,
1008 &qdf24xx_vector_cost,
1009 &generic_branch_cost,
1010 &generic_approx_modes,
1011 SVE_NOT_IMPLEMENTED, /* sve_width */
1012 4, /* memmov_cost */
1013 4, /* issue_rate */
1014 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1015 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
1016 "16", /* function_align. */
1017 "8", /* jump_align. */
1018 "16", /* loop_align. */
1019 2, /* int_reassoc_width. */
1020 4, /* fp_reassoc_width. */
1021 1, /* vec_reassoc_width. */
1022 2, /* min_div_recip_mul_sf. */
1023 2, /* min_div_recip_mul_df. */
1024 0, /* max_case_values. */
1025 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1026 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1027 &qdf24xx_prefetch_tune
1028 };
1029
1030 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1031 for now. */
1032 static const struct tune_params saphira_tunings =
1033 {
1034 &generic_extra_costs,
1035 &generic_addrcost_table,
1036 &generic_regmove_cost,
1037 &generic_vector_cost,
1038 &generic_branch_cost,
1039 &generic_approx_modes,
1040 SVE_NOT_IMPLEMENTED, /* sve_width */
1041 4, /* memmov_cost */
1042 4, /* issue_rate */
1043 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1044 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
1045 "16", /* function_align. */
1046 "8", /* jump_align. */
1047 "16", /* loop_align. */
1048 2, /* int_reassoc_width. */
1049 4, /* fp_reassoc_width. */
1050 1, /* vec_reassoc_width. */
1051 2, /* min_div_recip_mul_sf. */
1052 2, /* min_div_recip_mul_df. */
1053 0, /* max_case_values. */
1054 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1055 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1056 &generic_prefetch_tune
1057 };
1058
1059 static const struct tune_params thunderx2t99_tunings =
1060 {
1061 &thunderx2t99_extra_costs,
1062 &thunderx2t99_addrcost_table,
1063 &thunderx2t99_regmove_cost,
1064 &thunderx2t99_vector_cost,
1065 &generic_branch_cost,
1066 &generic_approx_modes,
1067 SVE_NOT_IMPLEMENTED, /* sve_width */
1068 4, /* memmov_cost. */
1069 4, /* issue_rate. */
1070 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1071 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1072 "16", /* function_align. */
1073 "8", /* jump_align. */
1074 "16", /* loop_align. */
1075 3, /* int_reassoc_width. */
1076 2, /* fp_reassoc_width. */
1077 2, /* vec_reassoc_width. */
1078 2, /* min_div_recip_mul_sf. */
1079 2, /* min_div_recip_mul_df. */
1080 0, /* max_case_values. */
1081 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1082 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1083 &thunderx2t99_prefetch_tune
1084 };
1085
1086 /* Support for fine-grained override of the tuning structures. */
1087 struct aarch64_tuning_override_function
1088 {
1089 const char* name;
1090 void (*parse_override)(const char*, struct tune_params*);
1091 };
1092
1093 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1094 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1095 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1096
1097 static const struct aarch64_tuning_override_function
1098 aarch64_tuning_override_functions[] =
1099 {
1100 { "fuse", aarch64_parse_fuse_string },
1101 { "tune", aarch64_parse_tune_string },
1102 { "sve_width", aarch64_parse_sve_width_string },
1103 { NULL, NULL }
1104 };
1105
1106 /* A processor implementing AArch64. */
1107 struct processor
1108 {
1109 const char *const name;
1110 enum aarch64_processor ident;
1111 enum aarch64_processor sched_core;
1112 enum aarch64_arch arch;
1113 unsigned architecture_version;
1114 const unsigned long flags;
1115 const struct tune_params *const tune;
1116 };
1117
1118 /* Architectures implementing AArch64. */
1119 static const struct processor all_architectures[] =
1120 {
1121 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1122 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1123 #include "aarch64-arches.def"
1124 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1125 };
1126
1127 /* Processor cores implementing AArch64. */
1128 static const struct processor all_cores[] =
1129 {
1130 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1131 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1132 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1133 FLAGS, &COSTS##_tunings},
1134 #include "aarch64-cores.def"
1135 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1136 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1137 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1138 };
1139
1140
1141 /* Target specification. These are populated by the -march, -mtune, -mcpu
1142 handling code or by target attributes. */
1143 static const struct processor *selected_arch;
1144 static const struct processor *selected_cpu;
1145 static const struct processor *selected_tune;
1146
1147 /* The current tuning set. */
1148 struct tune_params aarch64_tune_params = generic_tunings;
1149
1150 /* Table of machine attributes. */
1151 static const struct attribute_spec aarch64_attribute_table[] =
1152 {
1153 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1154 affects_type_identity, handler, exclude } */
1155 { "aarch64_vector_pcs", 0, 0, false, true, true, false, NULL, NULL },
1156 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1157 };
1158
1159 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1160
1161 /* An ISA extension in the co-processor and main instruction set space. */
1162 struct aarch64_option_extension
1163 {
1164 const char *const name;
1165 const unsigned long flags_on;
1166 const unsigned long flags_off;
1167 };
1168
1169 typedef enum aarch64_cond_code
1170 {
1171 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1172 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1173 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1174 }
1175 aarch64_cc;
1176
1177 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1178
1179 struct aarch64_branch_protect_type
1180 {
1181 /* The type's name that the user passes to the branch-protection option
1182 string. */
1183 const char* name;
1184 /* Function to handle the protection type and set global variables.
1185 First argument is the string token corresponding with this type and the
1186 second argument is the next token in the option string.
1187 Return values:
1188 * AARCH64_PARSE_OK: Handling was sucessful.
1189 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1190 should print an error.
1191 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1192 own error. */
1193 enum aarch64_parse_opt_result (*handler)(char*, char*);
1194 /* A list of types that can follow this type in the option string. */
1195 const aarch64_branch_protect_type* subtypes;
1196 unsigned int num_subtypes;
1197 };
1198
1199 static enum aarch64_parse_opt_result
1200 aarch64_handle_no_branch_protection (char* str, char* rest)
1201 {
1202 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1203 aarch64_enable_bti = 0;
1204 if (rest)
1205 {
1206 error ("unexpected %<%s%> after %<%s%>", rest, str);
1207 return AARCH64_PARSE_INVALID_FEATURE;
1208 }
1209 return AARCH64_PARSE_OK;
1210 }
1211
1212 static enum aarch64_parse_opt_result
1213 aarch64_handle_standard_branch_protection (char* str, char* rest)
1214 {
1215 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1216 aarch64_enable_bti = 1;
1217 if (rest)
1218 {
1219 error ("unexpected %<%s%> after %<%s%>", rest, str);
1220 return AARCH64_PARSE_INVALID_FEATURE;
1221 }
1222 return AARCH64_PARSE_OK;
1223 }
1224
1225 static enum aarch64_parse_opt_result
1226 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1227 char* rest ATTRIBUTE_UNUSED)
1228 {
1229 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1230 return AARCH64_PARSE_OK;
1231 }
1232
1233 static enum aarch64_parse_opt_result
1234 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1235 char* rest ATTRIBUTE_UNUSED)
1236 {
1237 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1238 return AARCH64_PARSE_OK;
1239 }
1240
1241 static enum aarch64_parse_opt_result
1242 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1243 char* rest ATTRIBUTE_UNUSED)
1244 {
1245 aarch64_enable_bti = 1;
1246 return AARCH64_PARSE_OK;
1247 }
1248
1249 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1250 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1251 { NULL, NULL, NULL, 0 }
1252 };
1253
1254 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1255 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1256 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1257 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1258 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1259 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1260 { NULL, NULL, NULL, 0 }
1261 };
1262
1263 /* The condition codes of the processor, and the inverse function. */
1264 static const char * const aarch64_condition_codes[] =
1265 {
1266 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1267 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1268 };
1269
1270 /* Generate code to enable conditional branches in functions over 1 MiB. */
1271 const char *
1272 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1273 const char * branch_format)
1274 {
1275 rtx_code_label * tmp_label = gen_label_rtx ();
1276 char label_buf[256];
1277 char buffer[128];
1278 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1279 CODE_LABEL_NUMBER (tmp_label));
1280 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1281 rtx dest_label = operands[pos_label];
1282 operands[pos_label] = tmp_label;
1283
1284 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1285 output_asm_insn (buffer, operands);
1286
1287 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1288 operands[pos_label] = dest_label;
1289 output_asm_insn (buffer, operands);
1290 return "";
1291 }
1292
1293 void
1294 aarch64_err_no_fpadvsimd (machine_mode mode)
1295 {
1296 if (TARGET_GENERAL_REGS_ONLY)
1297 if (FLOAT_MODE_P (mode))
1298 error ("%qs is incompatible with the use of floating-point types",
1299 "-mgeneral-regs-only");
1300 else
1301 error ("%qs is incompatible with the use of vector types",
1302 "-mgeneral-regs-only");
1303 else
1304 if (FLOAT_MODE_P (mode))
1305 error ("%qs feature modifier is incompatible with the use of"
1306 " floating-point types", "+nofp");
1307 else
1308 error ("%qs feature modifier is incompatible with the use of"
1309 " vector types", "+nofp");
1310 }
1311
1312 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1313 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1314 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1315 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1316 and GENERAL_REGS is lower than the memory cost (in this case the best class
1317 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1318 cost results in bad allocations with many redundant int<->FP moves which
1319 are expensive on various cores.
1320 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1321 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1322 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1323 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1324 The result of this is that it is no longer inefficient to have a higher
1325 memory move cost than the register move cost.
1326 */
1327
1328 static reg_class_t
1329 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1330 reg_class_t best_class)
1331 {
1332 machine_mode mode;
1333
1334 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1335 || !reg_class_subset_p (FP_REGS, allocno_class))
1336 return allocno_class;
1337
1338 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1339 || !reg_class_subset_p (FP_REGS, best_class))
1340 return best_class;
1341
1342 mode = PSEUDO_REGNO_MODE (regno);
1343 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1344 }
1345
1346 static unsigned int
1347 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1348 {
1349 if (GET_MODE_UNIT_SIZE (mode) == 4)
1350 return aarch64_tune_params.min_div_recip_mul_sf;
1351 return aarch64_tune_params.min_div_recip_mul_df;
1352 }
1353
1354 /* Return the reassociation width of treeop OPC with mode MODE. */
1355 static int
1356 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1357 {
1358 if (VECTOR_MODE_P (mode))
1359 return aarch64_tune_params.vec_reassoc_width;
1360 if (INTEGRAL_MODE_P (mode))
1361 return aarch64_tune_params.int_reassoc_width;
1362 /* Avoid reassociating floating point addition so we emit more FMAs. */
1363 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1364 return aarch64_tune_params.fp_reassoc_width;
1365 return 1;
1366 }
1367
1368 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1369 unsigned
1370 aarch64_dbx_register_number (unsigned regno)
1371 {
1372 if (GP_REGNUM_P (regno))
1373 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1374 else if (regno == SP_REGNUM)
1375 return AARCH64_DWARF_SP;
1376 else if (FP_REGNUM_P (regno))
1377 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1378 else if (PR_REGNUM_P (regno))
1379 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1380 else if (regno == VG_REGNUM)
1381 return AARCH64_DWARF_VG;
1382
1383 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1384 equivalent DWARF register. */
1385 return DWARF_FRAME_REGISTERS;
1386 }
1387
1388 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1389 static bool
1390 aarch64_advsimd_struct_mode_p (machine_mode mode)
1391 {
1392 return (TARGET_SIMD
1393 && (mode == OImode || mode == CImode || mode == XImode));
1394 }
1395
1396 /* Return true if MODE is an SVE predicate mode. */
1397 static bool
1398 aarch64_sve_pred_mode_p (machine_mode mode)
1399 {
1400 return (TARGET_SVE
1401 && (mode == VNx16BImode
1402 || mode == VNx8BImode
1403 || mode == VNx4BImode
1404 || mode == VNx2BImode));
1405 }
1406
1407 /* Three mutually-exclusive flags describing a vector or predicate type. */
1408 const unsigned int VEC_ADVSIMD = 1;
1409 const unsigned int VEC_SVE_DATA = 2;
1410 const unsigned int VEC_SVE_PRED = 4;
1411 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1412 a structure of 2, 3 or 4 vectors. */
1413 const unsigned int VEC_STRUCT = 8;
1414 /* Useful combinations of the above. */
1415 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1416 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1417
1418 /* Return a set of flags describing the vector properties of mode MODE.
1419 Ignore modes that are not supported by the current target. */
1420 static unsigned int
1421 aarch64_classify_vector_mode (machine_mode mode)
1422 {
1423 if (aarch64_advsimd_struct_mode_p (mode))
1424 return VEC_ADVSIMD | VEC_STRUCT;
1425
1426 if (aarch64_sve_pred_mode_p (mode))
1427 return VEC_SVE_PRED;
1428
1429 scalar_mode inner = GET_MODE_INNER (mode);
1430 if (VECTOR_MODE_P (mode)
1431 && (inner == QImode
1432 || inner == HImode
1433 || inner == HFmode
1434 || inner == SImode
1435 || inner == SFmode
1436 || inner == DImode
1437 || inner == DFmode))
1438 {
1439 if (TARGET_SVE)
1440 {
1441 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1442 return VEC_SVE_DATA;
1443 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1444 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1445 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1446 return VEC_SVE_DATA | VEC_STRUCT;
1447 }
1448
1449 /* This includes V1DF but not V1DI (which doesn't exist). */
1450 if (TARGET_SIMD
1451 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1452 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1453 return VEC_ADVSIMD;
1454 }
1455
1456 return 0;
1457 }
1458
1459 /* Return true if MODE is any of the data vector modes, including
1460 structure modes. */
1461 static bool
1462 aarch64_vector_data_mode_p (machine_mode mode)
1463 {
1464 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1465 }
1466
1467 /* Return true if MODE is an SVE data vector mode; either a single vector
1468 or a structure of vectors. */
1469 static bool
1470 aarch64_sve_data_mode_p (machine_mode mode)
1471 {
1472 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1473 }
1474
1475 /* Implement target hook TARGET_ARRAY_MODE. */
1476 static opt_machine_mode
1477 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1478 {
1479 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1480 && IN_RANGE (nelems, 2, 4))
1481 return mode_for_vector (GET_MODE_INNER (mode),
1482 GET_MODE_NUNITS (mode) * nelems);
1483
1484 return opt_machine_mode ();
1485 }
1486
1487 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1488 static bool
1489 aarch64_array_mode_supported_p (machine_mode mode,
1490 unsigned HOST_WIDE_INT nelems)
1491 {
1492 if (TARGET_SIMD
1493 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1494 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1495 && (nelems >= 2 && nelems <= 4))
1496 return true;
1497
1498 return false;
1499 }
1500
1501 /* Return the SVE predicate mode to use for elements that have
1502 ELEM_NBYTES bytes, if such a mode exists. */
1503
1504 opt_machine_mode
1505 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1506 {
1507 if (TARGET_SVE)
1508 {
1509 if (elem_nbytes == 1)
1510 return VNx16BImode;
1511 if (elem_nbytes == 2)
1512 return VNx8BImode;
1513 if (elem_nbytes == 4)
1514 return VNx4BImode;
1515 if (elem_nbytes == 8)
1516 return VNx2BImode;
1517 }
1518 return opt_machine_mode ();
1519 }
1520
1521 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1522
1523 static opt_machine_mode
1524 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1525 {
1526 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1527 {
1528 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1529 machine_mode pred_mode;
1530 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1531 return pred_mode;
1532 }
1533
1534 return default_get_mask_mode (nunits, nbytes);
1535 }
1536
1537 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1538 prefer to use the first arithmetic operand as the else value if
1539 the else value doesn't matter, since that exactly matches the SVE
1540 destructive merging form. For ternary operations we could either
1541 pick the first operand and use FMAD-like instructions or the last
1542 operand and use FMLA-like instructions; the latter seems more
1543 natural. */
1544
1545 static tree
1546 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1547 {
1548 return nops == 3 ? ops[2] : ops[0];
1549 }
1550
1551 /* Implement TARGET_HARD_REGNO_NREGS. */
1552
1553 static unsigned int
1554 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1555 {
1556 /* ??? Logically we should only need to provide a value when
1557 HARD_REGNO_MODE_OK says that the combination is valid,
1558 but at the moment we need to handle all modes. Just ignore
1559 any runtime parts for registers that can't store them. */
1560 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1561 switch (aarch64_regno_regclass (regno))
1562 {
1563 case FP_REGS:
1564 case FP_LO_REGS:
1565 if (aarch64_sve_data_mode_p (mode))
1566 return exact_div (GET_MODE_SIZE (mode),
1567 BYTES_PER_SVE_VECTOR).to_constant ();
1568 return CEIL (lowest_size, UNITS_PER_VREG);
1569 case PR_REGS:
1570 case PR_LO_REGS:
1571 case PR_HI_REGS:
1572 return 1;
1573 default:
1574 return CEIL (lowest_size, UNITS_PER_WORD);
1575 }
1576 gcc_unreachable ();
1577 }
1578
1579 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1580
1581 static bool
1582 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1583 {
1584 if (GET_MODE_CLASS (mode) == MODE_CC)
1585 return regno == CC_REGNUM;
1586
1587 if (regno == VG_REGNUM)
1588 /* This must have the same size as _Unwind_Word. */
1589 return mode == DImode;
1590
1591 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1592 if (vec_flags & VEC_SVE_PRED)
1593 return PR_REGNUM_P (regno);
1594
1595 if (PR_REGNUM_P (regno))
1596 return 0;
1597
1598 if (regno == SP_REGNUM)
1599 /* The purpose of comparing with ptr_mode is to support the
1600 global register variable associated with the stack pointer
1601 register via the syntax of asm ("wsp") in ILP32. */
1602 return mode == Pmode || mode == ptr_mode;
1603
1604 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1605 return mode == Pmode;
1606
1607 if (GP_REGNUM_P (regno))
1608 {
1609 if (known_le (GET_MODE_SIZE (mode), 8))
1610 return true;
1611 else if (known_le (GET_MODE_SIZE (mode), 16))
1612 return (regno & 1) == 0;
1613 }
1614 else if (FP_REGNUM_P (regno))
1615 {
1616 if (vec_flags & VEC_STRUCT)
1617 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1618 else
1619 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1620 }
1621
1622 return false;
1623 }
1624
1625 /* Return true if this is a definition of a vectorized simd function. */
1626
1627 static bool
1628 aarch64_simd_decl_p (tree fndecl)
1629 {
1630 tree fntype;
1631
1632 if (fndecl == NULL)
1633 return false;
1634 fntype = TREE_TYPE (fndecl);
1635 if (fntype == NULL)
1636 return false;
1637
1638 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1639 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1640 return true;
1641
1642 return false;
1643 }
1644
1645 /* Return the mode a register save/restore should use. DImode for integer
1646 registers, DFmode for FP registers in non-SIMD functions (they only save
1647 the bottom half of a 128 bit register), or TFmode for FP registers in
1648 SIMD functions. */
1649
1650 static machine_mode
1651 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1652 {
1653 return GP_REGNUM_P (regno)
1654 ? E_DImode
1655 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1656 }
1657
1658 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1659 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1660 clobbers the top 64 bits when restoring the bottom 64 bits. */
1661
1662 static bool
1663 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1664 {
1665 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1666 }
1667
1668 /* Implement REGMODE_NATURAL_SIZE. */
1669 poly_uint64
1670 aarch64_regmode_natural_size (machine_mode mode)
1671 {
1672 /* The natural size for SVE data modes is one SVE data vector,
1673 and similarly for predicates. We can't independently modify
1674 anything smaller than that. */
1675 /* ??? For now, only do this for variable-width SVE registers.
1676 Doing it for constant-sized registers breaks lower-subreg.c. */
1677 /* ??? And once that's fixed, we should probably have similar
1678 code for Advanced SIMD. */
1679 if (!aarch64_sve_vg.is_constant ())
1680 {
1681 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1682 if (vec_flags & VEC_SVE_PRED)
1683 return BYTES_PER_SVE_PRED;
1684 if (vec_flags & VEC_SVE_DATA)
1685 return BYTES_PER_SVE_VECTOR;
1686 }
1687 return UNITS_PER_WORD;
1688 }
1689
1690 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1691 machine_mode
1692 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1693 machine_mode mode)
1694 {
1695 /* The predicate mode determines which bits are significant and
1696 which are "don't care". Decreasing the number of lanes would
1697 lose data while increasing the number of lanes would make bits
1698 unnecessarily significant. */
1699 if (PR_REGNUM_P (regno))
1700 return mode;
1701 if (known_ge (GET_MODE_SIZE (mode), 4))
1702 return mode;
1703 else
1704 return SImode;
1705 }
1706
1707 /* Return true if I's bits are consecutive ones from the MSB. */
1708 bool
1709 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1710 {
1711 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1712 }
1713
1714 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1715 that strcpy from constants will be faster. */
1716
1717 static HOST_WIDE_INT
1718 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1719 {
1720 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1721 return MAX (align, BITS_PER_WORD);
1722 return align;
1723 }
1724
1725 /* Return true if calls to DECL should be treated as
1726 long-calls (ie called via a register). */
1727 static bool
1728 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1729 {
1730 return false;
1731 }
1732
1733 /* Return true if calls to symbol-ref SYM should be treated as
1734 long-calls (ie called via a register). */
1735 bool
1736 aarch64_is_long_call_p (rtx sym)
1737 {
1738 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1739 }
1740
1741 /* Return true if calls to symbol-ref SYM should not go through
1742 plt stubs. */
1743
1744 bool
1745 aarch64_is_noplt_call_p (rtx sym)
1746 {
1747 const_tree decl = SYMBOL_REF_DECL (sym);
1748
1749 if (flag_pic
1750 && decl
1751 && (!flag_plt
1752 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1753 && !targetm.binds_local_p (decl))
1754 return true;
1755
1756 return false;
1757 }
1758
1759 /* Return true if the offsets to a zero/sign-extract operation
1760 represent an expression that matches an extend operation. The
1761 operands represent the paramters from
1762
1763 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1764 bool
1765 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1766 rtx extract_imm)
1767 {
1768 HOST_WIDE_INT mult_val, extract_val;
1769
1770 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1771 return false;
1772
1773 mult_val = INTVAL (mult_imm);
1774 extract_val = INTVAL (extract_imm);
1775
1776 if (extract_val > 8
1777 && extract_val < GET_MODE_BITSIZE (mode)
1778 && exact_log2 (extract_val & ~7) > 0
1779 && (extract_val & 7) <= 4
1780 && mult_val == (1 << (extract_val & 7)))
1781 return true;
1782
1783 return false;
1784 }
1785
1786 /* Emit an insn that's a simple single-set. Both the operands must be
1787 known to be valid. */
1788 inline static rtx_insn *
1789 emit_set_insn (rtx x, rtx y)
1790 {
1791 return emit_insn (gen_rtx_SET (x, y));
1792 }
1793
1794 /* X and Y are two things to compare using CODE. Emit the compare insn and
1795 return the rtx for register 0 in the proper mode. */
1796 rtx
1797 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1798 {
1799 machine_mode mode = SELECT_CC_MODE (code, x, y);
1800 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1801
1802 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1803 return cc_reg;
1804 }
1805
1806 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1807
1808 static rtx
1809 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1810 machine_mode y_mode)
1811 {
1812 if (y_mode == E_QImode || y_mode == E_HImode)
1813 {
1814 if (CONST_INT_P (y))
1815 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1816 else
1817 {
1818 rtx t, cc_reg;
1819 machine_mode cc_mode;
1820
1821 t = gen_rtx_ZERO_EXTEND (SImode, y);
1822 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1823 cc_mode = CC_SWPmode;
1824 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1825 emit_set_insn (cc_reg, t);
1826 return cc_reg;
1827 }
1828 }
1829
1830 return aarch64_gen_compare_reg (code, x, y);
1831 }
1832
1833 /* Build the SYMBOL_REF for __tls_get_addr. */
1834
1835 static GTY(()) rtx tls_get_addr_libfunc;
1836
1837 rtx
1838 aarch64_tls_get_addr (void)
1839 {
1840 if (!tls_get_addr_libfunc)
1841 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1842 return tls_get_addr_libfunc;
1843 }
1844
1845 /* Return the TLS model to use for ADDR. */
1846
1847 static enum tls_model
1848 tls_symbolic_operand_type (rtx addr)
1849 {
1850 enum tls_model tls_kind = TLS_MODEL_NONE;
1851 if (GET_CODE (addr) == CONST)
1852 {
1853 poly_int64 addend;
1854 rtx sym = strip_offset (addr, &addend);
1855 if (GET_CODE (sym) == SYMBOL_REF)
1856 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1857 }
1858 else if (GET_CODE (addr) == SYMBOL_REF)
1859 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1860
1861 return tls_kind;
1862 }
1863
1864 /* We'll allow lo_sum's in addresses in our legitimate addresses
1865 so that combine would take care of combining addresses where
1866 necessary, but for generation purposes, we'll generate the address
1867 as :
1868 RTL Absolute
1869 tmp = hi (symbol_ref); adrp x1, foo
1870 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1871 nop
1872
1873 PIC TLS
1874 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1875 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1876 bl __tls_get_addr
1877 nop
1878
1879 Load TLS symbol, depending on TLS mechanism and TLS access model.
1880
1881 Global Dynamic - Traditional TLS:
1882 adrp tmp, :tlsgd:imm
1883 add dest, tmp, #:tlsgd_lo12:imm
1884 bl __tls_get_addr
1885
1886 Global Dynamic - TLS Descriptors:
1887 adrp dest, :tlsdesc:imm
1888 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1889 add dest, dest, #:tlsdesc_lo12:imm
1890 blr tmp
1891 mrs tp, tpidr_el0
1892 add dest, dest, tp
1893
1894 Initial Exec:
1895 mrs tp, tpidr_el0
1896 adrp tmp, :gottprel:imm
1897 ldr dest, [tmp, #:gottprel_lo12:imm]
1898 add dest, dest, tp
1899
1900 Local Exec:
1901 mrs tp, tpidr_el0
1902 add t0, tp, #:tprel_hi12:imm, lsl #12
1903 add t0, t0, #:tprel_lo12_nc:imm
1904 */
1905
1906 static void
1907 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1908 enum aarch64_symbol_type type)
1909 {
1910 switch (type)
1911 {
1912 case SYMBOL_SMALL_ABSOLUTE:
1913 {
1914 /* In ILP32, the mode of dest can be either SImode or DImode. */
1915 rtx tmp_reg = dest;
1916 machine_mode mode = GET_MODE (dest);
1917
1918 gcc_assert (mode == Pmode || mode == ptr_mode);
1919
1920 if (can_create_pseudo_p ())
1921 tmp_reg = gen_reg_rtx (mode);
1922
1923 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1924 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1925 return;
1926 }
1927
1928 case SYMBOL_TINY_ABSOLUTE:
1929 emit_insn (gen_rtx_SET (dest, imm));
1930 return;
1931
1932 case SYMBOL_SMALL_GOT_28K:
1933 {
1934 machine_mode mode = GET_MODE (dest);
1935 rtx gp_rtx = pic_offset_table_rtx;
1936 rtx insn;
1937 rtx mem;
1938
1939 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1940 here before rtl expand. Tree IVOPT will generate rtl pattern to
1941 decide rtx costs, in which case pic_offset_table_rtx is not
1942 initialized. For that case no need to generate the first adrp
1943 instruction as the final cost for global variable access is
1944 one instruction. */
1945 if (gp_rtx != NULL)
1946 {
1947 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1948 using the page base as GOT base, the first page may be wasted,
1949 in the worst scenario, there is only 28K space for GOT).
1950
1951 The generate instruction sequence for accessing global variable
1952 is:
1953
1954 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1955
1956 Only one instruction needed. But we must initialize
1957 pic_offset_table_rtx properly. We generate initialize insn for
1958 every global access, and allow CSE to remove all redundant.
1959
1960 The final instruction sequences will look like the following
1961 for multiply global variables access.
1962
1963 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1964
1965 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1966 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1967 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1968 ... */
1969
1970 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1971 crtl->uses_pic_offset_table = 1;
1972 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1973
1974 if (mode != GET_MODE (gp_rtx))
1975 gp_rtx = gen_lowpart (mode, gp_rtx);
1976
1977 }
1978
1979 if (mode == ptr_mode)
1980 {
1981 if (mode == DImode)
1982 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1983 else
1984 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1985
1986 mem = XVECEXP (SET_SRC (insn), 0, 0);
1987 }
1988 else
1989 {
1990 gcc_assert (mode == Pmode);
1991
1992 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1993 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1994 }
1995
1996 /* The operand is expected to be MEM. Whenever the related insn
1997 pattern changed, above code which calculate mem should be
1998 updated. */
1999 gcc_assert (GET_CODE (mem) == MEM);
2000 MEM_READONLY_P (mem) = 1;
2001 MEM_NOTRAP_P (mem) = 1;
2002 emit_insn (insn);
2003 return;
2004 }
2005
2006 case SYMBOL_SMALL_GOT_4G:
2007 {
2008 /* In ILP32, the mode of dest can be either SImode or DImode,
2009 while the got entry is always of SImode size. The mode of
2010 dest depends on how dest is used: if dest is assigned to a
2011 pointer (e.g. in the memory), it has SImode; it may have
2012 DImode if dest is dereferenced to access the memeory.
2013 This is why we have to handle three different ldr_got_small
2014 patterns here (two patterns for ILP32). */
2015
2016 rtx insn;
2017 rtx mem;
2018 rtx tmp_reg = dest;
2019 machine_mode mode = GET_MODE (dest);
2020
2021 if (can_create_pseudo_p ())
2022 tmp_reg = gen_reg_rtx (mode);
2023
2024 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2025 if (mode == ptr_mode)
2026 {
2027 if (mode == DImode)
2028 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2029 else
2030 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2031
2032 mem = XVECEXP (SET_SRC (insn), 0, 0);
2033 }
2034 else
2035 {
2036 gcc_assert (mode == Pmode);
2037
2038 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2039 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2040 }
2041
2042 gcc_assert (GET_CODE (mem) == MEM);
2043 MEM_READONLY_P (mem) = 1;
2044 MEM_NOTRAP_P (mem) = 1;
2045 emit_insn (insn);
2046 return;
2047 }
2048
2049 case SYMBOL_SMALL_TLSGD:
2050 {
2051 rtx_insn *insns;
2052 machine_mode mode = GET_MODE (dest);
2053 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2054
2055 start_sequence ();
2056 if (TARGET_ILP32)
2057 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2058 else
2059 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2060 insns = get_insns ();
2061 end_sequence ();
2062
2063 RTL_CONST_CALL_P (insns) = 1;
2064 emit_libcall_block (insns, dest, result, imm);
2065 return;
2066 }
2067
2068 case SYMBOL_SMALL_TLSDESC:
2069 {
2070 machine_mode mode = GET_MODE (dest);
2071 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2072 rtx tp;
2073
2074 gcc_assert (mode == Pmode || mode == ptr_mode);
2075
2076 /* In ILP32, the got entry is always of SImode size. Unlike
2077 small GOT, the dest is fixed at reg 0. */
2078 if (TARGET_ILP32)
2079 emit_insn (gen_tlsdesc_small_si (imm));
2080 else
2081 emit_insn (gen_tlsdesc_small_di (imm));
2082 tp = aarch64_load_tp (NULL);
2083
2084 if (mode != Pmode)
2085 tp = gen_lowpart (mode, tp);
2086
2087 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2088 if (REG_P (dest))
2089 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2090 return;
2091 }
2092
2093 case SYMBOL_SMALL_TLSIE:
2094 {
2095 /* In ILP32, the mode of dest can be either SImode or DImode,
2096 while the got entry is always of SImode size. The mode of
2097 dest depends on how dest is used: if dest is assigned to a
2098 pointer (e.g. in the memory), it has SImode; it may have
2099 DImode if dest is dereferenced to access the memeory.
2100 This is why we have to handle three different tlsie_small
2101 patterns here (two patterns for ILP32). */
2102 machine_mode mode = GET_MODE (dest);
2103 rtx tmp_reg = gen_reg_rtx (mode);
2104 rtx tp = aarch64_load_tp (NULL);
2105
2106 if (mode == ptr_mode)
2107 {
2108 if (mode == DImode)
2109 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2110 else
2111 {
2112 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2113 tp = gen_lowpart (mode, tp);
2114 }
2115 }
2116 else
2117 {
2118 gcc_assert (mode == Pmode);
2119 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2120 }
2121
2122 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2123 if (REG_P (dest))
2124 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2125 return;
2126 }
2127
2128 case SYMBOL_TLSLE12:
2129 case SYMBOL_TLSLE24:
2130 case SYMBOL_TLSLE32:
2131 case SYMBOL_TLSLE48:
2132 {
2133 machine_mode mode = GET_MODE (dest);
2134 rtx tp = aarch64_load_tp (NULL);
2135
2136 if (mode != Pmode)
2137 tp = gen_lowpart (mode, tp);
2138
2139 switch (type)
2140 {
2141 case SYMBOL_TLSLE12:
2142 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2143 (dest, tp, imm));
2144 break;
2145 case SYMBOL_TLSLE24:
2146 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2147 (dest, tp, imm));
2148 break;
2149 case SYMBOL_TLSLE32:
2150 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2151 (dest, imm));
2152 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2153 (dest, dest, tp));
2154 break;
2155 case SYMBOL_TLSLE48:
2156 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2157 (dest, imm));
2158 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2159 (dest, dest, tp));
2160 break;
2161 default:
2162 gcc_unreachable ();
2163 }
2164
2165 if (REG_P (dest))
2166 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2167 return;
2168 }
2169
2170 case SYMBOL_TINY_GOT:
2171 emit_insn (gen_ldr_got_tiny (dest, imm));
2172 return;
2173
2174 case SYMBOL_TINY_TLSIE:
2175 {
2176 machine_mode mode = GET_MODE (dest);
2177 rtx tp = aarch64_load_tp (NULL);
2178
2179 if (mode == ptr_mode)
2180 {
2181 if (mode == DImode)
2182 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2183 else
2184 {
2185 tp = gen_lowpart (mode, tp);
2186 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2187 }
2188 }
2189 else
2190 {
2191 gcc_assert (mode == Pmode);
2192 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2193 }
2194
2195 if (REG_P (dest))
2196 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2197 return;
2198 }
2199
2200 default:
2201 gcc_unreachable ();
2202 }
2203 }
2204
2205 /* Emit a move from SRC to DEST. Assume that the move expanders can
2206 handle all moves if !can_create_pseudo_p (). The distinction is
2207 important because, unlike emit_move_insn, the move expanders know
2208 how to force Pmode objects into the constant pool even when the
2209 constant pool address is not itself legitimate. */
2210 static rtx
2211 aarch64_emit_move (rtx dest, rtx src)
2212 {
2213 return (can_create_pseudo_p ()
2214 ? emit_move_insn (dest, src)
2215 : emit_move_insn_1 (dest, src));
2216 }
2217
2218 /* Apply UNOPTAB to OP and store the result in DEST. */
2219
2220 static void
2221 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2222 {
2223 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2224 if (dest != tmp)
2225 emit_move_insn (dest, tmp);
2226 }
2227
2228 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2229
2230 static void
2231 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2232 {
2233 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2234 OPTAB_DIRECT);
2235 if (dest != tmp)
2236 emit_move_insn (dest, tmp);
2237 }
2238
2239 /* Split a 128-bit move operation into two 64-bit move operations,
2240 taking care to handle partial overlap of register to register
2241 copies. Special cases are needed when moving between GP regs and
2242 FP regs. SRC can be a register, constant or memory; DST a register
2243 or memory. If either operand is memory it must not have any side
2244 effects. */
2245 void
2246 aarch64_split_128bit_move (rtx dst, rtx src)
2247 {
2248 rtx dst_lo, dst_hi;
2249 rtx src_lo, src_hi;
2250
2251 machine_mode mode = GET_MODE (dst);
2252
2253 gcc_assert (mode == TImode || mode == TFmode);
2254 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2255 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2256
2257 if (REG_P (dst) && REG_P (src))
2258 {
2259 int src_regno = REGNO (src);
2260 int dst_regno = REGNO (dst);
2261
2262 /* Handle FP <-> GP regs. */
2263 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2264 {
2265 src_lo = gen_lowpart (word_mode, src);
2266 src_hi = gen_highpart (word_mode, src);
2267
2268 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2269 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2270 return;
2271 }
2272 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2273 {
2274 dst_lo = gen_lowpart (word_mode, dst);
2275 dst_hi = gen_highpart (word_mode, dst);
2276
2277 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2278 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2279 return;
2280 }
2281 }
2282
2283 dst_lo = gen_lowpart (word_mode, dst);
2284 dst_hi = gen_highpart (word_mode, dst);
2285 src_lo = gen_lowpart (word_mode, src);
2286 src_hi = gen_highpart_mode (word_mode, mode, src);
2287
2288 /* At most one pairing may overlap. */
2289 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2290 {
2291 aarch64_emit_move (dst_hi, src_hi);
2292 aarch64_emit_move (dst_lo, src_lo);
2293 }
2294 else
2295 {
2296 aarch64_emit_move (dst_lo, src_lo);
2297 aarch64_emit_move (dst_hi, src_hi);
2298 }
2299 }
2300
2301 bool
2302 aarch64_split_128bit_move_p (rtx dst, rtx src)
2303 {
2304 return (! REG_P (src)
2305 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2306 }
2307
2308 /* Split a complex SIMD combine. */
2309
2310 void
2311 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2312 {
2313 machine_mode src_mode = GET_MODE (src1);
2314 machine_mode dst_mode = GET_MODE (dst);
2315
2316 gcc_assert (VECTOR_MODE_P (dst_mode));
2317 gcc_assert (register_operand (dst, dst_mode)
2318 && register_operand (src1, src_mode)
2319 && register_operand (src2, src_mode));
2320
2321 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2322 return;
2323 }
2324
2325 /* Split a complex SIMD move. */
2326
2327 void
2328 aarch64_split_simd_move (rtx dst, rtx src)
2329 {
2330 machine_mode src_mode = GET_MODE (src);
2331 machine_mode dst_mode = GET_MODE (dst);
2332
2333 gcc_assert (VECTOR_MODE_P (dst_mode));
2334
2335 if (REG_P (dst) && REG_P (src))
2336 {
2337 gcc_assert (VECTOR_MODE_P (src_mode));
2338 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2339 }
2340 }
2341
2342 bool
2343 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2344 machine_mode ymode, rtx y)
2345 {
2346 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2347 gcc_assert (r != NULL);
2348 return rtx_equal_p (x, r);
2349 }
2350
2351
2352 static rtx
2353 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2354 {
2355 if (can_create_pseudo_p ())
2356 return force_reg (mode, value);
2357 else
2358 {
2359 gcc_assert (x);
2360 aarch64_emit_move (x, value);
2361 return x;
2362 }
2363 }
2364
2365 /* Return true if we can move VALUE into a register using a single
2366 CNT[BHWD] instruction. */
2367
2368 static bool
2369 aarch64_sve_cnt_immediate_p (poly_int64 value)
2370 {
2371 HOST_WIDE_INT factor = value.coeffs[0];
2372 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2373 return (value.coeffs[1] == factor
2374 && IN_RANGE (factor, 2, 16 * 16)
2375 && (factor & 1) == 0
2376 && factor <= 16 * (factor & -factor));
2377 }
2378
2379 /* Likewise for rtx X. */
2380
2381 bool
2382 aarch64_sve_cnt_immediate_p (rtx x)
2383 {
2384 poly_int64 value;
2385 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2386 }
2387
2388 /* Return the asm string for an instruction with a CNT-like vector size
2389 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2390 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2391 first part of the operands template (the part that comes before the
2392 vector size itself). FACTOR is the number of quadwords.
2393 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2394 If it is zero, we can use any element size. */
2395
2396 static char *
2397 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2398 unsigned int factor,
2399 unsigned int nelts_per_vq)
2400 {
2401 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2402
2403 if (nelts_per_vq == 0)
2404 /* There is some overlap in the ranges of the four CNT instructions.
2405 Here we always use the smallest possible element size, so that the
2406 multiplier is 1 whereever possible. */
2407 nelts_per_vq = factor & -factor;
2408 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2409 gcc_assert (IN_RANGE (shift, 1, 4));
2410 char suffix = "dwhb"[shift - 1];
2411
2412 factor >>= shift;
2413 unsigned int written;
2414 if (factor == 1)
2415 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2416 prefix, suffix, operands);
2417 else
2418 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2419 prefix, suffix, operands, factor);
2420 gcc_assert (written < sizeof (buffer));
2421 return buffer;
2422 }
2423
2424 /* Return the asm string for an instruction with a CNT-like vector size
2425 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2426 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2427 first part of the operands template (the part that comes before the
2428 vector size itself). X is the value of the vector size operand,
2429 as a polynomial integer rtx. */
2430
2431 char *
2432 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2433 rtx x)
2434 {
2435 poly_int64 value = rtx_to_poly_int64 (x);
2436 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2437 return aarch64_output_sve_cnt_immediate (prefix, operands,
2438 value.coeffs[1], 0);
2439 }
2440
2441 /* Return true if we can add VALUE to a register using a single ADDVL
2442 or ADDPL instruction. */
2443
2444 static bool
2445 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2446 {
2447 HOST_WIDE_INT factor = value.coeffs[0];
2448 if (factor == 0 || value.coeffs[1] != factor)
2449 return false;
2450 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2451 and a value of 16 is one vector width. */
2452 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2453 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2454 }
2455
2456 /* Likewise for rtx X. */
2457
2458 bool
2459 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2460 {
2461 poly_int64 value;
2462 return (poly_int_rtx_p (x, &value)
2463 && aarch64_sve_addvl_addpl_immediate_p (value));
2464 }
2465
2466 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2467 and storing the result in operand 0. */
2468
2469 char *
2470 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2471 {
2472 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2473 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2474 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2475
2476 /* Use INC or DEC if possible. */
2477 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2478 {
2479 if (aarch64_sve_cnt_immediate_p (offset_value))
2480 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2481 offset_value.coeffs[1], 0);
2482 if (aarch64_sve_cnt_immediate_p (-offset_value))
2483 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2484 -offset_value.coeffs[1], 0);
2485 }
2486
2487 int factor = offset_value.coeffs[1];
2488 if ((factor & 15) == 0)
2489 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2490 else
2491 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2492 return buffer;
2493 }
2494
2495 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2496 instruction. If it is, store the number of elements in each vector
2497 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2498 factor in *FACTOR_OUT (if nonnull). */
2499
2500 bool
2501 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2502 unsigned int *nelts_per_vq_out)
2503 {
2504 rtx elt;
2505 poly_int64 value;
2506
2507 if (!const_vec_duplicate_p (x, &elt)
2508 || !poly_int_rtx_p (elt, &value))
2509 return false;
2510
2511 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2512 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2513 /* There's no vector INCB. */
2514 return false;
2515
2516 HOST_WIDE_INT factor = value.coeffs[0];
2517 if (value.coeffs[1] != factor)
2518 return false;
2519
2520 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2521 if ((factor % nelts_per_vq) != 0
2522 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2523 return false;
2524
2525 if (factor_out)
2526 *factor_out = factor;
2527 if (nelts_per_vq_out)
2528 *nelts_per_vq_out = nelts_per_vq;
2529 return true;
2530 }
2531
2532 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2533 instruction. */
2534
2535 bool
2536 aarch64_sve_inc_dec_immediate_p (rtx x)
2537 {
2538 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2539 }
2540
2541 /* Return the asm template for an SVE vector INC or DEC instruction.
2542 OPERANDS gives the operands before the vector count and X is the
2543 value of the vector count operand itself. */
2544
2545 char *
2546 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2547 {
2548 int factor;
2549 unsigned int nelts_per_vq;
2550 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2551 gcc_unreachable ();
2552 if (factor < 0)
2553 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2554 nelts_per_vq);
2555 else
2556 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2557 nelts_per_vq);
2558 }
2559
2560 static int
2561 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2562 scalar_int_mode mode)
2563 {
2564 int i;
2565 unsigned HOST_WIDE_INT val, val2, mask;
2566 int one_match, zero_match;
2567 int num_insns;
2568
2569 val = INTVAL (imm);
2570
2571 if (aarch64_move_imm (val, mode))
2572 {
2573 if (generate)
2574 emit_insn (gen_rtx_SET (dest, imm));
2575 return 1;
2576 }
2577
2578 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2579 (with XXXX non-zero). In that case check to see if the move can be done in
2580 a smaller mode. */
2581 val2 = val & 0xffffffff;
2582 if (mode == DImode
2583 && aarch64_move_imm (val2, SImode)
2584 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2585 {
2586 if (generate)
2587 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2588
2589 /* Check if we have to emit a second instruction by checking to see
2590 if any of the upper 32 bits of the original DI mode value is set. */
2591 if (val == val2)
2592 return 1;
2593
2594 i = (val >> 48) ? 48 : 32;
2595
2596 if (generate)
2597 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2598 GEN_INT ((val >> i) & 0xffff)));
2599
2600 return 2;
2601 }
2602
2603 if ((val >> 32) == 0 || mode == SImode)
2604 {
2605 if (generate)
2606 {
2607 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2608 if (mode == SImode)
2609 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2610 GEN_INT ((val >> 16) & 0xffff)));
2611 else
2612 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2613 GEN_INT ((val >> 16) & 0xffff)));
2614 }
2615 return 2;
2616 }
2617
2618 /* Remaining cases are all for DImode. */
2619
2620 mask = 0xffff;
2621 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2622 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2623 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2624 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2625
2626 if (zero_match != 2 && one_match != 2)
2627 {
2628 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2629 For a 64-bit bitmask try whether changing 16 bits to all ones or
2630 zeroes creates a valid bitmask. To check any repeated bitmask,
2631 try using 16 bits from the other 32-bit half of val. */
2632
2633 for (i = 0; i < 64; i += 16, mask <<= 16)
2634 {
2635 val2 = val & ~mask;
2636 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2637 break;
2638 val2 = val | mask;
2639 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2640 break;
2641 val2 = val2 & ~mask;
2642 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2643 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2644 break;
2645 }
2646 if (i != 64)
2647 {
2648 if (generate)
2649 {
2650 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2651 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2652 GEN_INT ((val >> i) & 0xffff)));
2653 }
2654 return 2;
2655 }
2656 }
2657
2658 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2659 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2660 otherwise skip zero bits. */
2661
2662 num_insns = 1;
2663 mask = 0xffff;
2664 val2 = one_match > zero_match ? ~val : val;
2665 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2666
2667 if (generate)
2668 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2669 ? (val | ~(mask << i))
2670 : (val & (mask << i)))));
2671 for (i += 16; i < 64; i += 16)
2672 {
2673 if ((val2 & (mask << i)) == 0)
2674 continue;
2675 if (generate)
2676 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2677 GEN_INT ((val >> i) & 0xffff)));
2678 num_insns ++;
2679 }
2680
2681 return num_insns;
2682 }
2683
2684 /* Return whether imm is a 128-bit immediate which is simple enough to
2685 expand inline. */
2686 bool
2687 aarch64_mov128_immediate (rtx imm)
2688 {
2689 if (GET_CODE (imm) == CONST_INT)
2690 return true;
2691
2692 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2693
2694 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2695 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2696
2697 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2698 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2699 }
2700
2701
2702 /* Return the number of temporary registers that aarch64_add_offset_1
2703 would need to add OFFSET to a register. */
2704
2705 static unsigned int
2706 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2707 {
2708 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2709 }
2710
2711 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2712 a non-polynomial OFFSET. MODE is the mode of the addition.
2713 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2714 be set and CFA adjustments added to the generated instructions.
2715
2716 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2717 temporary if register allocation is already complete. This temporary
2718 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2719 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2720 the immediate again.
2721
2722 Since this function may be used to adjust the stack pointer, we must
2723 ensure that it cannot cause transient stack deallocation (for example
2724 by first incrementing SP and then decrementing when adjusting by a
2725 large immediate). */
2726
2727 static void
2728 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2729 rtx src, HOST_WIDE_INT offset, rtx temp1,
2730 bool frame_related_p, bool emit_move_imm)
2731 {
2732 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2733 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2734
2735 HOST_WIDE_INT moffset = abs_hwi (offset);
2736 rtx_insn *insn;
2737
2738 if (!moffset)
2739 {
2740 if (!rtx_equal_p (dest, src))
2741 {
2742 insn = emit_insn (gen_rtx_SET (dest, src));
2743 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2744 }
2745 return;
2746 }
2747
2748 /* Single instruction adjustment. */
2749 if (aarch64_uimm12_shift (moffset))
2750 {
2751 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2752 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2753 return;
2754 }
2755
2756 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2757 and either:
2758
2759 a) the offset cannot be loaded by a 16-bit move or
2760 b) there is no spare register into which we can move it. */
2761 if (moffset < 0x1000000
2762 && ((!temp1 && !can_create_pseudo_p ())
2763 || !aarch64_move_imm (moffset, mode)))
2764 {
2765 HOST_WIDE_INT low_off = moffset & 0xfff;
2766
2767 low_off = offset < 0 ? -low_off : low_off;
2768 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2769 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2770 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2771 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2772 return;
2773 }
2774
2775 /* Emit a move immediate if required and an addition/subtraction. */
2776 if (emit_move_imm)
2777 {
2778 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2779 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2780 }
2781 insn = emit_insn (offset < 0
2782 ? gen_sub3_insn (dest, src, temp1)
2783 : gen_add3_insn (dest, src, temp1));
2784 if (frame_related_p)
2785 {
2786 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2787 rtx adj = plus_constant (mode, src, offset);
2788 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2789 }
2790 }
2791
2792 /* Return the number of temporary registers that aarch64_add_offset
2793 would need to move OFFSET into a register or add OFFSET to a register;
2794 ADD_P is true if we want the latter rather than the former. */
2795
2796 static unsigned int
2797 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2798 {
2799 /* This follows the same structure as aarch64_add_offset. */
2800 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2801 return 0;
2802
2803 unsigned int count = 0;
2804 HOST_WIDE_INT factor = offset.coeffs[1];
2805 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2806 poly_int64 poly_offset (factor, factor);
2807 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2808 /* Need one register for the ADDVL/ADDPL result. */
2809 count += 1;
2810 else if (factor != 0)
2811 {
2812 factor = abs (factor);
2813 if (factor > 16 * (factor & -factor))
2814 /* Need one register for the CNT result and one for the multiplication
2815 factor. If necessary, the second temporary can be reused for the
2816 constant part of the offset. */
2817 return 2;
2818 /* Need one register for the CNT result (which might then
2819 be shifted). */
2820 count += 1;
2821 }
2822 return count + aarch64_add_offset_1_temporaries (constant);
2823 }
2824
2825 /* If X can be represented as a poly_int64, return the number
2826 of temporaries that are required to add it to a register.
2827 Return -1 otherwise. */
2828
2829 int
2830 aarch64_add_offset_temporaries (rtx x)
2831 {
2832 poly_int64 offset;
2833 if (!poly_int_rtx_p (x, &offset))
2834 return -1;
2835 return aarch64_offset_temporaries (true, offset);
2836 }
2837
2838 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2839 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2840 be set and CFA adjustments added to the generated instructions.
2841
2842 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2843 temporary if register allocation is already complete. This temporary
2844 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2845 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2846 false to avoid emitting the immediate again.
2847
2848 TEMP2, if nonnull, is a second temporary register that doesn't
2849 overlap either DEST or REG.
2850
2851 Since this function may be used to adjust the stack pointer, we must
2852 ensure that it cannot cause transient stack deallocation (for example
2853 by first incrementing SP and then decrementing when adjusting by a
2854 large immediate). */
2855
2856 static void
2857 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2858 poly_int64 offset, rtx temp1, rtx temp2,
2859 bool frame_related_p, bool emit_move_imm = true)
2860 {
2861 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2862 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2863 gcc_assert (temp1 == NULL_RTX
2864 || !frame_related_p
2865 || !reg_overlap_mentioned_p (temp1, dest));
2866 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2867
2868 /* Try using ADDVL or ADDPL to add the whole value. */
2869 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2870 {
2871 rtx offset_rtx = gen_int_mode (offset, mode);
2872 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2873 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2874 return;
2875 }
2876
2877 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2878 SVE vector register, over and above the minimum size of 128 bits.
2879 This is equivalent to half the value returned by CNTD with a
2880 vector shape of ALL. */
2881 HOST_WIDE_INT factor = offset.coeffs[1];
2882 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2883
2884 /* Try using ADDVL or ADDPL to add the VG-based part. */
2885 poly_int64 poly_offset (factor, factor);
2886 if (src != const0_rtx
2887 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2888 {
2889 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2890 if (frame_related_p)
2891 {
2892 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2893 RTX_FRAME_RELATED_P (insn) = true;
2894 src = dest;
2895 }
2896 else
2897 {
2898 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2899 src = aarch64_force_temporary (mode, temp1, addr);
2900 temp1 = temp2;
2901 temp2 = NULL_RTX;
2902 }
2903 }
2904 /* Otherwise use a CNT-based sequence. */
2905 else if (factor != 0)
2906 {
2907 /* Use a subtraction if we have a negative factor. */
2908 rtx_code code = PLUS;
2909 if (factor < 0)
2910 {
2911 factor = -factor;
2912 code = MINUS;
2913 }
2914
2915 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2916 into the multiplication. */
2917 rtx val;
2918 int shift = 0;
2919 if (factor & 1)
2920 /* Use a right shift by 1. */
2921 shift = -1;
2922 else
2923 factor /= 2;
2924 HOST_WIDE_INT low_bit = factor & -factor;
2925 if (factor <= 16 * low_bit)
2926 {
2927 if (factor > 16 * 8)
2928 {
2929 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2930 the value with the minimum multiplier and shift it into
2931 position. */
2932 int extra_shift = exact_log2 (low_bit);
2933 shift += extra_shift;
2934 factor >>= extra_shift;
2935 }
2936 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2937 }
2938 else
2939 {
2940 /* Use CNTD, then multiply it by FACTOR. */
2941 val = gen_int_mode (poly_int64 (2, 2), mode);
2942 val = aarch64_force_temporary (mode, temp1, val);
2943
2944 /* Go back to using a negative multiplication factor if we have
2945 no register from which to subtract. */
2946 if (code == MINUS && src == const0_rtx)
2947 {
2948 factor = -factor;
2949 code = PLUS;
2950 }
2951 rtx coeff1 = gen_int_mode (factor, mode);
2952 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2953 val = gen_rtx_MULT (mode, val, coeff1);
2954 }
2955
2956 if (shift > 0)
2957 {
2958 /* Multiply by 1 << SHIFT. */
2959 val = aarch64_force_temporary (mode, temp1, val);
2960 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2961 }
2962 else if (shift == -1)
2963 {
2964 /* Divide by 2. */
2965 val = aarch64_force_temporary (mode, temp1, val);
2966 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2967 }
2968
2969 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2970 if (src != const0_rtx)
2971 {
2972 val = aarch64_force_temporary (mode, temp1, val);
2973 val = gen_rtx_fmt_ee (code, mode, src, val);
2974 }
2975 else if (code == MINUS)
2976 {
2977 val = aarch64_force_temporary (mode, temp1, val);
2978 val = gen_rtx_NEG (mode, val);
2979 }
2980
2981 if (constant == 0 || frame_related_p)
2982 {
2983 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2984 if (frame_related_p)
2985 {
2986 RTX_FRAME_RELATED_P (insn) = true;
2987 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2988 gen_rtx_SET (dest, plus_constant (Pmode, src,
2989 poly_offset)));
2990 }
2991 src = dest;
2992 if (constant == 0)
2993 return;
2994 }
2995 else
2996 {
2997 src = aarch64_force_temporary (mode, temp1, val);
2998 temp1 = temp2;
2999 temp2 = NULL_RTX;
3000 }
3001
3002 emit_move_imm = true;
3003 }
3004
3005 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3006 frame_related_p, emit_move_imm);
3007 }
3008
3009 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3010 than a poly_int64. */
3011
3012 void
3013 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3014 rtx offset_rtx, rtx temp1, rtx temp2)
3015 {
3016 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3017 temp1, temp2, false);
3018 }
3019
3020 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3021 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3022 if TEMP1 already contains abs (DELTA). */
3023
3024 static inline void
3025 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3026 {
3027 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3028 temp1, temp2, true, emit_move_imm);
3029 }
3030
3031 /* Subtract DELTA from the stack pointer, marking the instructions
3032 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3033 if nonnull. */
3034
3035 static inline void
3036 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3037 bool emit_move_imm = true)
3038 {
3039 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3040 temp1, temp2, frame_related_p, emit_move_imm);
3041 }
3042
3043 /* Set DEST to (vec_series BASE STEP). */
3044
3045 static void
3046 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3047 {
3048 machine_mode mode = GET_MODE (dest);
3049 scalar_mode inner = GET_MODE_INNER (mode);
3050
3051 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3052 if (!aarch64_sve_index_immediate_p (base))
3053 base = force_reg (inner, base);
3054 if (!aarch64_sve_index_immediate_p (step))
3055 step = force_reg (inner, step);
3056
3057 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3058 }
3059
3060 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3061 integer of mode INT_MODE. Return true on success. */
3062
3063 static bool
3064 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3065 rtx src)
3066 {
3067 /* If the constant is smaller than 128 bits, we can do the move
3068 using a vector of SRC_MODEs. */
3069 if (src_mode != TImode)
3070 {
3071 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3072 GET_MODE_SIZE (src_mode));
3073 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3074 emit_move_insn (gen_lowpart (dup_mode, dest),
3075 gen_const_vec_duplicate (dup_mode, src));
3076 return true;
3077 }
3078
3079 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3080 src = force_const_mem (src_mode, src);
3081 if (!src)
3082 return false;
3083
3084 /* Make sure that the address is legitimate. */
3085 if (!aarch64_sve_ld1r_operand_p (src))
3086 {
3087 rtx addr = force_reg (Pmode, XEXP (src, 0));
3088 src = replace_equiv_address (src, addr);
3089 }
3090
3091 machine_mode mode = GET_MODE (dest);
3092 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3093 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3094 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3095 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3096 emit_insn (gen_rtx_SET (dest, src));
3097 return true;
3098 }
3099
3100 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3101 isn't a simple duplicate or series. */
3102
3103 static void
3104 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3105 {
3106 machine_mode mode = GET_MODE (src);
3107 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3108 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3109 gcc_assert (npatterns > 1);
3110
3111 if (nelts_per_pattern == 1)
3112 {
3113 /* The constant is a repeating seqeuence of at least two elements,
3114 where the repeating elements occupy no more than 128 bits.
3115 Get an integer representation of the replicated value. */
3116 scalar_int_mode int_mode;
3117 if (BYTES_BIG_ENDIAN)
3118 /* For now, always use LD1RQ to load the value on big-endian
3119 targets, since the handling of smaller integers includes a
3120 subreg that is semantically an element reverse. */
3121 int_mode = TImode;
3122 else
3123 {
3124 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3125 gcc_assert (int_bits <= 128);
3126 int_mode = int_mode_for_size (int_bits, 0).require ();
3127 }
3128 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3129 if (int_value
3130 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3131 return;
3132 }
3133
3134 /* Expand each pattern individually. */
3135 rtx_vector_builder builder;
3136 auto_vec<rtx, 16> vectors (npatterns);
3137 for (unsigned int i = 0; i < npatterns; ++i)
3138 {
3139 builder.new_vector (mode, 1, nelts_per_pattern);
3140 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3141 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3142 vectors.quick_push (force_reg (mode, builder.build ()));
3143 }
3144
3145 /* Use permutes to interleave the separate vectors. */
3146 while (npatterns > 1)
3147 {
3148 npatterns /= 2;
3149 for (unsigned int i = 0; i < npatterns; ++i)
3150 {
3151 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3152 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3153 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3154 vectors[i] = tmp;
3155 }
3156 }
3157 gcc_assert (vectors[0] == dest);
3158 }
3159
3160 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3161 is a pattern that can be used to set DEST to a replicated scalar
3162 element. */
3163
3164 void
3165 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3166 rtx (*gen_vec_duplicate) (rtx, rtx))
3167 {
3168 machine_mode mode = GET_MODE (dest);
3169
3170 /* Check on what type of symbol it is. */
3171 scalar_int_mode int_mode;
3172 if ((GET_CODE (imm) == SYMBOL_REF
3173 || GET_CODE (imm) == LABEL_REF
3174 || GET_CODE (imm) == CONST
3175 || GET_CODE (imm) == CONST_POLY_INT)
3176 && is_a <scalar_int_mode> (mode, &int_mode))
3177 {
3178 rtx mem;
3179 poly_int64 offset;
3180 HOST_WIDE_INT const_offset;
3181 enum aarch64_symbol_type sty;
3182
3183 /* If we have (const (plus symbol offset)), separate out the offset
3184 before we start classifying the symbol. */
3185 rtx base = strip_offset (imm, &offset);
3186
3187 /* We must always add an offset involving VL separately, rather than
3188 folding it into the relocation. */
3189 if (!offset.is_constant (&const_offset))
3190 {
3191 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3192 emit_insn (gen_rtx_SET (dest, imm));
3193 else
3194 {
3195 /* Do arithmetic on 32-bit values if the result is smaller
3196 than that. */
3197 if (partial_subreg_p (int_mode, SImode))
3198 {
3199 /* It is invalid to do symbol calculations in modes
3200 narrower than SImode. */
3201 gcc_assert (base == const0_rtx);
3202 dest = gen_lowpart (SImode, dest);
3203 int_mode = SImode;
3204 }
3205 if (base != const0_rtx)
3206 {
3207 base = aarch64_force_temporary (int_mode, dest, base);
3208 aarch64_add_offset (int_mode, dest, base, offset,
3209 NULL_RTX, NULL_RTX, false);
3210 }
3211 else
3212 aarch64_add_offset (int_mode, dest, base, offset,
3213 dest, NULL_RTX, false);
3214 }
3215 return;
3216 }
3217
3218 sty = aarch64_classify_symbol (base, const_offset);
3219 switch (sty)
3220 {
3221 case SYMBOL_FORCE_TO_MEM:
3222 if (const_offset != 0
3223 && targetm.cannot_force_const_mem (int_mode, imm))
3224 {
3225 gcc_assert (can_create_pseudo_p ());
3226 base = aarch64_force_temporary (int_mode, dest, base);
3227 aarch64_add_offset (int_mode, dest, base, const_offset,
3228 NULL_RTX, NULL_RTX, false);
3229 return;
3230 }
3231
3232 mem = force_const_mem (ptr_mode, imm);
3233 gcc_assert (mem);
3234
3235 /* If we aren't generating PC relative literals, then
3236 we need to expand the literal pool access carefully.
3237 This is something that needs to be done in a number
3238 of places, so could well live as a separate function. */
3239 if (!aarch64_pcrelative_literal_loads)
3240 {
3241 gcc_assert (can_create_pseudo_p ());
3242 base = gen_reg_rtx (ptr_mode);
3243 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3244 if (ptr_mode != Pmode)
3245 base = convert_memory_address (Pmode, base);
3246 mem = gen_rtx_MEM (ptr_mode, base);
3247 }
3248
3249 if (int_mode != ptr_mode)
3250 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3251
3252 emit_insn (gen_rtx_SET (dest, mem));
3253
3254 return;
3255
3256 case SYMBOL_SMALL_TLSGD:
3257 case SYMBOL_SMALL_TLSDESC:
3258 case SYMBOL_SMALL_TLSIE:
3259 case SYMBOL_SMALL_GOT_28K:
3260 case SYMBOL_SMALL_GOT_4G:
3261 case SYMBOL_TINY_GOT:
3262 case SYMBOL_TINY_TLSIE:
3263 if (const_offset != 0)
3264 {
3265 gcc_assert(can_create_pseudo_p ());
3266 base = aarch64_force_temporary (int_mode, dest, base);
3267 aarch64_add_offset (int_mode, dest, base, const_offset,
3268 NULL_RTX, NULL_RTX, false);
3269 return;
3270 }
3271 /* FALLTHRU */
3272
3273 case SYMBOL_SMALL_ABSOLUTE:
3274 case SYMBOL_TINY_ABSOLUTE:
3275 case SYMBOL_TLSLE12:
3276 case SYMBOL_TLSLE24:
3277 case SYMBOL_TLSLE32:
3278 case SYMBOL_TLSLE48:
3279 aarch64_load_symref_appropriately (dest, imm, sty);
3280 return;
3281
3282 default:
3283 gcc_unreachable ();
3284 }
3285 }
3286
3287 if (!CONST_INT_P (imm))
3288 {
3289 rtx base, step, value;
3290 if (GET_CODE (imm) == HIGH
3291 || aarch64_simd_valid_immediate (imm, NULL))
3292 emit_insn (gen_rtx_SET (dest, imm));
3293 else if (const_vec_series_p (imm, &base, &step))
3294 aarch64_expand_vec_series (dest, base, step);
3295 else if (const_vec_duplicate_p (imm, &value))
3296 {
3297 /* If the constant is out of range of an SVE vector move,
3298 load it from memory if we can, otherwise move it into
3299 a register and use a DUP. */
3300 scalar_mode inner_mode = GET_MODE_INNER (mode);
3301 rtx op = force_const_mem (inner_mode, value);
3302 if (!op)
3303 op = force_reg (inner_mode, value);
3304 else if (!aarch64_sve_ld1r_operand_p (op))
3305 {
3306 rtx addr = force_reg (Pmode, XEXP (op, 0));
3307 op = replace_equiv_address (op, addr);
3308 }
3309 emit_insn (gen_vec_duplicate (dest, op));
3310 }
3311 else if (GET_CODE (imm) == CONST_VECTOR
3312 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3313 aarch64_expand_sve_const_vector (dest, imm);
3314 else
3315 {
3316 rtx mem = force_const_mem (mode, imm);
3317 gcc_assert (mem);
3318 emit_move_insn (dest, mem);
3319 }
3320
3321 return;
3322 }
3323
3324 aarch64_internal_mov_immediate (dest, imm, true,
3325 as_a <scalar_int_mode> (mode));
3326 }
3327
3328 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3329 that is known to contain PTRUE. */
3330
3331 void
3332 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3333 {
3334 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3335 gen_rtvec (2, pred, src),
3336 UNSPEC_MERGE_PTRUE)));
3337 }
3338
3339 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3340 operand is in memory. In this case we need to use the predicated LD1
3341 and ST1 instead of LDR and STR, both for correctness on big-endian
3342 targets and because LD1 and ST1 support a wider range of addressing modes.
3343 PRED_MODE is the mode of the predicate.
3344
3345 See the comment at the head of aarch64-sve.md for details about the
3346 big-endian handling. */
3347
3348 void
3349 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3350 {
3351 machine_mode mode = GET_MODE (dest);
3352 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3353 if (!register_operand (src, mode)
3354 && !register_operand (dest, mode))
3355 {
3356 rtx tmp = gen_reg_rtx (mode);
3357 if (MEM_P (src))
3358 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3359 else
3360 emit_move_insn (tmp, src);
3361 src = tmp;
3362 }
3363 aarch64_emit_sve_pred_move (dest, ptrue, src);
3364 }
3365
3366 /* Called only on big-endian targets. See whether an SVE vector move
3367 from SRC to DEST is effectively a REV[BHW] instruction, because at
3368 least one operand is a subreg of an SVE vector that has wider or
3369 narrower elements. Return true and emit the instruction if so.
3370
3371 For example:
3372
3373 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3374
3375 represents a VIEW_CONVERT between the following vectors, viewed
3376 in memory order:
3377
3378 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3379 R1: { [0], [1], [2], [3], ... }
3380
3381 The high part of lane X in R2 should therefore correspond to lane X*2
3382 of R1, but the register representations are:
3383
3384 msb lsb
3385 R2: ...... [1].high [1].low [0].high [0].low
3386 R1: ...... [3] [2] [1] [0]
3387
3388 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3389 We therefore need a reverse operation to swap the high and low values
3390 around.
3391
3392 This is purely an optimization. Without it we would spill the
3393 subreg operand to the stack in one mode and reload it in the
3394 other mode, which has the same effect as the REV. */
3395
3396 bool
3397 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3398 {
3399 gcc_assert (BYTES_BIG_ENDIAN);
3400 if (GET_CODE (dest) == SUBREG)
3401 dest = SUBREG_REG (dest);
3402 if (GET_CODE (src) == SUBREG)
3403 src = SUBREG_REG (src);
3404
3405 /* The optimization handles two single SVE REGs with different element
3406 sizes. */
3407 if (!REG_P (dest)
3408 || !REG_P (src)
3409 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3410 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3411 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3412 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3413 return false;
3414
3415 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3416 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3417 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3418 UNSPEC_REV_SUBREG);
3419 emit_insn (gen_rtx_SET (dest, unspec));
3420 return true;
3421 }
3422
3423 /* Return a copy of X with mode MODE, without changing its other
3424 attributes. Unlike gen_lowpart, this doesn't care whether the
3425 mode change is valid. */
3426
3427 static rtx
3428 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3429 {
3430 if (GET_MODE (x) == mode)
3431 return x;
3432
3433 x = shallow_copy_rtx (x);
3434 set_mode_and_regno (x, mode, REGNO (x));
3435 return x;
3436 }
3437
3438 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3439 operands. */
3440
3441 void
3442 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3443 {
3444 /* Decide which REV operation we need. The mode with narrower elements
3445 determines the mode of the operands and the mode with the wider
3446 elements determines the reverse width. */
3447 machine_mode mode_with_wider_elts = GET_MODE (dest);
3448 machine_mode mode_with_narrower_elts = GET_MODE (src);
3449 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3450 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3451 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3452
3453 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3454 unsigned int unspec;
3455 if (wider_bytes == 8)
3456 unspec = UNSPEC_REV64;
3457 else if (wider_bytes == 4)
3458 unspec = UNSPEC_REV32;
3459 else if (wider_bytes == 2)
3460 unspec = UNSPEC_REV16;
3461 else
3462 gcc_unreachable ();
3463 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3464
3465 /* Emit:
3466
3467 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3468 UNSPEC_MERGE_PTRUE))
3469
3470 with the appropriate modes. */
3471 ptrue = gen_lowpart (pred_mode, ptrue);
3472 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3473 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3474 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3475 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3476 UNSPEC_MERGE_PTRUE);
3477 emit_insn (gen_rtx_SET (dest, src));
3478 }
3479
3480 static bool
3481 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3482 tree exp ATTRIBUTE_UNUSED)
3483 {
3484 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3485 return false;
3486
3487 return true;
3488 }
3489
3490 /* Implement TARGET_PASS_BY_REFERENCE. */
3491
3492 static bool
3493 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3494 machine_mode mode,
3495 const_tree type,
3496 bool named ATTRIBUTE_UNUSED)
3497 {
3498 HOST_WIDE_INT size;
3499 machine_mode dummymode;
3500 int nregs;
3501
3502 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3503 if (mode == BLKmode && type)
3504 size = int_size_in_bytes (type);
3505 else
3506 /* No frontends can create types with variable-sized modes, so we
3507 shouldn't be asked to pass or return them. */
3508 size = GET_MODE_SIZE (mode).to_constant ();
3509
3510 /* Aggregates are passed by reference based on their size. */
3511 if (type && AGGREGATE_TYPE_P (type))
3512 {
3513 size = int_size_in_bytes (type);
3514 }
3515
3516 /* Variable sized arguments are always returned by reference. */
3517 if (size < 0)
3518 return true;
3519
3520 /* Can this be a candidate to be passed in fp/simd register(s)? */
3521 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3522 &dummymode, &nregs,
3523 NULL))
3524 return false;
3525
3526 /* Arguments which are variable sized or larger than 2 registers are
3527 passed by reference unless they are a homogenous floating point
3528 aggregate. */
3529 return size > 2 * UNITS_PER_WORD;
3530 }
3531
3532 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3533 static bool
3534 aarch64_return_in_msb (const_tree valtype)
3535 {
3536 machine_mode dummy_mode;
3537 int dummy_int;
3538
3539 /* Never happens in little-endian mode. */
3540 if (!BYTES_BIG_ENDIAN)
3541 return false;
3542
3543 /* Only composite types smaller than or equal to 16 bytes can
3544 be potentially returned in registers. */
3545 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3546 || int_size_in_bytes (valtype) <= 0
3547 || int_size_in_bytes (valtype) > 16)
3548 return false;
3549
3550 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3551 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3552 is always passed/returned in the least significant bits of fp/simd
3553 register(s). */
3554 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3555 &dummy_mode, &dummy_int, NULL))
3556 return false;
3557
3558 return true;
3559 }
3560
3561 /* Implement TARGET_FUNCTION_VALUE.
3562 Define how to find the value returned by a function. */
3563
3564 static rtx
3565 aarch64_function_value (const_tree type, const_tree func,
3566 bool outgoing ATTRIBUTE_UNUSED)
3567 {
3568 machine_mode mode;
3569 int unsignedp;
3570 int count;
3571 machine_mode ag_mode;
3572
3573 mode = TYPE_MODE (type);
3574 if (INTEGRAL_TYPE_P (type))
3575 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3576
3577 if (aarch64_return_in_msb (type))
3578 {
3579 HOST_WIDE_INT size = int_size_in_bytes (type);
3580
3581 if (size % UNITS_PER_WORD != 0)
3582 {
3583 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3584 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3585 }
3586 }
3587
3588 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3589 &ag_mode, &count, NULL))
3590 {
3591 if (!aarch64_composite_type_p (type, mode))
3592 {
3593 gcc_assert (count == 1 && mode == ag_mode);
3594 return gen_rtx_REG (mode, V0_REGNUM);
3595 }
3596 else
3597 {
3598 int i;
3599 rtx par;
3600
3601 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3602 for (i = 0; i < count; i++)
3603 {
3604 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3605 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3606 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3607 XVECEXP (par, 0, i) = tmp;
3608 }
3609 return par;
3610 }
3611 }
3612 else
3613 return gen_rtx_REG (mode, R0_REGNUM);
3614 }
3615
3616 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3617 Return true if REGNO is the number of a hard register in which the values
3618 of called function may come back. */
3619
3620 static bool
3621 aarch64_function_value_regno_p (const unsigned int regno)
3622 {
3623 /* Maximum of 16 bytes can be returned in the general registers. Examples
3624 of 16-byte return values are: 128-bit integers and 16-byte small
3625 structures (excluding homogeneous floating-point aggregates). */
3626 if (regno == R0_REGNUM || regno == R1_REGNUM)
3627 return true;
3628
3629 /* Up to four fp/simd registers can return a function value, e.g. a
3630 homogeneous floating-point aggregate having four members. */
3631 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3632 return TARGET_FLOAT;
3633
3634 return false;
3635 }
3636
3637 /* Implement TARGET_RETURN_IN_MEMORY.
3638
3639 If the type T of the result of a function is such that
3640 void func (T arg)
3641 would require that arg be passed as a value in a register (or set of
3642 registers) according to the parameter passing rules, then the result
3643 is returned in the same registers as would be used for such an
3644 argument. */
3645
3646 static bool
3647 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3648 {
3649 HOST_WIDE_INT size;
3650 machine_mode ag_mode;
3651 int count;
3652
3653 if (!AGGREGATE_TYPE_P (type)
3654 && TREE_CODE (type) != COMPLEX_TYPE
3655 && TREE_CODE (type) != VECTOR_TYPE)
3656 /* Simple scalar types always returned in registers. */
3657 return false;
3658
3659 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3660 type,
3661 &ag_mode,
3662 &count,
3663 NULL))
3664 return false;
3665
3666 /* Types larger than 2 registers returned in memory. */
3667 size = int_size_in_bytes (type);
3668 return (size < 0 || size > 2 * UNITS_PER_WORD);
3669 }
3670
3671 static bool
3672 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3673 const_tree type, int *nregs)
3674 {
3675 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3676 return aarch64_vfp_is_call_or_return_candidate (mode,
3677 type,
3678 &pcum->aapcs_vfp_rmode,
3679 nregs,
3680 NULL);
3681 }
3682
3683 /* Given MODE and TYPE of a function argument, return the alignment in
3684 bits. The idea is to suppress any stronger alignment requested by
3685 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3686 This is a helper function for local use only. */
3687
3688 static unsigned int
3689 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3690 {
3691 if (!type)
3692 return GET_MODE_ALIGNMENT (mode);
3693
3694 if (integer_zerop (TYPE_SIZE (type)))
3695 return 0;
3696
3697 gcc_assert (TYPE_MODE (type) == mode);
3698
3699 if (!AGGREGATE_TYPE_P (type))
3700 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3701
3702 if (TREE_CODE (type) == ARRAY_TYPE)
3703 return TYPE_ALIGN (TREE_TYPE (type));
3704
3705 unsigned int alignment = 0;
3706 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3707 if (TREE_CODE (field) == FIELD_DECL)
3708 alignment = std::max (alignment, DECL_ALIGN (field));
3709
3710 return alignment;
3711 }
3712
3713 /* Layout a function argument according to the AAPCS64 rules. The rule
3714 numbers refer to the rule numbers in the AAPCS64. */
3715
3716 static void
3717 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3718 const_tree type,
3719 bool named ATTRIBUTE_UNUSED)
3720 {
3721 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3722 int ncrn, nvrn, nregs;
3723 bool allocate_ncrn, allocate_nvrn;
3724 HOST_WIDE_INT size;
3725
3726 /* We need to do this once per argument. */
3727 if (pcum->aapcs_arg_processed)
3728 return;
3729
3730 pcum->aapcs_arg_processed = true;
3731
3732 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3733 if (type)
3734 size = int_size_in_bytes (type);
3735 else
3736 /* No frontends can create types with variable-sized modes, so we
3737 shouldn't be asked to pass or return them. */
3738 size = GET_MODE_SIZE (mode).to_constant ();
3739 size = ROUND_UP (size, UNITS_PER_WORD);
3740
3741 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3742 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3743 mode,
3744 type,
3745 &nregs);
3746
3747 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3748 The following code thus handles passing by SIMD/FP registers first. */
3749
3750 nvrn = pcum->aapcs_nvrn;
3751
3752 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
3753 and homogenous short-vector aggregates (HVA). */
3754 if (allocate_nvrn)
3755 {
3756 if (!TARGET_FLOAT)
3757 aarch64_err_no_fpadvsimd (mode);
3758
3759 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3760 {
3761 pcum->aapcs_nextnvrn = nvrn + nregs;
3762 if (!aarch64_composite_type_p (type, mode))
3763 {
3764 gcc_assert (nregs == 1);
3765 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3766 }
3767 else
3768 {
3769 rtx par;
3770 int i;
3771 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3772 for (i = 0; i < nregs; i++)
3773 {
3774 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3775 V0_REGNUM + nvrn + i);
3776 rtx offset = gen_int_mode
3777 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3778 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3779 XVECEXP (par, 0, i) = tmp;
3780 }
3781 pcum->aapcs_reg = par;
3782 }
3783 return;
3784 }
3785 else
3786 {
3787 /* C.3 NSRN is set to 8. */
3788 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3789 goto on_stack;
3790 }
3791 }
3792
3793 ncrn = pcum->aapcs_ncrn;
3794 nregs = size / UNITS_PER_WORD;
3795
3796 /* C6 - C9. though the sign and zero extension semantics are
3797 handled elsewhere. This is the case where the argument fits
3798 entirely general registers. */
3799 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3800 {
3801
3802 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3803
3804 /* C.8 if the argument has an alignment of 16 then the NGRN is
3805 rounded up to the next even number. */
3806 if (nregs == 2
3807 && ncrn % 2
3808 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3809 comparison is there because for > 16 * BITS_PER_UNIT
3810 alignment nregs should be > 2 and therefore it should be
3811 passed by reference rather than value. */
3812 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3813 {
3814 ++ncrn;
3815 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3816 }
3817
3818 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3819 A reg is still generated for it, but the caller should be smart
3820 enough not to use it. */
3821 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3822 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3823 else
3824 {
3825 rtx par;
3826 int i;
3827
3828 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3829 for (i = 0; i < nregs; i++)
3830 {
3831 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3832 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3833 GEN_INT (i * UNITS_PER_WORD));
3834 XVECEXP (par, 0, i) = tmp;
3835 }
3836 pcum->aapcs_reg = par;
3837 }
3838
3839 pcum->aapcs_nextncrn = ncrn + nregs;
3840 return;
3841 }
3842
3843 /* C.11 */
3844 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3845
3846 /* The argument is passed on stack; record the needed number of words for
3847 this argument and align the total size if necessary. */
3848 on_stack:
3849 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3850
3851 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3852 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3853 16 / UNITS_PER_WORD);
3854 return;
3855 }
3856
3857 /* Implement TARGET_FUNCTION_ARG. */
3858
3859 static rtx
3860 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3861 const_tree type, bool named)
3862 {
3863 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3864 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3865
3866 if (mode == VOIDmode)
3867 return NULL_RTX;
3868
3869 aarch64_layout_arg (pcum_v, mode, type, named);
3870 return pcum->aapcs_reg;
3871 }
3872
3873 void
3874 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3875 const_tree fntype ATTRIBUTE_UNUSED,
3876 rtx libname ATTRIBUTE_UNUSED,
3877 const_tree fndecl ATTRIBUTE_UNUSED,
3878 unsigned n_named ATTRIBUTE_UNUSED)
3879 {
3880 pcum->aapcs_ncrn = 0;
3881 pcum->aapcs_nvrn = 0;
3882 pcum->aapcs_nextncrn = 0;
3883 pcum->aapcs_nextnvrn = 0;
3884 pcum->pcs_variant = ARM_PCS_AAPCS64;
3885 pcum->aapcs_reg = NULL_RTX;
3886 pcum->aapcs_arg_processed = false;
3887 pcum->aapcs_stack_words = 0;
3888 pcum->aapcs_stack_size = 0;
3889
3890 if (!TARGET_FLOAT
3891 && fndecl && TREE_PUBLIC (fndecl)
3892 && fntype && fntype != error_mark_node)
3893 {
3894 const_tree type = TREE_TYPE (fntype);
3895 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3896 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3897 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3898 &mode, &nregs, NULL))
3899 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3900 }
3901 return;
3902 }
3903
3904 static void
3905 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3906 machine_mode mode,
3907 const_tree type,
3908 bool named)
3909 {
3910 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3911 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3912 {
3913 aarch64_layout_arg (pcum_v, mode, type, named);
3914 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3915 != (pcum->aapcs_stack_words != 0));
3916 pcum->aapcs_arg_processed = false;
3917 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3918 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3919 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3920 pcum->aapcs_stack_words = 0;
3921 pcum->aapcs_reg = NULL_RTX;
3922 }
3923 }
3924
3925 bool
3926 aarch64_function_arg_regno_p (unsigned regno)
3927 {
3928 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3929 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3930 }
3931
3932 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3933 PARM_BOUNDARY bits of alignment, but will be given anything up
3934 to STACK_BOUNDARY bits if the type requires it. This makes sure
3935 that both before and after the layout of each argument, the Next
3936 Stacked Argument Address (NSAA) will have a minimum alignment of
3937 8 bytes. */
3938
3939 static unsigned int
3940 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3941 {
3942 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3943 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3944 }
3945
3946 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3947
3948 static fixed_size_mode
3949 aarch64_get_reg_raw_mode (int regno)
3950 {
3951 if (TARGET_SVE && FP_REGNUM_P (regno))
3952 /* Don't use the SVE part of the register for __builtin_apply and
3953 __builtin_return. The SVE registers aren't used by the normal PCS,
3954 so using them there would be a waste of time. The PCS extensions
3955 for SVE types are fundamentally incompatible with the
3956 __builtin_return/__builtin_apply interface. */
3957 return as_a <fixed_size_mode> (V16QImode);
3958 return default_get_reg_raw_mode (regno);
3959 }
3960
3961 /* Implement TARGET_FUNCTION_ARG_PADDING.
3962
3963 Small aggregate types are placed in the lowest memory address.
3964
3965 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3966
3967 static pad_direction
3968 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3969 {
3970 /* On little-endian targets, the least significant byte of every stack
3971 argument is passed at the lowest byte address of the stack slot. */
3972 if (!BYTES_BIG_ENDIAN)
3973 return PAD_UPWARD;
3974
3975 /* Otherwise, integral, floating-point and pointer types are padded downward:
3976 the least significant byte of a stack argument is passed at the highest
3977 byte address of the stack slot. */
3978 if (type
3979 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3980 || POINTER_TYPE_P (type))
3981 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3982 return PAD_DOWNWARD;
3983
3984 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3985 return PAD_UPWARD;
3986 }
3987
3988 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3989
3990 It specifies padding for the last (may also be the only)
3991 element of a block move between registers and memory. If
3992 assuming the block is in the memory, padding upward means that
3993 the last element is padded after its highest significant byte,
3994 while in downward padding, the last element is padded at the
3995 its least significant byte side.
3996
3997 Small aggregates and small complex types are always padded
3998 upwards.
3999
4000 We don't need to worry about homogeneous floating-point or
4001 short-vector aggregates; their move is not affected by the
4002 padding direction determined here. Regardless of endianness,
4003 each element of such an aggregate is put in the least
4004 significant bits of a fp/simd register.
4005
4006 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4007 register has useful data, and return the opposite if the most
4008 significant byte does. */
4009
4010 bool
4011 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4012 bool first ATTRIBUTE_UNUSED)
4013 {
4014
4015 /* Small composite types are always padded upward. */
4016 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4017 {
4018 HOST_WIDE_INT size;
4019 if (type)
4020 size = int_size_in_bytes (type);
4021 else
4022 /* No frontends can create types with variable-sized modes, so we
4023 shouldn't be asked to pass or return them. */
4024 size = GET_MODE_SIZE (mode).to_constant ();
4025 if (size < 2 * UNITS_PER_WORD)
4026 return true;
4027 }
4028
4029 /* Otherwise, use the default padding. */
4030 return !BYTES_BIG_ENDIAN;
4031 }
4032
4033 static scalar_int_mode
4034 aarch64_libgcc_cmp_return_mode (void)
4035 {
4036 return SImode;
4037 }
4038
4039 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4040
4041 /* We use the 12-bit shifted immediate arithmetic instructions so values
4042 must be multiple of (1 << 12), i.e. 4096. */
4043 #define ARITH_FACTOR 4096
4044
4045 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4046 #error Cannot use simple address calculation for stack probing
4047 #endif
4048
4049 /* The pair of scratch registers used for stack probing. */
4050 #define PROBE_STACK_FIRST_REG R9_REGNUM
4051 #define PROBE_STACK_SECOND_REG R10_REGNUM
4052
4053 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4054 inclusive. These are offsets from the current stack pointer. */
4055
4056 static void
4057 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4058 {
4059 HOST_WIDE_INT size;
4060 if (!poly_size.is_constant (&size))
4061 {
4062 sorry ("stack probes for SVE frames");
4063 return;
4064 }
4065
4066 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4067
4068 /* See the same assertion on PROBE_INTERVAL above. */
4069 gcc_assert ((first % ARITH_FACTOR) == 0);
4070
4071 /* See if we have a constant small number of probes to generate. If so,
4072 that's the easy case. */
4073 if (size <= PROBE_INTERVAL)
4074 {
4075 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4076
4077 emit_set_insn (reg1,
4078 plus_constant (Pmode,
4079 stack_pointer_rtx, -(first + base)));
4080 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4081 }
4082
4083 /* The run-time loop is made up of 8 insns in the generic case while the
4084 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4085 else if (size <= 4 * PROBE_INTERVAL)
4086 {
4087 HOST_WIDE_INT i, rem;
4088
4089 emit_set_insn (reg1,
4090 plus_constant (Pmode,
4091 stack_pointer_rtx,
4092 -(first + PROBE_INTERVAL)));
4093 emit_stack_probe (reg1);
4094
4095 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4096 it exceeds SIZE. If only two probes are needed, this will not
4097 generate any code. Then probe at FIRST + SIZE. */
4098 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4099 {
4100 emit_set_insn (reg1,
4101 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4102 emit_stack_probe (reg1);
4103 }
4104
4105 rem = size - (i - PROBE_INTERVAL);
4106 if (rem > 256)
4107 {
4108 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4109
4110 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4111 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4112 }
4113 else
4114 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4115 }
4116
4117 /* Otherwise, do the same as above, but in a loop. Note that we must be
4118 extra careful with variables wrapping around because we might be at
4119 the very top (or the very bottom) of the address space and we have
4120 to be able to handle this case properly; in particular, we use an
4121 equality test for the loop condition. */
4122 else
4123 {
4124 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4125
4126 /* Step 1: round SIZE to the previous multiple of the interval. */
4127
4128 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4129
4130
4131 /* Step 2: compute initial and final value of the loop counter. */
4132
4133 /* TEST_ADDR = SP + FIRST. */
4134 emit_set_insn (reg1,
4135 plus_constant (Pmode, stack_pointer_rtx, -first));
4136
4137 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4138 HOST_WIDE_INT adjustment = - (first + rounded_size);
4139 if (! aarch64_uimm12_shift (adjustment))
4140 {
4141 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4142 true, Pmode);
4143 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4144 }
4145 else
4146 emit_set_insn (reg2,
4147 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4148
4149 /* Step 3: the loop
4150
4151 do
4152 {
4153 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4154 probe at TEST_ADDR
4155 }
4156 while (TEST_ADDR != LAST_ADDR)
4157
4158 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4159 until it is equal to ROUNDED_SIZE. */
4160
4161 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4162
4163
4164 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4165 that SIZE is equal to ROUNDED_SIZE. */
4166
4167 if (size != rounded_size)
4168 {
4169 HOST_WIDE_INT rem = size - rounded_size;
4170
4171 if (rem > 256)
4172 {
4173 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4174
4175 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4176 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4177 }
4178 else
4179 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4180 }
4181 }
4182
4183 /* Make sure nothing is scheduled before we are done. */
4184 emit_insn (gen_blockage ());
4185 }
4186
4187 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4188 absolute addresses. */
4189
4190 const char *
4191 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4192 {
4193 static int labelno = 0;
4194 char loop_lab[32];
4195 rtx xops[2];
4196
4197 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4198
4199 /* Loop. */
4200 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4201
4202 HOST_WIDE_INT stack_clash_probe_interval
4203 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4204
4205 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4206 xops[0] = reg1;
4207 HOST_WIDE_INT interval;
4208 if (flag_stack_clash_protection)
4209 interval = stack_clash_probe_interval;
4210 else
4211 interval = PROBE_INTERVAL;
4212
4213 gcc_assert (aarch64_uimm12_shift (interval));
4214 xops[1] = GEN_INT (interval);
4215
4216 output_asm_insn ("sub\t%0, %0, %1", xops);
4217
4218 /* If doing stack clash protection then we probe up by the ABI specified
4219 amount. We do this because we're dropping full pages at a time in the
4220 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4221 if (flag_stack_clash_protection)
4222 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4223 else
4224 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4225
4226 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4227 by this amount for each iteration. */
4228 output_asm_insn ("str\txzr, [%0, %1]", xops);
4229
4230 /* Test if TEST_ADDR == LAST_ADDR. */
4231 xops[1] = reg2;
4232 output_asm_insn ("cmp\t%0, %1", xops);
4233
4234 /* Branch. */
4235 fputs ("\tb.ne\t", asm_out_file);
4236 assemble_name_raw (asm_out_file, loop_lab);
4237 fputc ('\n', asm_out_file);
4238
4239 return "";
4240 }
4241
4242 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4243 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4244 of GUARD_SIZE. When a probe is emitted it is done at most
4245 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4246 at most MIN_PROBE_THRESHOLD. By the end of this function
4247 BASE = BASE - ADJUSTMENT. */
4248
4249 const char *
4250 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4251 rtx min_probe_threshold, rtx guard_size)
4252 {
4253 /* This function is not allowed to use any instruction generation function
4254 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4255 so instead emit the code you want using output_asm_insn. */
4256 gcc_assert (flag_stack_clash_protection);
4257 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4258 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4259
4260 /* The minimum required allocation before the residual requires probing. */
4261 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4262
4263 /* Clamp the value down to the nearest value that can be used with a cmp. */
4264 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4265 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4266
4267 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4268 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4269
4270 static int labelno = 0;
4271 char loop_start_lab[32];
4272 char loop_end_lab[32];
4273 rtx xops[2];
4274
4275 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4276 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4277
4278 /* Emit loop start label. */
4279 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4280
4281 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4282 xops[0] = adjustment;
4283 xops[1] = probe_offset_value_rtx;
4284 output_asm_insn ("cmp\t%0, %1", xops);
4285
4286 /* Branch to end if not enough adjustment to probe. */
4287 fputs ("\tb.lt\t", asm_out_file);
4288 assemble_name_raw (asm_out_file, loop_end_lab);
4289 fputc ('\n', asm_out_file);
4290
4291 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4292 xops[0] = base;
4293 xops[1] = probe_offset_value_rtx;
4294 output_asm_insn ("sub\t%0, %0, %1", xops);
4295
4296 /* Probe at BASE. */
4297 xops[1] = const0_rtx;
4298 output_asm_insn ("str\txzr, [%0, %1]", xops);
4299
4300 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4301 xops[0] = adjustment;
4302 xops[1] = probe_offset_value_rtx;
4303 output_asm_insn ("sub\t%0, %0, %1", xops);
4304
4305 /* Branch to start if still more bytes to allocate. */
4306 fputs ("\tb\t", asm_out_file);
4307 assemble_name_raw (asm_out_file, loop_start_lab);
4308 fputc ('\n', asm_out_file);
4309
4310 /* No probe leave. */
4311 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4312
4313 /* BASE = BASE - ADJUSTMENT. */
4314 xops[0] = base;
4315 xops[1] = adjustment;
4316 output_asm_insn ("sub\t%0, %0, %1", xops);
4317 return "";
4318 }
4319
4320 /* Determine whether a frame chain needs to be generated. */
4321 static bool
4322 aarch64_needs_frame_chain (void)
4323 {
4324 /* Force a frame chain for EH returns so the return address is at FP+8. */
4325 if (frame_pointer_needed || crtl->calls_eh_return)
4326 return true;
4327
4328 /* A leaf function cannot have calls or write LR. */
4329 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4330
4331 /* Don't use a frame chain in leaf functions if leaf frame pointers
4332 are disabled. */
4333 if (flag_omit_leaf_frame_pointer && is_leaf)
4334 return false;
4335
4336 return aarch64_use_frame_pointer;
4337 }
4338
4339 /* Mark the registers that need to be saved by the callee and calculate
4340 the size of the callee-saved registers area and frame record (both FP
4341 and LR may be omitted). */
4342 static void
4343 aarch64_layout_frame (void)
4344 {
4345 HOST_WIDE_INT offset = 0;
4346 int regno, last_fp_reg = INVALID_REGNUM;
4347 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4348
4349 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4350
4351 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4352 the mid-end is doing. */
4353 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4354
4355 #define SLOT_NOT_REQUIRED (-2)
4356 #define SLOT_REQUIRED (-1)
4357
4358 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4359 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4360
4361 /* If this is a non-leaf simd function with calls we assume that
4362 at least one of those calls is to a non-simd function and thus
4363 we must save V8 to V23 in the prologue. */
4364
4365 if (simd_function && !crtl->is_leaf)
4366 {
4367 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4368 if (FP_SIMD_SAVED_REGNUM_P (regno))
4369 df_set_regs_ever_live (regno, true);
4370 }
4371
4372 /* First mark all the registers that really need to be saved... */
4373 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4374 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4375
4376 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4377 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4378
4379 /* ... that includes the eh data registers (if needed)... */
4380 if (crtl->calls_eh_return)
4381 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4382 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4383 = SLOT_REQUIRED;
4384
4385 /* ... and any callee saved register that dataflow says is live. */
4386 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4387 if (df_regs_ever_live_p (regno)
4388 && (regno == R30_REGNUM
4389 || !call_used_regs[regno]))
4390 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4391
4392 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4393 if (df_regs_ever_live_p (regno)
4394 && (!call_used_regs[regno]
4395 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4396 {
4397 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4398 last_fp_reg = regno;
4399 }
4400
4401 if (cfun->machine->frame.emit_frame_chain)
4402 {
4403 /* FP and LR are placed in the linkage record. */
4404 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4405 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4406 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4407 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4408 offset = 2 * UNITS_PER_WORD;
4409 }
4410
4411 /* With stack-clash, LR must be saved in non-leaf functions. */
4412 gcc_assert (crtl->is_leaf
4413 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4414 != SLOT_NOT_REQUIRED));
4415
4416 /* Now assign stack slots for them. */
4417 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4418 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4419 {
4420 cfun->machine->frame.reg_offset[regno] = offset;
4421 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4422 cfun->machine->frame.wb_candidate1 = regno;
4423 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4424 cfun->machine->frame.wb_candidate2 = regno;
4425 offset += UNITS_PER_WORD;
4426 }
4427
4428 HOST_WIDE_INT max_int_offset = offset;
4429 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4430 bool has_align_gap = offset != max_int_offset;
4431
4432 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4433 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4434 {
4435 /* If there is an alignment gap between integer and fp callee-saves,
4436 allocate the last fp register to it if possible. */
4437 if (regno == last_fp_reg
4438 && has_align_gap
4439 && !simd_function
4440 && (offset & 8) == 0)
4441 {
4442 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4443 break;
4444 }
4445
4446 cfun->machine->frame.reg_offset[regno] = offset;
4447 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4448 cfun->machine->frame.wb_candidate1 = regno;
4449 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4450 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4451 cfun->machine->frame.wb_candidate2 = regno;
4452 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4453 }
4454
4455 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4456
4457 cfun->machine->frame.saved_regs_size = offset;
4458
4459 HOST_WIDE_INT varargs_and_saved_regs_size
4460 = offset + cfun->machine->frame.saved_varargs_size;
4461
4462 cfun->machine->frame.hard_fp_offset
4463 = aligned_upper_bound (varargs_and_saved_regs_size
4464 + get_frame_size (),
4465 STACK_BOUNDARY / BITS_PER_UNIT);
4466
4467 /* Both these values are already aligned. */
4468 gcc_assert (multiple_p (crtl->outgoing_args_size,
4469 STACK_BOUNDARY / BITS_PER_UNIT));
4470 cfun->machine->frame.frame_size
4471 = (cfun->machine->frame.hard_fp_offset
4472 + crtl->outgoing_args_size);
4473
4474 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4475
4476 cfun->machine->frame.initial_adjust = 0;
4477 cfun->machine->frame.final_adjust = 0;
4478 cfun->machine->frame.callee_adjust = 0;
4479 cfun->machine->frame.callee_offset = 0;
4480
4481 HOST_WIDE_INT max_push_offset = 0;
4482 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4483 max_push_offset = 512;
4484 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4485 max_push_offset = 256;
4486
4487 HOST_WIDE_INT const_size, const_fp_offset;
4488 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4489 && const_size < max_push_offset
4490 && known_eq (crtl->outgoing_args_size, 0))
4491 {
4492 /* Simple, small frame with no outgoing arguments:
4493 stp reg1, reg2, [sp, -frame_size]!
4494 stp reg3, reg4, [sp, 16] */
4495 cfun->machine->frame.callee_adjust = const_size;
4496 }
4497 else if (known_lt (crtl->outgoing_args_size
4498 + cfun->machine->frame.saved_regs_size, 512)
4499 && !(cfun->calls_alloca
4500 && known_lt (cfun->machine->frame.hard_fp_offset,
4501 max_push_offset)))
4502 {
4503 /* Frame with small outgoing arguments:
4504 sub sp, sp, frame_size
4505 stp reg1, reg2, [sp, outgoing_args_size]
4506 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4507 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4508 cfun->machine->frame.callee_offset
4509 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4510 }
4511 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4512 && const_fp_offset < max_push_offset)
4513 {
4514 /* Frame with large outgoing arguments but a small local area:
4515 stp reg1, reg2, [sp, -hard_fp_offset]!
4516 stp reg3, reg4, [sp, 16]
4517 sub sp, sp, outgoing_args_size */
4518 cfun->machine->frame.callee_adjust = const_fp_offset;
4519 cfun->machine->frame.final_adjust
4520 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4521 }
4522 else
4523 {
4524 /* Frame with large local area and outgoing arguments using frame pointer:
4525 sub sp, sp, hard_fp_offset
4526 stp x29, x30, [sp, 0]
4527 add x29, sp, 0
4528 stp reg3, reg4, [sp, 16]
4529 sub sp, sp, outgoing_args_size */
4530 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4531 cfun->machine->frame.final_adjust
4532 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4533 }
4534
4535 cfun->machine->frame.laid_out = true;
4536 }
4537
4538 /* Return true if the register REGNO is saved on entry to
4539 the current function. */
4540
4541 static bool
4542 aarch64_register_saved_on_entry (int regno)
4543 {
4544 return cfun->machine->frame.reg_offset[regno] >= 0;
4545 }
4546
4547 /* Return the next register up from REGNO up to LIMIT for the callee
4548 to save. */
4549
4550 static unsigned
4551 aarch64_next_callee_save (unsigned regno, unsigned limit)
4552 {
4553 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4554 regno ++;
4555 return regno;
4556 }
4557
4558 /* Push the register number REGNO of mode MODE to the stack with write-back
4559 adjusting the stack by ADJUSTMENT. */
4560
4561 static void
4562 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4563 HOST_WIDE_INT adjustment)
4564 {
4565 rtx base_rtx = stack_pointer_rtx;
4566 rtx insn, reg, mem;
4567
4568 reg = gen_rtx_REG (mode, regno);
4569 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4570 plus_constant (Pmode, base_rtx, -adjustment));
4571 mem = gen_frame_mem (mode, mem);
4572
4573 insn = emit_move_insn (mem, reg);
4574 RTX_FRAME_RELATED_P (insn) = 1;
4575 }
4576
4577 /* Generate and return an instruction to store the pair of registers
4578 REG and REG2 of mode MODE to location BASE with write-back adjusting
4579 the stack location BASE by ADJUSTMENT. */
4580
4581 static rtx
4582 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4583 HOST_WIDE_INT adjustment)
4584 {
4585 switch (mode)
4586 {
4587 case E_DImode:
4588 return gen_storewb_pairdi_di (base, base, reg, reg2,
4589 GEN_INT (-adjustment),
4590 GEN_INT (UNITS_PER_WORD - adjustment));
4591 case E_DFmode:
4592 return gen_storewb_pairdf_di (base, base, reg, reg2,
4593 GEN_INT (-adjustment),
4594 GEN_INT (UNITS_PER_WORD - adjustment));
4595 case E_TFmode:
4596 return gen_storewb_pairtf_di (base, base, reg, reg2,
4597 GEN_INT (-adjustment),
4598 GEN_INT (UNITS_PER_VREG - adjustment));
4599 default:
4600 gcc_unreachable ();
4601 }
4602 }
4603
4604 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4605 stack pointer by ADJUSTMENT. */
4606
4607 static void
4608 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4609 {
4610 rtx_insn *insn;
4611 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4612
4613 if (regno2 == INVALID_REGNUM)
4614 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4615
4616 rtx reg1 = gen_rtx_REG (mode, regno1);
4617 rtx reg2 = gen_rtx_REG (mode, regno2);
4618
4619 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4620 reg2, adjustment));
4621 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4622 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4623 RTX_FRAME_RELATED_P (insn) = 1;
4624 }
4625
4626 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4627 adjusting it by ADJUSTMENT afterwards. */
4628
4629 static rtx
4630 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4631 HOST_WIDE_INT adjustment)
4632 {
4633 switch (mode)
4634 {
4635 case E_DImode:
4636 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4637 GEN_INT (UNITS_PER_WORD));
4638 case E_DFmode:
4639 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4640 GEN_INT (UNITS_PER_WORD));
4641 case E_TFmode:
4642 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4643 GEN_INT (UNITS_PER_VREG));
4644 default:
4645 gcc_unreachable ();
4646 }
4647 }
4648
4649 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4650 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4651 into CFI_OPS. */
4652
4653 static void
4654 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4655 rtx *cfi_ops)
4656 {
4657 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4658 rtx reg1 = gen_rtx_REG (mode, regno1);
4659
4660 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4661
4662 if (regno2 == INVALID_REGNUM)
4663 {
4664 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4665 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4666 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4667 }
4668 else
4669 {
4670 rtx reg2 = gen_rtx_REG (mode, regno2);
4671 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4672 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4673 reg2, adjustment));
4674 }
4675 }
4676
4677 /* Generate and return a store pair instruction of mode MODE to store
4678 register REG1 to MEM1 and register REG2 to MEM2. */
4679
4680 static rtx
4681 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4682 rtx reg2)
4683 {
4684 switch (mode)
4685 {
4686 case E_DImode:
4687 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4688
4689 case E_DFmode:
4690 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4691
4692 case E_TFmode:
4693 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4694
4695 default:
4696 gcc_unreachable ();
4697 }
4698 }
4699
4700 /* Generate and regurn a load pair isntruction of mode MODE to load register
4701 REG1 from MEM1 and register REG2 from MEM2. */
4702
4703 static rtx
4704 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4705 rtx mem2)
4706 {
4707 switch (mode)
4708 {
4709 case E_DImode:
4710 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4711
4712 case E_DFmode:
4713 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4714
4715 case E_TFmode:
4716 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4717
4718 default:
4719 gcc_unreachable ();
4720 }
4721 }
4722
4723 /* Return TRUE if return address signing should be enabled for the current
4724 function, otherwise return FALSE. */
4725
4726 bool
4727 aarch64_return_address_signing_enabled (void)
4728 {
4729 /* This function should only be called after frame laid out. */
4730 gcc_assert (cfun->machine->frame.laid_out);
4731
4732 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4733 if it's LR is pushed onto stack. */
4734 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4735 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4736 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4737 }
4738
4739 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4740 bool
4741 aarch64_bti_enabled (void)
4742 {
4743 return (aarch64_enable_bti == 1);
4744 }
4745
4746 /* Emit code to save the callee-saved registers from register number START
4747 to LIMIT to the stack at the location starting at offset START_OFFSET,
4748 skipping any write-back candidates if SKIP_WB is true. */
4749
4750 static void
4751 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4752 unsigned start, unsigned limit, bool skip_wb)
4753 {
4754 rtx_insn *insn;
4755 unsigned regno;
4756 unsigned regno2;
4757
4758 for (regno = aarch64_next_callee_save (start, limit);
4759 regno <= limit;
4760 regno = aarch64_next_callee_save (regno + 1, limit))
4761 {
4762 rtx reg, mem;
4763 poly_int64 offset;
4764 int offset_diff;
4765
4766 if (skip_wb
4767 && (regno == cfun->machine->frame.wb_candidate1
4768 || regno == cfun->machine->frame.wb_candidate2))
4769 continue;
4770
4771 if (cfun->machine->reg_is_wrapped_separately[regno])
4772 continue;
4773
4774 reg = gen_rtx_REG (mode, regno);
4775 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4776 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4777 offset));
4778
4779 regno2 = aarch64_next_callee_save (regno + 1, limit);
4780 offset_diff = cfun->machine->frame.reg_offset[regno2]
4781 - cfun->machine->frame.reg_offset[regno];
4782
4783 if (regno2 <= limit
4784 && !cfun->machine->reg_is_wrapped_separately[regno2]
4785 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4786 {
4787 rtx reg2 = gen_rtx_REG (mode, regno2);
4788 rtx mem2;
4789
4790 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4791 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4792 offset));
4793 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4794 reg2));
4795
4796 /* The first part of a frame-related parallel insn is
4797 always assumed to be relevant to the frame
4798 calculations; subsequent parts, are only
4799 frame-related if explicitly marked. */
4800 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4801 regno = regno2;
4802 }
4803 else
4804 insn = emit_move_insn (mem, reg);
4805
4806 RTX_FRAME_RELATED_P (insn) = 1;
4807 }
4808 }
4809
4810 /* Emit code to restore the callee registers of mode MODE from register
4811 number START up to and including LIMIT. Restore from the stack offset
4812 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4813 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4814
4815 static void
4816 aarch64_restore_callee_saves (machine_mode mode,
4817 poly_int64 start_offset, unsigned start,
4818 unsigned limit, bool skip_wb, rtx *cfi_ops)
4819 {
4820 rtx base_rtx = stack_pointer_rtx;
4821 unsigned regno;
4822 unsigned regno2;
4823 poly_int64 offset;
4824
4825 for (regno = aarch64_next_callee_save (start, limit);
4826 regno <= limit;
4827 regno = aarch64_next_callee_save (regno + 1, limit))
4828 {
4829 if (cfun->machine->reg_is_wrapped_separately[regno])
4830 continue;
4831
4832 rtx reg, mem;
4833 int offset_diff;
4834
4835 if (skip_wb
4836 && (regno == cfun->machine->frame.wb_candidate1
4837 || regno == cfun->machine->frame.wb_candidate2))
4838 continue;
4839
4840 reg = gen_rtx_REG (mode, regno);
4841 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4842 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4843
4844 regno2 = aarch64_next_callee_save (regno + 1, limit);
4845 offset_diff = cfun->machine->frame.reg_offset[regno2]
4846 - cfun->machine->frame.reg_offset[regno];
4847
4848 if (regno2 <= limit
4849 && !cfun->machine->reg_is_wrapped_separately[regno2]
4850 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4851 {
4852 rtx reg2 = gen_rtx_REG (mode, regno2);
4853 rtx mem2;
4854
4855 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4856 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4857 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4858
4859 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4860 regno = regno2;
4861 }
4862 else
4863 emit_move_insn (reg, mem);
4864 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4865 }
4866 }
4867
4868 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4869 of MODE. */
4870
4871 static inline bool
4872 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4873 {
4874 HOST_WIDE_INT multiple;
4875 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4876 && IN_RANGE (multiple, -8, 7));
4877 }
4878
4879 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
4880 of MODE. */
4881
4882 static inline bool
4883 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4884 {
4885 HOST_WIDE_INT multiple;
4886 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4887 && IN_RANGE (multiple, 0, 63));
4888 }
4889
4890 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4891 of MODE. */
4892
4893 bool
4894 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4895 {
4896 HOST_WIDE_INT multiple;
4897 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4898 && IN_RANGE (multiple, -64, 63));
4899 }
4900
4901 /* Return true if OFFSET is a signed 9-bit value. */
4902
4903 bool
4904 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4905 poly_int64 offset)
4906 {
4907 HOST_WIDE_INT const_offset;
4908 return (offset.is_constant (&const_offset)
4909 && IN_RANGE (const_offset, -256, 255));
4910 }
4911
4912 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4913 of MODE. */
4914
4915 static inline bool
4916 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4917 {
4918 HOST_WIDE_INT multiple;
4919 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4920 && IN_RANGE (multiple, -256, 255));
4921 }
4922
4923 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4924 of MODE. */
4925
4926 static inline bool
4927 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4928 {
4929 HOST_WIDE_INT multiple;
4930 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4931 && IN_RANGE (multiple, 0, 4095));
4932 }
4933
4934 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4935
4936 static sbitmap
4937 aarch64_get_separate_components (void)
4938 {
4939 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4940 bitmap_clear (components);
4941
4942 /* The registers we need saved to the frame. */
4943 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4944 if (aarch64_register_saved_on_entry (regno))
4945 {
4946 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4947 if (!frame_pointer_needed)
4948 offset += cfun->machine->frame.frame_size
4949 - cfun->machine->frame.hard_fp_offset;
4950 /* Check that we can access the stack slot of the register with one
4951 direct load with no adjustments needed. */
4952 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4953 bitmap_set_bit (components, regno);
4954 }
4955
4956 /* Don't mess with the hard frame pointer. */
4957 if (frame_pointer_needed)
4958 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4959
4960 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4961 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4962 /* If registers have been chosen to be stored/restored with
4963 writeback don't interfere with them to avoid having to output explicit
4964 stack adjustment instructions. */
4965 if (reg2 != INVALID_REGNUM)
4966 bitmap_clear_bit (components, reg2);
4967 if (reg1 != INVALID_REGNUM)
4968 bitmap_clear_bit (components, reg1);
4969
4970 bitmap_clear_bit (components, LR_REGNUM);
4971 bitmap_clear_bit (components, SP_REGNUM);
4972
4973 return components;
4974 }
4975
4976 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4977
4978 static sbitmap
4979 aarch64_components_for_bb (basic_block bb)
4980 {
4981 bitmap in = DF_LIVE_IN (bb);
4982 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4983 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4984 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4985
4986 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4987 bitmap_clear (components);
4988
4989 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4990 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4991 if ((!call_used_regs[regno]
4992 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
4993 && (bitmap_bit_p (in, regno)
4994 || bitmap_bit_p (gen, regno)
4995 || bitmap_bit_p (kill, regno)))
4996 {
4997 unsigned regno2, offset, offset2;
4998 bitmap_set_bit (components, regno);
4999
5000 /* If there is a callee-save at an adjacent offset, add it too
5001 to increase the use of LDP/STP. */
5002 offset = cfun->machine->frame.reg_offset[regno];
5003 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5004
5005 if (regno2 <= LAST_SAVED_REGNUM)
5006 {
5007 offset2 = cfun->machine->frame.reg_offset[regno2];
5008 if ((offset & ~8) == (offset2 & ~8))
5009 bitmap_set_bit (components, regno2);
5010 }
5011 }
5012
5013 return components;
5014 }
5015
5016 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5017 Nothing to do for aarch64. */
5018
5019 static void
5020 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5021 {
5022 }
5023
5024 /* Return the next set bit in BMP from START onwards. Return the total number
5025 of bits in BMP if no set bit is found at or after START. */
5026
5027 static unsigned int
5028 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5029 {
5030 unsigned int nbits = SBITMAP_SIZE (bmp);
5031 if (start == nbits)
5032 return start;
5033
5034 gcc_assert (start < nbits);
5035 for (unsigned int i = start; i < nbits; i++)
5036 if (bitmap_bit_p (bmp, i))
5037 return i;
5038
5039 return nbits;
5040 }
5041
5042 /* Do the work for aarch64_emit_prologue_components and
5043 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5044 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5045 for these components or the epilogue sequence. That is, it determines
5046 whether we should emit stores or loads and what kind of CFA notes to attach
5047 to the insns. Otherwise the logic for the two sequences is very
5048 similar. */
5049
5050 static void
5051 aarch64_process_components (sbitmap components, bool prologue_p)
5052 {
5053 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5054 ? HARD_FRAME_POINTER_REGNUM
5055 : STACK_POINTER_REGNUM);
5056
5057 unsigned last_regno = SBITMAP_SIZE (components);
5058 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5059 rtx_insn *insn = NULL;
5060
5061 while (regno != last_regno)
5062 {
5063 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5064 so DFmode for the vector registers is enough. For simd functions
5065 we want to save the low 128 bits. */
5066 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5067
5068 rtx reg = gen_rtx_REG (mode, regno);
5069 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5070 if (!frame_pointer_needed)
5071 offset += cfun->machine->frame.frame_size
5072 - cfun->machine->frame.hard_fp_offset;
5073 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5074 rtx mem = gen_frame_mem (mode, addr);
5075
5076 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5077 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5078 /* No more registers to handle after REGNO.
5079 Emit a single save/restore and exit. */
5080 if (regno2 == last_regno)
5081 {
5082 insn = emit_insn (set);
5083 RTX_FRAME_RELATED_P (insn) = 1;
5084 if (prologue_p)
5085 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5086 else
5087 add_reg_note (insn, REG_CFA_RESTORE, reg);
5088 break;
5089 }
5090
5091 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5092 /* The next register is not of the same class or its offset is not
5093 mergeable with the current one into a pair. */
5094 if (!satisfies_constraint_Ump (mem)
5095 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5096 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5097 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5098 GET_MODE_SIZE (mode)))
5099 {
5100 insn = emit_insn (set);
5101 RTX_FRAME_RELATED_P (insn) = 1;
5102 if (prologue_p)
5103 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5104 else
5105 add_reg_note (insn, REG_CFA_RESTORE, reg);
5106
5107 regno = regno2;
5108 continue;
5109 }
5110
5111 /* REGNO2 can be saved/restored in a pair with REGNO. */
5112 rtx reg2 = gen_rtx_REG (mode, regno2);
5113 if (!frame_pointer_needed)
5114 offset2 += cfun->machine->frame.frame_size
5115 - cfun->machine->frame.hard_fp_offset;
5116 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5117 rtx mem2 = gen_frame_mem (mode, addr2);
5118 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5119 : gen_rtx_SET (reg2, mem2);
5120
5121 if (prologue_p)
5122 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5123 else
5124 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5125
5126 RTX_FRAME_RELATED_P (insn) = 1;
5127 if (prologue_p)
5128 {
5129 add_reg_note (insn, REG_CFA_OFFSET, set);
5130 add_reg_note (insn, REG_CFA_OFFSET, set2);
5131 }
5132 else
5133 {
5134 add_reg_note (insn, REG_CFA_RESTORE, reg);
5135 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5136 }
5137
5138 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5139 }
5140 }
5141
5142 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5143
5144 static void
5145 aarch64_emit_prologue_components (sbitmap components)
5146 {
5147 aarch64_process_components (components, true);
5148 }
5149
5150 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5151
5152 static void
5153 aarch64_emit_epilogue_components (sbitmap components)
5154 {
5155 aarch64_process_components (components, false);
5156 }
5157
5158 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5159
5160 static void
5161 aarch64_set_handled_components (sbitmap components)
5162 {
5163 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5164 if (bitmap_bit_p (components, regno))
5165 cfun->machine->reg_is_wrapped_separately[regno] = true;
5166 }
5167
5168 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
5169 determining the probe offset for alloca. */
5170
5171 static HOST_WIDE_INT
5172 aarch64_stack_clash_protection_alloca_probe_range (void)
5173 {
5174 return STACK_CLASH_CALLER_GUARD;
5175 }
5176
5177
5178 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5179 registers. If POLY_SIZE is not large enough to require a probe this function
5180 will only adjust the stack. When allocating the stack space
5181 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5182 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5183 arguments. If we are then we ensure that any allocation larger than the ABI
5184 defined buffer needs a probe so that the invariant of having a 1KB buffer is
5185 maintained.
5186
5187 We emit barriers after each stack adjustment to prevent optimizations from
5188 breaking the invariant that we never drop the stack more than a page. This
5189 invariant is needed to make it easier to correctly handle asynchronous
5190 events, e.g. if we were to allow the stack to be dropped by more than a page
5191 and then have multiple probes up and we take a signal somewhere in between
5192 then the signal handler doesn't know the state of the stack and can make no
5193 assumptions about which pages have been probed. */
5194
5195 static void
5196 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5197 poly_int64 poly_size,
5198 bool frame_related_p,
5199 bool final_adjustment_p)
5200 {
5201 HOST_WIDE_INT guard_size
5202 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5203 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5204 /* When doing the final adjustment for the outgoing argument size we can't
5205 assume that LR was saved at position 0. So subtract it's offset from the
5206 ABI safe buffer so that we don't accidentally allow an adjustment that
5207 would result in an allocation larger than the ABI buffer without
5208 probing. */
5209 HOST_WIDE_INT min_probe_threshold
5210 = final_adjustment_p
5211 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5212 : guard_size - guard_used_by_caller;
5213
5214 poly_int64 frame_size = cfun->machine->frame.frame_size;
5215
5216 /* We should always have a positive probe threshold. */
5217 gcc_assert (min_probe_threshold > 0);
5218
5219 if (flag_stack_clash_protection && !final_adjustment_p)
5220 {
5221 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5222 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5223
5224 if (known_eq (frame_size, 0))
5225 {
5226 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5227 }
5228 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5229 && known_lt (final_adjust, guard_used_by_caller))
5230 {
5231 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5232 }
5233 }
5234
5235 /* If SIZE is not large enough to require probing, just adjust the stack and
5236 exit. */
5237 if (known_lt (poly_size, min_probe_threshold)
5238 || !flag_stack_clash_protection)
5239 {
5240 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5241 return;
5242 }
5243
5244 HOST_WIDE_INT size;
5245 /* Handle the SVE non-constant case first. */
5246 if (!poly_size.is_constant (&size))
5247 {
5248 if (dump_file)
5249 {
5250 fprintf (dump_file, "Stack clash SVE prologue: ");
5251 print_dec (poly_size, dump_file);
5252 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5253 }
5254
5255 /* First calculate the amount of bytes we're actually spilling. */
5256 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5257 poly_size, temp1, temp2, false, true);
5258
5259 rtx_insn *insn = get_last_insn ();
5260
5261 if (frame_related_p)
5262 {
5263 /* This is done to provide unwinding information for the stack
5264 adjustments we're about to do, however to prevent the optimizers
5265 from removing the R15 move and leaving the CFA note (which would be
5266 very wrong) we tie the old and new stack pointer together.
5267 The tie will expand to nothing but the optimizers will not touch
5268 the instruction. */
5269 rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
5270 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5271 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5272
5273 /* We want the CFA independent of the stack pointer for the
5274 duration of the loop. */
5275 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5276 RTX_FRAME_RELATED_P (insn) = 1;
5277 }
5278
5279 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5280 rtx guard_const = gen_int_mode (guard_size, Pmode);
5281
5282 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5283 stack_pointer_rtx, temp1,
5284 probe_const, guard_const));
5285
5286 /* Now reset the CFA register if needed. */
5287 if (frame_related_p)
5288 {
5289 add_reg_note (insn, REG_CFA_DEF_CFA,
5290 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5291 gen_int_mode (poly_size, Pmode)));
5292 RTX_FRAME_RELATED_P (insn) = 1;
5293 }
5294
5295 return;
5296 }
5297
5298 if (dump_file)
5299 fprintf (dump_file,
5300 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5301 " bytes, probing will be required.\n", size);
5302
5303 /* Round size to the nearest multiple of guard_size, and calculate the
5304 residual as the difference between the original size and the rounded
5305 size. */
5306 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5307 HOST_WIDE_INT residual = size - rounded_size;
5308
5309 /* We can handle a small number of allocations/probes inline. Otherwise
5310 punt to a loop. */
5311 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5312 {
5313 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5314 {
5315 aarch64_sub_sp (NULL, temp2, guard_size, true);
5316 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5317 guard_used_by_caller));
5318 emit_insn (gen_blockage ());
5319 }
5320 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5321 }
5322 else
5323 {
5324 /* Compute the ending address. */
5325 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5326 temp1, NULL, false, true);
5327 rtx_insn *insn = get_last_insn ();
5328
5329 /* For the initial allocation, we don't have a frame pointer
5330 set up, so we always need CFI notes. If we're doing the
5331 final allocation, then we may have a frame pointer, in which
5332 case it is the CFA, otherwise we need CFI notes.
5333
5334 We can determine which allocation we are doing by looking at
5335 the value of FRAME_RELATED_P since the final allocations are not
5336 frame related. */
5337 if (frame_related_p)
5338 {
5339 /* We want the CFA independent of the stack pointer for the
5340 duration of the loop. */
5341 add_reg_note (insn, REG_CFA_DEF_CFA,
5342 plus_constant (Pmode, temp1, rounded_size));
5343 RTX_FRAME_RELATED_P (insn) = 1;
5344 }
5345
5346 /* This allocates and probes the stack. Note that this re-uses some of
5347 the existing Ada stack protection code. However we are guaranteed not
5348 to enter the non loop or residual branches of that code.
5349
5350 The non-loop part won't be entered because if our allocation amount
5351 doesn't require a loop, the case above would handle it.
5352
5353 The residual amount won't be entered because TEMP1 is a mutliple of
5354 the allocation size. The residual will always be 0. As such, the only
5355 part we are actually using from that code is the loop setup. The
5356 actual probing is done in aarch64_output_probe_stack_range. */
5357 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5358 stack_pointer_rtx, temp1));
5359
5360 /* Now reset the CFA register if needed. */
5361 if (frame_related_p)
5362 {
5363 add_reg_note (insn, REG_CFA_DEF_CFA,
5364 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5365 RTX_FRAME_RELATED_P (insn) = 1;
5366 }
5367
5368 emit_insn (gen_blockage ());
5369 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5370 }
5371
5372 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5373 be probed. This maintains the requirement that each page is probed at
5374 least once. For initial probing we probe only if the allocation is
5375 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5376 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5377 GUARD_SIZE. This works that for any allocation that is large enough to
5378 trigger a probe here, we'll have at least one, and if they're not large
5379 enough for this code to emit anything for them, The page would have been
5380 probed by the saving of FP/LR either by this function or any callees. If
5381 we don't have any callees then we won't have more stack adjustments and so
5382 are still safe. */
5383 if (residual)
5384 {
5385 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5386 /* If we're doing final adjustments, and we've done any full page
5387 allocations then any residual needs to be probed. */
5388 if (final_adjustment_p && rounded_size != 0)
5389 min_probe_threshold = 0;
5390 /* If doing a small final adjustment, we always probe at offset 0.
5391 This is done to avoid issues when LR is not at position 0 or when
5392 the final adjustment is smaller than the probing offset. */
5393 else if (final_adjustment_p && rounded_size == 0)
5394 residual_probe_offset = 0;
5395
5396 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5397 if (residual >= min_probe_threshold)
5398 {
5399 if (dump_file)
5400 fprintf (dump_file,
5401 "Stack clash AArch64 prologue residuals: "
5402 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5403 "\n", residual);
5404
5405 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5406 residual_probe_offset));
5407 emit_insn (gen_blockage ());
5408 }
5409 }
5410 }
5411
5412 /* Return 1 if the register is used by the epilogue. We need to say the
5413 return register is used, but only after epilogue generation is complete.
5414 Note that in the case of sibcalls, the values "used by the epilogue" are
5415 considered live at the start of the called function.
5416
5417 For SIMD functions we need to return 1 for FP registers that are saved and
5418 restored by a function but are not zero in call_used_regs. If we do not do
5419 this optimizations may remove the restore of the register. */
5420
5421 int
5422 aarch64_epilogue_uses (int regno)
5423 {
5424 if (epilogue_completed)
5425 {
5426 if (regno == LR_REGNUM)
5427 return 1;
5428 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5429 return 1;
5430 }
5431 return 0;
5432 }
5433
5434 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5435 is saved at BASE + OFFSET. */
5436
5437 static void
5438 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5439 rtx base, poly_int64 offset)
5440 {
5441 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5442 add_reg_note (insn, REG_CFA_EXPRESSION,
5443 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5444 }
5445
5446 /* AArch64 stack frames generated by this compiler look like:
5447
5448 +-------------------------------+
5449 | |
5450 | incoming stack arguments |
5451 | |
5452 +-------------------------------+
5453 | | <-- incoming stack pointer (aligned)
5454 | callee-allocated save area |
5455 | for register varargs |
5456 | |
5457 +-------------------------------+
5458 | local variables | <-- frame_pointer_rtx
5459 | |
5460 +-------------------------------+
5461 | padding | \
5462 +-------------------------------+ |
5463 | callee-saved registers | | frame.saved_regs_size
5464 +-------------------------------+ |
5465 | LR' | |
5466 +-------------------------------+ |
5467 | FP' | / <- hard_frame_pointer_rtx (aligned)
5468 +-------------------------------+
5469 | dynamic allocation |
5470 +-------------------------------+
5471 | padding |
5472 +-------------------------------+
5473 | outgoing stack arguments | <-- arg_pointer
5474 | |
5475 +-------------------------------+
5476 | | <-- stack_pointer_rtx (aligned)
5477
5478 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5479 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5480 unchanged.
5481
5482 By default for stack-clash we assume the guard is at least 64KB, but this
5483 value is configurable to either 4KB or 64KB. We also force the guard size to
5484 be the same as the probing interval and both values are kept in sync.
5485
5486 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5487 on the guard size) of stack space without probing.
5488
5489 When probing is needed, we emit a probe at the start of the prologue
5490 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5491
5492 We have to track how much space has been allocated and the only stores
5493 to the stack we track as implicit probes are the FP/LR stores.
5494
5495 For outgoing arguments we probe if the size is larger than 1KB, such that
5496 the ABI specified buffer is maintained for the next callee. */
5497
5498 /* Generate the prologue instructions for entry into a function.
5499 Establish the stack frame by decreasing the stack pointer with a
5500 properly calculated size and, if necessary, create a frame record
5501 filled with the values of LR and previous frame pointer. The
5502 current FP is also set up if it is in use. */
5503
5504 void
5505 aarch64_expand_prologue (void)
5506 {
5507 poly_int64 frame_size = cfun->machine->frame.frame_size;
5508 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5509 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5510 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5511 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5512 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5513 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5514 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5515 rtx_insn *insn;
5516
5517 /* Sign return address for functions. */
5518 if (aarch64_return_address_signing_enabled ())
5519 {
5520 insn = emit_insn (gen_pacisp ());
5521 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5522 RTX_FRAME_RELATED_P (insn) = 1;
5523 }
5524
5525 if (flag_stack_usage_info)
5526 current_function_static_stack_size = constant_lower_bound (frame_size);
5527
5528 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5529 {
5530 if (crtl->is_leaf && !cfun->calls_alloca)
5531 {
5532 if (maybe_gt (frame_size, PROBE_INTERVAL)
5533 && maybe_gt (frame_size, get_stack_check_protect ()))
5534 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5535 (frame_size
5536 - get_stack_check_protect ()));
5537 }
5538 else if (maybe_gt (frame_size, 0))
5539 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5540 }
5541
5542 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5543 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5544
5545 /* In theory we should never have both an initial adjustment
5546 and a callee save adjustment. Verify that is the case since the
5547 code below does not handle it for -fstack-clash-protection. */
5548 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5549
5550 /* Will only probe if the initial adjustment is larger than the guard
5551 less the amount of the guard reserved for use by the caller's
5552 outgoing args. */
5553 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5554 true, false);
5555
5556 if (callee_adjust != 0)
5557 aarch64_push_regs (reg1, reg2, callee_adjust);
5558
5559 if (emit_frame_chain)
5560 {
5561 poly_int64 reg_offset = callee_adjust;
5562 if (callee_adjust == 0)
5563 {
5564 reg1 = R29_REGNUM;
5565 reg2 = R30_REGNUM;
5566 reg_offset = callee_offset;
5567 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5568 }
5569 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5570 stack_pointer_rtx, callee_offset,
5571 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5572 if (frame_pointer_needed && !frame_size.is_constant ())
5573 {
5574 /* Variable-sized frames need to describe the save slot
5575 address using DW_CFA_expression rather than DW_CFA_offset.
5576 This means that, without taking further action, the
5577 locations of the registers that we've already saved would
5578 remain based on the stack pointer even after we redefine
5579 the CFA based on the frame pointer. We therefore need new
5580 DW_CFA_expressions to re-express the save slots with addresses
5581 based on the frame pointer. */
5582 rtx_insn *insn = get_last_insn ();
5583 gcc_assert (RTX_FRAME_RELATED_P (insn));
5584
5585 /* Add an explicit CFA definition if this was previously
5586 implicit. */
5587 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5588 {
5589 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5590 callee_offset);
5591 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5592 gen_rtx_SET (hard_frame_pointer_rtx, src));
5593 }
5594
5595 /* Change the save slot expressions for the registers that
5596 we've already saved. */
5597 reg_offset -= callee_offset;
5598 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5599 reg_offset + UNITS_PER_WORD);
5600 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5601 reg_offset);
5602 }
5603 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5604 }
5605
5606 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5607 callee_adjust != 0 || emit_frame_chain);
5608 if (aarch64_simd_decl_p (cfun->decl))
5609 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5610 callee_adjust != 0 || emit_frame_chain);
5611 else
5612 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5613 callee_adjust != 0 || emit_frame_chain);
5614
5615 /* We may need to probe the final adjustment if it is larger than the guard
5616 that is assumed by the called. */
5617 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5618 !frame_pointer_needed, true);
5619 }
5620
5621 /* Return TRUE if we can use a simple_return insn.
5622
5623 This function checks whether the callee saved stack is empty, which
5624 means no restore actions are need. The pro_and_epilogue will use
5625 this to check whether shrink-wrapping opt is feasible. */
5626
5627 bool
5628 aarch64_use_return_insn_p (void)
5629 {
5630 if (!reload_completed)
5631 return false;
5632
5633 if (crtl->profile)
5634 return false;
5635
5636 return known_eq (cfun->machine->frame.frame_size, 0);
5637 }
5638
5639 /* Return false for non-leaf SIMD functions in order to avoid
5640 shrink-wrapping them. Doing this will lose the necessary
5641 save/restore of FP registers. */
5642
5643 bool
5644 aarch64_use_simple_return_insn_p (void)
5645 {
5646 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5647 return false;
5648
5649 return true;
5650 }
5651
5652 /* Generate the epilogue instructions for returning from a function.
5653 This is almost exactly the reverse of the prolog sequence, except
5654 that we need to insert barriers to avoid scheduling loads that read
5655 from a deallocated stack, and we optimize the unwind records by
5656 emitting them all together if possible. */
5657 void
5658 aarch64_expand_epilogue (bool for_sibcall)
5659 {
5660 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5661 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5662 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5663 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5664 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5665 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5666 rtx cfi_ops = NULL;
5667 rtx_insn *insn;
5668 /* A stack clash protection prologue may not have left EP0_REGNUM or
5669 EP1_REGNUM in a usable state. The same is true for allocations
5670 with an SVE component, since we then need both temporary registers
5671 for each allocation. For stack clash we are in a usable state if
5672 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5673 HOST_WIDE_INT guard_size
5674 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5675 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5676
5677 /* We can re-use the registers when the allocation amount is smaller than
5678 guard_size - guard_used_by_caller because we won't be doing any probes
5679 then. In such situations the register should remain live with the correct
5680 value. */
5681 bool can_inherit_p = (initial_adjust.is_constant ()
5682 && final_adjust.is_constant ())
5683 && (!flag_stack_clash_protection
5684 || known_lt (initial_adjust,
5685 guard_size - guard_used_by_caller));
5686
5687 /* We need to add memory barrier to prevent read from deallocated stack. */
5688 bool need_barrier_p
5689 = maybe_ne (get_frame_size ()
5690 + cfun->machine->frame.saved_varargs_size, 0);
5691
5692 /* Emit a barrier to prevent loads from a deallocated stack. */
5693 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5694 || cfun->calls_alloca
5695 || crtl->calls_eh_return)
5696 {
5697 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5698 need_barrier_p = false;
5699 }
5700
5701 /* Restore the stack pointer from the frame pointer if it may not
5702 be the same as the stack pointer. */
5703 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5704 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5705 if (frame_pointer_needed
5706 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5707 /* If writeback is used when restoring callee-saves, the CFA
5708 is restored on the instruction doing the writeback. */
5709 aarch64_add_offset (Pmode, stack_pointer_rtx,
5710 hard_frame_pointer_rtx, -callee_offset,
5711 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5712 else
5713 /* The case where we need to re-use the register here is very rare, so
5714 avoid the complicated condition and just always emit a move if the
5715 immediate doesn't fit. */
5716 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5717
5718 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5719 callee_adjust != 0, &cfi_ops);
5720 if (aarch64_simd_decl_p (cfun->decl))
5721 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5722 callee_adjust != 0, &cfi_ops);
5723 else
5724 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5725 callee_adjust != 0, &cfi_ops);
5726
5727 if (need_barrier_p)
5728 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5729
5730 if (callee_adjust != 0)
5731 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5732
5733 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5734 {
5735 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5736 insn = get_last_insn ();
5737 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5738 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5739 RTX_FRAME_RELATED_P (insn) = 1;
5740 cfi_ops = NULL;
5741 }
5742
5743 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
5744 add restriction on emit_move optimization to leaf functions. */
5745 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5746 (!can_inherit_p || !crtl->is_leaf
5747 || df_regs_ever_live_p (EP0_REGNUM)));
5748
5749 if (cfi_ops)
5750 {
5751 /* Emit delayed restores and reset the CFA to be SP. */
5752 insn = get_last_insn ();
5753 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5754 REG_NOTES (insn) = cfi_ops;
5755 RTX_FRAME_RELATED_P (insn) = 1;
5756 }
5757
5758 /* We prefer to emit the combined return/authenticate instruction RETAA,
5759 however there are three cases in which we must instead emit an explicit
5760 authentication instruction.
5761
5762 1) Sibcalls don't return in a normal way, so if we're about to call one
5763 we must authenticate.
5764
5765 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5766 generating code for !TARGET_ARMV8_3 we can't use it and must
5767 explicitly authenticate.
5768
5769 3) On an eh_return path we make extra stack adjustments to update the
5770 canonical frame address to be the exception handler's CFA. We want
5771 to authenticate using the CFA of the function which calls eh_return.
5772 */
5773 if (aarch64_return_address_signing_enabled ()
5774 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5775 {
5776 insn = emit_insn (gen_autisp ());
5777 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5778 RTX_FRAME_RELATED_P (insn) = 1;
5779 }
5780
5781 /* Stack adjustment for exception handler. */
5782 if (crtl->calls_eh_return)
5783 {
5784 /* We need to unwind the stack by the offset computed by
5785 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5786 to be SP; letting the CFA move during this adjustment
5787 is just as correct as retaining the CFA from the body
5788 of the function. Therefore, do nothing special. */
5789 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5790 }
5791
5792 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5793 if (!for_sibcall)
5794 emit_jump_insn (ret_rtx);
5795 }
5796
5797 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5798 normally or return to a previous frame after unwinding.
5799
5800 An EH return uses a single shared return sequence. The epilogue is
5801 exactly like a normal epilogue except that it has an extra input
5802 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5803 that must be applied after the frame has been destroyed. An extra label
5804 is inserted before the epilogue which initializes this register to zero,
5805 and this is the entry point for a normal return.
5806
5807 An actual EH return updates the return address, initializes the stack
5808 adjustment and jumps directly into the epilogue (bypassing the zeroing
5809 of the adjustment). Since the return address is typically saved on the
5810 stack when a function makes a call, the saved LR must be updated outside
5811 the epilogue.
5812
5813 This poses problems as the store is generated well before the epilogue,
5814 so the offset of LR is not known yet. Also optimizations will remove the
5815 store as it appears dead, even after the epilogue is generated (as the
5816 base or offset for loading LR is different in many cases).
5817
5818 To avoid these problems this implementation forces the frame pointer
5819 in eh_return functions so that the location of LR is fixed and known early.
5820 It also marks the store volatile, so no optimization is permitted to
5821 remove the store. */
5822 rtx
5823 aarch64_eh_return_handler_rtx (void)
5824 {
5825 rtx tmp = gen_frame_mem (Pmode,
5826 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5827
5828 /* Mark the store volatile, so no optimization is permitted to remove it. */
5829 MEM_VOLATILE_P (tmp) = true;
5830 return tmp;
5831 }
5832
5833 /* Output code to add DELTA to the first argument, and then jump
5834 to FUNCTION. Used for C++ multiple inheritance. */
5835 static void
5836 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5837 HOST_WIDE_INT delta,
5838 HOST_WIDE_INT vcall_offset,
5839 tree function)
5840 {
5841 /* The this pointer is always in x0. Note that this differs from
5842 Arm where the this pointer maybe bumped to r1 if r0 is required
5843 to return a pointer to an aggregate. On AArch64 a result value
5844 pointer will be in x8. */
5845 int this_regno = R0_REGNUM;
5846 rtx this_rtx, temp0, temp1, addr, funexp;
5847 rtx_insn *insn;
5848
5849 reload_completed = 1;
5850 emit_note (NOTE_INSN_PROLOGUE_END);
5851
5852 this_rtx = gen_rtx_REG (Pmode, this_regno);
5853 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
5854 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
5855
5856 if (vcall_offset == 0)
5857 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5858 else
5859 {
5860 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5861
5862 addr = this_rtx;
5863 if (delta != 0)
5864 {
5865 if (delta >= -256 && delta < 256)
5866 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5867 plus_constant (Pmode, this_rtx, delta));
5868 else
5869 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5870 temp1, temp0, false);
5871 }
5872
5873 if (Pmode == ptr_mode)
5874 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5875 else
5876 aarch64_emit_move (temp0,
5877 gen_rtx_ZERO_EXTEND (Pmode,
5878 gen_rtx_MEM (ptr_mode, addr)));
5879
5880 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5881 addr = plus_constant (Pmode, temp0, vcall_offset);
5882 else
5883 {
5884 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5885 Pmode);
5886 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5887 }
5888
5889 if (Pmode == ptr_mode)
5890 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5891 else
5892 aarch64_emit_move (temp1,
5893 gen_rtx_SIGN_EXTEND (Pmode,
5894 gen_rtx_MEM (ptr_mode, addr)));
5895
5896 emit_insn (gen_add2_insn (this_rtx, temp1));
5897 }
5898
5899 /* Generate a tail call to the target function. */
5900 if (!TREE_USED (function))
5901 {
5902 assemble_external (function);
5903 TREE_USED (function) = 1;
5904 }
5905 funexp = XEXP (DECL_RTL (function), 0);
5906 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5907 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5908 SIBLING_CALL_P (insn) = 1;
5909
5910 insn = get_insns ();
5911 shorten_branches (insn);
5912 final_start_function (insn, file, 1);
5913 final (insn, file, 1);
5914 final_end_function ();
5915
5916 /* Stop pretending to be a post-reload pass. */
5917 reload_completed = 0;
5918 }
5919
5920 static bool
5921 aarch64_tls_referenced_p (rtx x)
5922 {
5923 if (!TARGET_HAVE_TLS)
5924 return false;
5925 subrtx_iterator::array_type array;
5926 FOR_EACH_SUBRTX (iter, array, x, ALL)
5927 {
5928 const_rtx x = *iter;
5929 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5930 return true;
5931 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5932 TLS offsets, not real symbol references. */
5933 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5934 iter.skip_subrtxes ();
5935 }
5936 return false;
5937 }
5938
5939
5940 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5941 a left shift of 0 or 12 bits. */
5942 bool
5943 aarch64_uimm12_shift (HOST_WIDE_INT val)
5944 {
5945 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5946 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5947 );
5948 }
5949
5950 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
5951 that can be created with a left shift of 0 or 12. */
5952 static HOST_WIDE_INT
5953 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
5954 {
5955 /* Check to see if the value fits in 24 bits, as that is the maximum we can
5956 handle correctly. */
5957 gcc_assert ((val & 0xffffff) == val);
5958
5959 if (((val & 0xfff) << 0) == val)
5960 return val;
5961
5962 return val & (0xfff << 12);
5963 }
5964
5965 /* Return true if val is an immediate that can be loaded into a
5966 register by a MOVZ instruction. */
5967 static bool
5968 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5969 {
5970 if (GET_MODE_SIZE (mode) > 4)
5971 {
5972 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5973 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5974 return 1;
5975 }
5976 else
5977 {
5978 /* Ignore sign extension. */
5979 val &= (HOST_WIDE_INT) 0xffffffff;
5980 }
5981 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5982 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5983 }
5984
5985 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5986 64-bit (DImode) integer. */
5987
5988 static unsigned HOST_WIDE_INT
5989 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5990 {
5991 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5992 while (size < 64)
5993 {
5994 val &= (HOST_WIDE_INT_1U << size) - 1;
5995 val |= val << size;
5996 size *= 2;
5997 }
5998 return val;
5999 }
6000
6001 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6002
6003 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6004 {
6005 0x0000000100000001ull,
6006 0x0001000100010001ull,
6007 0x0101010101010101ull,
6008 0x1111111111111111ull,
6009 0x5555555555555555ull,
6010 };
6011
6012
6013 /* Return true if val is a valid bitmask immediate. */
6014
6015 bool
6016 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6017 {
6018 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6019 int bits;
6020
6021 /* Check for a single sequence of one bits and return quickly if so.
6022 The special cases of all ones and all zeroes returns false. */
6023 val = aarch64_replicate_bitmask_imm (val_in, mode);
6024 tmp = val + (val & -val);
6025
6026 if (tmp == (tmp & -tmp))
6027 return (val + 1) > 1;
6028
6029 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6030 if (mode == SImode)
6031 val = (val << 32) | (val & 0xffffffff);
6032
6033 /* Invert if the immediate doesn't start with a zero bit - this means we
6034 only need to search for sequences of one bits. */
6035 if (val & 1)
6036 val = ~val;
6037
6038 /* Find the first set bit and set tmp to val with the first sequence of one
6039 bits removed. Return success if there is a single sequence of ones. */
6040 first_one = val & -val;
6041 tmp = val & (val + first_one);
6042
6043 if (tmp == 0)
6044 return true;
6045
6046 /* Find the next set bit and compute the difference in bit position. */
6047 next_one = tmp & -tmp;
6048 bits = clz_hwi (first_one) - clz_hwi (next_one);
6049 mask = val ^ tmp;
6050
6051 /* Check the bit position difference is a power of 2, and that the first
6052 sequence of one bits fits within 'bits' bits. */
6053 if ((mask >> bits) != 0 || bits != (bits & -bits))
6054 return false;
6055
6056 /* Check the sequence of one bits is repeated 64/bits times. */
6057 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6058 }
6059
6060 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6061 Assumed precondition: VAL_IN Is not zero. */
6062
6063 unsigned HOST_WIDE_INT
6064 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6065 {
6066 int lowest_bit_set = ctz_hwi (val_in);
6067 int highest_bit_set = floor_log2 (val_in);
6068 gcc_assert (val_in != 0);
6069
6070 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6071 (HOST_WIDE_INT_1U << lowest_bit_set));
6072 }
6073
6074 /* Create constant where bits outside of lowest bit set to highest bit set
6075 are set to 1. */
6076
6077 unsigned HOST_WIDE_INT
6078 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6079 {
6080 return val_in | ~aarch64_and_split_imm1 (val_in);
6081 }
6082
6083 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6084
6085 bool
6086 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6087 {
6088 scalar_int_mode int_mode;
6089 if (!is_a <scalar_int_mode> (mode, &int_mode))
6090 return false;
6091
6092 if (aarch64_bitmask_imm (val_in, int_mode))
6093 return false;
6094
6095 if (aarch64_move_imm (val_in, int_mode))
6096 return false;
6097
6098 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6099
6100 return aarch64_bitmask_imm (imm2, int_mode);
6101 }
6102
6103 /* Return true if val is an immediate that can be loaded into a
6104 register in a single instruction. */
6105 bool
6106 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6107 {
6108 scalar_int_mode int_mode;
6109 if (!is_a <scalar_int_mode> (mode, &int_mode))
6110 return false;
6111
6112 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6113 return 1;
6114 return aarch64_bitmask_imm (val, int_mode);
6115 }
6116
6117 static bool
6118 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6119 {
6120 rtx base, offset;
6121
6122 if (GET_CODE (x) == HIGH)
6123 return true;
6124
6125 /* There's no way to calculate VL-based values using relocations. */
6126 subrtx_iterator::array_type array;
6127 FOR_EACH_SUBRTX (iter, array, x, ALL)
6128 if (GET_CODE (*iter) == CONST_POLY_INT)
6129 return true;
6130
6131 split_const (x, &base, &offset);
6132 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6133 {
6134 if (aarch64_classify_symbol (base, INTVAL (offset))
6135 != SYMBOL_FORCE_TO_MEM)
6136 return true;
6137 else
6138 /* Avoid generating a 64-bit relocation in ILP32; leave
6139 to aarch64_expand_mov_immediate to handle it properly. */
6140 return mode != ptr_mode;
6141 }
6142
6143 return aarch64_tls_referenced_p (x);
6144 }
6145
6146 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6147 The expansion for a table switch is quite expensive due to the number
6148 of instructions, the table lookup and hard to predict indirect jump.
6149 When optimizing for speed, and -O3 enabled, use the per-core tuning if
6150 set, otherwise use tables for > 16 cases as a tradeoff between size and
6151 performance. When optimizing for size, use the default setting. */
6152
6153 static unsigned int
6154 aarch64_case_values_threshold (void)
6155 {
6156 /* Use the specified limit for the number of cases before using jump
6157 tables at higher optimization levels. */
6158 if (optimize > 2
6159 && selected_cpu->tune->max_case_values != 0)
6160 return selected_cpu->tune->max_case_values;
6161 else
6162 return optimize_size ? default_case_values_threshold () : 17;
6163 }
6164
6165 /* Return true if register REGNO is a valid index register.
6166 STRICT_P is true if REG_OK_STRICT is in effect. */
6167
6168 bool
6169 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6170 {
6171 if (!HARD_REGISTER_NUM_P (regno))
6172 {
6173 if (!strict_p)
6174 return true;
6175
6176 if (!reg_renumber)
6177 return false;
6178
6179 regno = reg_renumber[regno];
6180 }
6181 return GP_REGNUM_P (regno);
6182 }
6183
6184 /* Return true if register REGNO is a valid base register for mode MODE.
6185 STRICT_P is true if REG_OK_STRICT is in effect. */
6186
6187 bool
6188 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6189 {
6190 if (!HARD_REGISTER_NUM_P (regno))
6191 {
6192 if (!strict_p)
6193 return true;
6194
6195 if (!reg_renumber)
6196 return false;
6197
6198 regno = reg_renumber[regno];
6199 }
6200
6201 /* The fake registers will be eliminated to either the stack or
6202 hard frame pointer, both of which are usually valid base registers.
6203 Reload deals with the cases where the eliminated form isn't valid. */
6204 return (GP_REGNUM_P (regno)
6205 || regno == SP_REGNUM
6206 || regno == FRAME_POINTER_REGNUM
6207 || regno == ARG_POINTER_REGNUM);
6208 }
6209
6210 /* Return true if X is a valid base register for mode MODE.
6211 STRICT_P is true if REG_OK_STRICT is in effect. */
6212
6213 static bool
6214 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6215 {
6216 if (!strict_p
6217 && GET_CODE (x) == SUBREG
6218 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6219 x = SUBREG_REG (x);
6220
6221 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6222 }
6223
6224 /* Return true if address offset is a valid index. If it is, fill in INFO
6225 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6226
6227 static bool
6228 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6229 machine_mode mode, bool strict_p)
6230 {
6231 enum aarch64_address_type type;
6232 rtx index;
6233 int shift;
6234
6235 /* (reg:P) */
6236 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6237 && GET_MODE (x) == Pmode)
6238 {
6239 type = ADDRESS_REG_REG;
6240 index = x;
6241 shift = 0;
6242 }
6243 /* (sign_extend:DI (reg:SI)) */
6244 else if ((GET_CODE (x) == SIGN_EXTEND
6245 || GET_CODE (x) == ZERO_EXTEND)
6246 && GET_MODE (x) == DImode
6247 && GET_MODE (XEXP (x, 0)) == SImode)
6248 {
6249 type = (GET_CODE (x) == SIGN_EXTEND)
6250 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6251 index = XEXP (x, 0);
6252 shift = 0;
6253 }
6254 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6255 else if (GET_CODE (x) == MULT
6256 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6257 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6258 && GET_MODE (XEXP (x, 0)) == DImode
6259 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6260 && CONST_INT_P (XEXP (x, 1)))
6261 {
6262 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6263 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6264 index = XEXP (XEXP (x, 0), 0);
6265 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6266 }
6267 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6268 else if (GET_CODE (x) == ASHIFT
6269 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6270 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6271 && GET_MODE (XEXP (x, 0)) == DImode
6272 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6273 && CONST_INT_P (XEXP (x, 1)))
6274 {
6275 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6276 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6277 index = XEXP (XEXP (x, 0), 0);
6278 shift = INTVAL (XEXP (x, 1));
6279 }
6280 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6281 else if ((GET_CODE (x) == SIGN_EXTRACT
6282 || GET_CODE (x) == ZERO_EXTRACT)
6283 && GET_MODE (x) == DImode
6284 && GET_CODE (XEXP (x, 0)) == MULT
6285 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6286 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6287 {
6288 type = (GET_CODE (x) == SIGN_EXTRACT)
6289 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6290 index = XEXP (XEXP (x, 0), 0);
6291 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6292 if (INTVAL (XEXP (x, 1)) != 32 + shift
6293 || INTVAL (XEXP (x, 2)) != 0)
6294 shift = -1;
6295 }
6296 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6297 (const_int 0xffffffff<<shift)) */
6298 else if (GET_CODE (x) == AND
6299 && GET_MODE (x) == DImode
6300 && GET_CODE (XEXP (x, 0)) == MULT
6301 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6302 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6303 && CONST_INT_P (XEXP (x, 1)))
6304 {
6305 type = ADDRESS_REG_UXTW;
6306 index = XEXP (XEXP (x, 0), 0);
6307 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6308 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6309 shift = -1;
6310 }
6311 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6312 else if ((GET_CODE (x) == SIGN_EXTRACT
6313 || GET_CODE (x) == ZERO_EXTRACT)
6314 && GET_MODE (x) == DImode
6315 && GET_CODE (XEXP (x, 0)) == ASHIFT
6316 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6317 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6318 {
6319 type = (GET_CODE (x) == SIGN_EXTRACT)
6320 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6321 index = XEXP (XEXP (x, 0), 0);
6322 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6323 if (INTVAL (XEXP (x, 1)) != 32 + shift
6324 || INTVAL (XEXP (x, 2)) != 0)
6325 shift = -1;
6326 }
6327 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6328 (const_int 0xffffffff<<shift)) */
6329 else if (GET_CODE (x) == AND
6330 && GET_MODE (x) == DImode
6331 && GET_CODE (XEXP (x, 0)) == ASHIFT
6332 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6333 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6334 && CONST_INT_P (XEXP (x, 1)))
6335 {
6336 type = ADDRESS_REG_UXTW;
6337 index = XEXP (XEXP (x, 0), 0);
6338 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6339 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6340 shift = -1;
6341 }
6342 /* (mult:P (reg:P) (const_int scale)) */
6343 else if (GET_CODE (x) == MULT
6344 && GET_MODE (x) == Pmode
6345 && GET_MODE (XEXP (x, 0)) == Pmode
6346 && CONST_INT_P (XEXP (x, 1)))
6347 {
6348 type = ADDRESS_REG_REG;
6349 index = XEXP (x, 0);
6350 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6351 }
6352 /* (ashift:P (reg:P) (const_int shift)) */
6353 else if (GET_CODE (x) == ASHIFT
6354 && GET_MODE (x) == Pmode
6355 && GET_MODE (XEXP (x, 0)) == Pmode
6356 && CONST_INT_P (XEXP (x, 1)))
6357 {
6358 type = ADDRESS_REG_REG;
6359 index = XEXP (x, 0);
6360 shift = INTVAL (XEXP (x, 1));
6361 }
6362 else
6363 return false;
6364
6365 if (!strict_p
6366 && GET_CODE (index) == SUBREG
6367 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6368 index = SUBREG_REG (index);
6369
6370 if (aarch64_sve_data_mode_p (mode))
6371 {
6372 if (type != ADDRESS_REG_REG
6373 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6374 return false;
6375 }
6376 else
6377 {
6378 if (shift != 0
6379 && !(IN_RANGE (shift, 1, 3)
6380 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6381 return false;
6382 }
6383
6384 if (REG_P (index)
6385 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6386 {
6387 info->type = type;
6388 info->offset = index;
6389 info->shift = shift;
6390 return true;
6391 }
6392
6393 return false;
6394 }
6395
6396 /* Return true if MODE is one of the modes for which we
6397 support LDP/STP operations. */
6398
6399 static bool
6400 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6401 {
6402 return mode == SImode || mode == DImode
6403 || mode == SFmode || mode == DFmode
6404 || (aarch64_vector_mode_supported_p (mode)
6405 && (known_eq (GET_MODE_SIZE (mode), 8)
6406 || (known_eq (GET_MODE_SIZE (mode), 16)
6407 && (aarch64_tune_params.extra_tuning_flags
6408 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6409 }
6410
6411 /* Return true if REGNO is a virtual pointer register, or an eliminable
6412 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6413 include stack_pointer or hard_frame_pointer. */
6414 static bool
6415 virt_or_elim_regno_p (unsigned regno)
6416 {
6417 return ((regno >= FIRST_VIRTUAL_REGISTER
6418 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6419 || regno == FRAME_POINTER_REGNUM
6420 || regno == ARG_POINTER_REGNUM);
6421 }
6422
6423 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6424 If it is, fill in INFO appropriately. STRICT_P is true if
6425 REG_OK_STRICT is in effect. */
6426
6427 bool
6428 aarch64_classify_address (struct aarch64_address_info *info,
6429 rtx x, machine_mode mode, bool strict_p,
6430 aarch64_addr_query_type type)
6431 {
6432 enum rtx_code code = GET_CODE (x);
6433 rtx op0, op1;
6434 poly_int64 offset;
6435
6436 HOST_WIDE_INT const_size;
6437
6438 /* On BE, we use load/store pair for all large int mode load/stores.
6439 TI/TFmode may also use a load/store pair. */
6440 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6441 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6442 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6443 || type == ADDR_QUERY_LDP_STP_N
6444 || mode == TImode
6445 || mode == TFmode
6446 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6447
6448 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
6449 corresponds to the actual size of the memory being loaded/stored and the
6450 mode of the corresponding addressing mode is half of that. */
6451 if (type == ADDR_QUERY_LDP_STP_N
6452 && known_eq (GET_MODE_SIZE (mode), 16))
6453 mode = DFmode;
6454
6455 bool allow_reg_index_p = (!load_store_pair_p
6456 && (known_lt (GET_MODE_SIZE (mode), 16)
6457 || vec_flags == VEC_ADVSIMD
6458 || vec_flags == VEC_SVE_DATA));
6459
6460 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6461 [Rn, #offset, MUL VL]. */
6462 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6463 && (code != REG && code != PLUS))
6464 return false;
6465
6466 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6467 REG addressing. */
6468 if (advsimd_struct_p
6469 && !BYTES_BIG_ENDIAN
6470 && (code != POST_INC && code != REG))
6471 return false;
6472
6473 gcc_checking_assert (GET_MODE (x) == VOIDmode
6474 || SCALAR_INT_MODE_P (GET_MODE (x)));
6475
6476 switch (code)
6477 {
6478 case REG:
6479 case SUBREG:
6480 info->type = ADDRESS_REG_IMM;
6481 info->base = x;
6482 info->offset = const0_rtx;
6483 info->const_offset = 0;
6484 return aarch64_base_register_rtx_p (x, strict_p);
6485
6486 case PLUS:
6487 op0 = XEXP (x, 0);
6488 op1 = XEXP (x, 1);
6489
6490 if (! strict_p
6491 && REG_P (op0)
6492 && virt_or_elim_regno_p (REGNO (op0))
6493 && poly_int_rtx_p (op1, &offset))
6494 {
6495 info->type = ADDRESS_REG_IMM;
6496 info->base = op0;
6497 info->offset = op1;
6498 info->const_offset = offset;
6499
6500 return true;
6501 }
6502
6503 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6504 && aarch64_base_register_rtx_p (op0, strict_p)
6505 && poly_int_rtx_p (op1, &offset))
6506 {
6507 info->type = ADDRESS_REG_IMM;
6508 info->base = op0;
6509 info->offset = op1;
6510 info->const_offset = offset;
6511
6512 /* TImode and TFmode values are allowed in both pairs of X
6513 registers and individual Q registers. The available
6514 address modes are:
6515 X,X: 7-bit signed scaled offset
6516 Q: 9-bit signed offset
6517 We conservatively require an offset representable in either mode.
6518 When performing the check for pairs of X registers i.e. LDP/STP
6519 pass down DImode since that is the natural size of the LDP/STP
6520 instruction memory accesses. */
6521 if (mode == TImode || mode == TFmode)
6522 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6523 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6524 || offset_12bit_unsigned_scaled_p (mode, offset)));
6525
6526 /* A 7bit offset check because OImode will emit a ldp/stp
6527 instruction (only big endian will get here).
6528 For ldp/stp instructions, the offset is scaled for the size of a
6529 single element of the pair. */
6530 if (mode == OImode)
6531 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6532
6533 /* Three 9/12 bit offsets checks because CImode will emit three
6534 ldr/str instructions (only big endian will get here). */
6535 if (mode == CImode)
6536 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6537 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6538 offset + 32)
6539 || offset_12bit_unsigned_scaled_p (V16QImode,
6540 offset + 32)));
6541
6542 /* Two 7bit offsets checks because XImode will emit two ldp/stp
6543 instructions (only big endian will get here). */
6544 if (mode == XImode)
6545 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6546 && aarch64_offset_7bit_signed_scaled_p (TImode,
6547 offset + 32));
6548
6549 /* Make "m" use the LD1 offset range for SVE data modes, so
6550 that pre-RTL optimizers like ivopts will work to that
6551 instead of the wider LDR/STR range. */
6552 if (vec_flags == VEC_SVE_DATA)
6553 return (type == ADDR_QUERY_M
6554 ? offset_4bit_signed_scaled_p (mode, offset)
6555 : offset_9bit_signed_scaled_p (mode, offset));
6556
6557 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6558 {
6559 poly_int64 end_offset = (offset
6560 + GET_MODE_SIZE (mode)
6561 - BYTES_PER_SVE_VECTOR);
6562 return (type == ADDR_QUERY_M
6563 ? offset_4bit_signed_scaled_p (mode, offset)
6564 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6565 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6566 end_offset)));
6567 }
6568
6569 if (vec_flags == VEC_SVE_PRED)
6570 return offset_9bit_signed_scaled_p (mode, offset);
6571
6572 if (load_store_pair_p)
6573 return ((known_eq (GET_MODE_SIZE (mode), 4)
6574 || known_eq (GET_MODE_SIZE (mode), 8)
6575 || known_eq (GET_MODE_SIZE (mode), 16))
6576 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6577 else
6578 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6579 || offset_12bit_unsigned_scaled_p (mode, offset));
6580 }
6581
6582 if (allow_reg_index_p)
6583 {
6584 /* Look for base + (scaled/extended) index register. */
6585 if (aarch64_base_register_rtx_p (op0, strict_p)
6586 && aarch64_classify_index (info, op1, mode, strict_p))
6587 {
6588 info->base = op0;
6589 return true;
6590 }
6591 if (aarch64_base_register_rtx_p (op1, strict_p)
6592 && aarch64_classify_index (info, op0, mode, strict_p))
6593 {
6594 info->base = op1;
6595 return true;
6596 }
6597 }
6598
6599 return false;
6600
6601 case POST_INC:
6602 case POST_DEC:
6603 case PRE_INC:
6604 case PRE_DEC:
6605 info->type = ADDRESS_REG_WB;
6606 info->base = XEXP (x, 0);
6607 info->offset = NULL_RTX;
6608 return aarch64_base_register_rtx_p (info->base, strict_p);
6609
6610 case POST_MODIFY:
6611 case PRE_MODIFY:
6612 info->type = ADDRESS_REG_WB;
6613 info->base = XEXP (x, 0);
6614 if (GET_CODE (XEXP (x, 1)) == PLUS
6615 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6616 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6617 && aarch64_base_register_rtx_p (info->base, strict_p))
6618 {
6619 info->offset = XEXP (XEXP (x, 1), 1);
6620 info->const_offset = offset;
6621
6622 /* TImode and TFmode values are allowed in both pairs of X
6623 registers and individual Q registers. The available
6624 address modes are:
6625 X,X: 7-bit signed scaled offset
6626 Q: 9-bit signed offset
6627 We conservatively require an offset representable in either mode.
6628 */
6629 if (mode == TImode || mode == TFmode)
6630 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6631 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6632
6633 if (load_store_pair_p)
6634 return ((known_eq (GET_MODE_SIZE (mode), 4)
6635 || known_eq (GET_MODE_SIZE (mode), 8)
6636 || known_eq (GET_MODE_SIZE (mode), 16))
6637 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6638 else
6639 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6640 }
6641 return false;
6642
6643 case CONST:
6644 case SYMBOL_REF:
6645 case LABEL_REF:
6646 /* load literal: pc-relative constant pool entry. Only supported
6647 for SI mode or larger. */
6648 info->type = ADDRESS_SYMBOLIC;
6649
6650 if (!load_store_pair_p
6651 && GET_MODE_SIZE (mode).is_constant (&const_size)
6652 && const_size >= 4)
6653 {
6654 rtx sym, addend;
6655
6656 split_const (x, &sym, &addend);
6657 return ((GET_CODE (sym) == LABEL_REF
6658 || (GET_CODE (sym) == SYMBOL_REF
6659 && CONSTANT_POOL_ADDRESS_P (sym)
6660 && aarch64_pcrelative_literal_loads)));
6661 }
6662 return false;
6663
6664 case LO_SUM:
6665 info->type = ADDRESS_LO_SUM;
6666 info->base = XEXP (x, 0);
6667 info->offset = XEXP (x, 1);
6668 if (allow_reg_index_p
6669 && aarch64_base_register_rtx_p (info->base, strict_p))
6670 {
6671 rtx sym, offs;
6672 split_const (info->offset, &sym, &offs);
6673 if (GET_CODE (sym) == SYMBOL_REF
6674 && (aarch64_classify_symbol (sym, INTVAL (offs))
6675 == SYMBOL_SMALL_ABSOLUTE))
6676 {
6677 /* The symbol and offset must be aligned to the access size. */
6678 unsigned int align;
6679
6680 if (CONSTANT_POOL_ADDRESS_P (sym))
6681 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6682 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6683 {
6684 tree exp = SYMBOL_REF_DECL (sym);
6685 align = TYPE_ALIGN (TREE_TYPE (exp));
6686 align = aarch64_constant_alignment (exp, align);
6687 }
6688 else if (SYMBOL_REF_DECL (sym))
6689 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6690 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6691 && SYMBOL_REF_BLOCK (sym) != NULL)
6692 align = SYMBOL_REF_BLOCK (sym)->alignment;
6693 else
6694 align = BITS_PER_UNIT;
6695
6696 poly_int64 ref_size = GET_MODE_SIZE (mode);
6697 if (known_eq (ref_size, 0))
6698 ref_size = GET_MODE_SIZE (DImode);
6699
6700 return (multiple_p (INTVAL (offs), ref_size)
6701 && multiple_p (align / BITS_PER_UNIT, ref_size));
6702 }
6703 }
6704 return false;
6705
6706 default:
6707 return false;
6708 }
6709 }
6710
6711 /* Return true if the address X is valid for a PRFM instruction.
6712 STRICT_P is true if we should do strict checking with
6713 aarch64_classify_address. */
6714
6715 bool
6716 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6717 {
6718 struct aarch64_address_info addr;
6719
6720 /* PRFM accepts the same addresses as DImode... */
6721 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6722 if (!res)
6723 return false;
6724
6725 /* ... except writeback forms. */
6726 return addr.type != ADDRESS_REG_WB;
6727 }
6728
6729 bool
6730 aarch64_symbolic_address_p (rtx x)
6731 {
6732 rtx offset;
6733
6734 split_const (x, &x, &offset);
6735 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6736 }
6737
6738 /* Classify the base of symbolic expression X. */
6739
6740 enum aarch64_symbol_type
6741 aarch64_classify_symbolic_expression (rtx x)
6742 {
6743 rtx offset;
6744
6745 split_const (x, &x, &offset);
6746 return aarch64_classify_symbol (x, INTVAL (offset));
6747 }
6748
6749
6750 /* Return TRUE if X is a legitimate address for accessing memory in
6751 mode MODE. */
6752 static bool
6753 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6754 {
6755 struct aarch64_address_info addr;
6756
6757 return aarch64_classify_address (&addr, x, mode, strict_p);
6758 }
6759
6760 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6761 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6762 bool
6763 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6764 aarch64_addr_query_type type)
6765 {
6766 struct aarch64_address_info addr;
6767
6768 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6769 }
6770
6771 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6772
6773 static bool
6774 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6775 poly_int64 orig_offset,
6776 machine_mode mode)
6777 {
6778 HOST_WIDE_INT size;
6779 if (GET_MODE_SIZE (mode).is_constant (&size))
6780 {
6781 HOST_WIDE_INT const_offset, second_offset;
6782
6783 /* A general SVE offset is A * VQ + B. Remove the A component from
6784 coefficient 0 in order to get the constant B. */
6785 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6786
6787 /* Split an out-of-range address displacement into a base and
6788 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6789 range otherwise to increase opportunities for sharing the base
6790 address of different sizes. Unaligned accesses use the signed
6791 9-bit range, TImode/TFmode use the intersection of signed
6792 scaled 7-bit and signed 9-bit offset. */
6793 if (mode == TImode || mode == TFmode)
6794 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6795 else if ((const_offset & (size - 1)) != 0)
6796 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6797 else
6798 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6799
6800 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6801 return false;
6802
6803 /* Split the offset into second_offset and the rest. */
6804 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6805 *offset2 = gen_int_mode (second_offset, Pmode);
6806 return true;
6807 }
6808 else
6809 {
6810 /* Get the mode we should use as the basis of the range. For structure
6811 modes this is the mode of one vector. */
6812 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6813 machine_mode step_mode
6814 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6815
6816 /* Get the "mul vl" multiplier we'd like to use. */
6817 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6818 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6819 if (vec_flags & VEC_SVE_DATA)
6820 /* LDR supports a 9-bit range, but the move patterns for
6821 structure modes require all vectors to be in range of the
6822 same base. The simplest way of accomodating that while still
6823 promoting reuse of anchor points between different modes is
6824 to use an 8-bit range unconditionally. */
6825 vnum = ((vnum + 128) & 255) - 128;
6826 else
6827 /* Predicates are only handled singly, so we might as well use
6828 the full range. */
6829 vnum = ((vnum + 256) & 511) - 256;
6830 if (vnum == 0)
6831 return false;
6832
6833 /* Convert the "mul vl" multiplier into a byte offset. */
6834 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6835 if (known_eq (second_offset, orig_offset))
6836 return false;
6837
6838 /* Split the offset into second_offset and the rest. */
6839 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6840 *offset2 = gen_int_mode (second_offset, Pmode);
6841 return true;
6842 }
6843 }
6844
6845 /* Return the binary representation of floating point constant VALUE in INTVAL.
6846 If the value cannot be converted, return false without setting INTVAL.
6847 The conversion is done in the given MODE. */
6848 bool
6849 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6850 {
6851
6852 /* We make a general exception for 0. */
6853 if (aarch64_float_const_zero_rtx_p (value))
6854 {
6855 *intval = 0;
6856 return true;
6857 }
6858
6859 scalar_float_mode mode;
6860 if (GET_CODE (value) != CONST_DOUBLE
6861 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6862 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6863 /* Only support up to DF mode. */
6864 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6865 return false;
6866
6867 unsigned HOST_WIDE_INT ival = 0;
6868
6869 long res[2];
6870 real_to_target (res,
6871 CONST_DOUBLE_REAL_VALUE (value),
6872 REAL_MODE_FORMAT (mode));
6873
6874 if (mode == DFmode)
6875 {
6876 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6877 ival = zext_hwi (res[order], 32);
6878 ival |= (zext_hwi (res[1 - order], 32) << 32);
6879 }
6880 else
6881 ival = zext_hwi (res[0], 32);
6882
6883 *intval = ival;
6884 return true;
6885 }
6886
6887 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6888 single MOV(+MOVK) followed by an FMOV. */
6889 bool
6890 aarch64_float_const_rtx_p (rtx x)
6891 {
6892 machine_mode mode = GET_MODE (x);
6893 if (mode == VOIDmode)
6894 return false;
6895
6896 /* Determine whether it's cheaper to write float constants as
6897 mov/movk pairs over ldr/adrp pairs. */
6898 unsigned HOST_WIDE_INT ival;
6899
6900 if (GET_CODE (x) == CONST_DOUBLE
6901 && SCALAR_FLOAT_MODE_P (mode)
6902 && aarch64_reinterpret_float_as_int (x, &ival))
6903 {
6904 scalar_int_mode imode = (mode == HFmode
6905 ? SImode
6906 : int_mode_for_mode (mode).require ());
6907 int num_instr = aarch64_internal_mov_immediate
6908 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6909 return num_instr < 3;
6910 }
6911
6912 return false;
6913 }
6914
6915 /* Return TRUE if rtx X is immediate constant 0.0 */
6916 bool
6917 aarch64_float_const_zero_rtx_p (rtx x)
6918 {
6919 if (GET_MODE (x) == VOIDmode)
6920 return false;
6921
6922 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6923 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6924 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6925 }
6926
6927 /* Return TRUE if rtx X is immediate constant that fits in a single
6928 MOVI immediate operation. */
6929 bool
6930 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6931 {
6932 if (!TARGET_SIMD)
6933 return false;
6934
6935 machine_mode vmode;
6936 scalar_int_mode imode;
6937 unsigned HOST_WIDE_INT ival;
6938
6939 if (GET_CODE (x) == CONST_DOUBLE
6940 && SCALAR_FLOAT_MODE_P (mode))
6941 {
6942 if (!aarch64_reinterpret_float_as_int (x, &ival))
6943 return false;
6944
6945 /* We make a general exception for 0. */
6946 if (aarch64_float_const_zero_rtx_p (x))
6947 return true;
6948
6949 imode = int_mode_for_mode (mode).require ();
6950 }
6951 else if (GET_CODE (x) == CONST_INT
6952 && is_a <scalar_int_mode> (mode, &imode))
6953 ival = INTVAL (x);
6954 else
6955 return false;
6956
6957 /* use a 64 bit mode for everything except for DI/DF mode, where we use
6958 a 128 bit vector mode. */
6959 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6960
6961 vmode = aarch64_simd_container_mode (imode, width);
6962 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6963
6964 return aarch64_simd_valid_immediate (v_op, NULL);
6965 }
6966
6967
6968 /* Return the fixed registers used for condition codes. */
6969
6970 static bool
6971 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6972 {
6973 *p1 = CC_REGNUM;
6974 *p2 = INVALID_REGNUM;
6975 return true;
6976 }
6977
6978 /* This function is used by the call expanders of the machine description.
6979 RESULT is the register in which the result is returned. It's NULL for
6980 "call" and "sibcall".
6981 MEM is the location of the function call.
6982 SIBCALL indicates whether this function call is normal call or sibling call.
6983 It will generate different pattern accordingly. */
6984
6985 void
6986 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6987 {
6988 rtx call, callee, tmp;
6989 rtvec vec;
6990 machine_mode mode;
6991
6992 gcc_assert (MEM_P (mem));
6993 callee = XEXP (mem, 0);
6994 mode = GET_MODE (callee);
6995 gcc_assert (mode == Pmode);
6996
6997 /* Decide if we should generate indirect calls by loading the
6998 address of the callee into a register before performing
6999 the branch-and-link. */
7000 if (SYMBOL_REF_P (callee)
7001 ? (aarch64_is_long_call_p (callee)
7002 || aarch64_is_noplt_call_p (callee))
7003 : !REG_P (callee))
7004 XEXP (mem, 0) = force_reg (mode, callee);
7005
7006 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7007
7008 if (result != NULL_RTX)
7009 call = gen_rtx_SET (result, call);
7010
7011 if (sibcall)
7012 tmp = ret_rtx;
7013 else
7014 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7015
7016 vec = gen_rtvec (2, call, tmp);
7017 call = gen_rtx_PARALLEL (VOIDmode, vec);
7018
7019 aarch64_emit_call_insn (call);
7020 }
7021
7022 /* Emit call insn with PAT and do aarch64-specific handling. */
7023
7024 void
7025 aarch64_emit_call_insn (rtx pat)
7026 {
7027 rtx insn = emit_call_insn (pat);
7028
7029 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7030 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7031 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7032 }
7033
7034 machine_mode
7035 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7036 {
7037 /* All floating point compares return CCFP if it is an equality
7038 comparison, and CCFPE otherwise. */
7039 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
7040 {
7041 switch (code)
7042 {
7043 case EQ:
7044 case NE:
7045 case UNORDERED:
7046 case ORDERED:
7047 case UNLT:
7048 case UNLE:
7049 case UNGT:
7050 case UNGE:
7051 case UNEQ:
7052 return CCFPmode;
7053
7054 case LT:
7055 case LE:
7056 case GT:
7057 case GE:
7058 case LTGT:
7059 return CCFPEmode;
7060
7061 default:
7062 gcc_unreachable ();
7063 }
7064 }
7065
7066 /* Equality comparisons of short modes against zero can be performed
7067 using the TST instruction with the appropriate bitmask. */
7068 if (y == const0_rtx && REG_P (x)
7069 && (code == EQ || code == NE)
7070 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
7071 return CC_NZmode;
7072
7073 /* Similarly, comparisons of zero_extends from shorter modes can
7074 be performed using an ANDS with an immediate mask. */
7075 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
7076 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7077 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7078 && (code == EQ || code == NE))
7079 return CC_NZmode;
7080
7081 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7082 && y == const0_rtx
7083 && (code == EQ || code == NE || code == LT || code == GE)
7084 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
7085 || GET_CODE (x) == NEG
7086 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7087 && CONST_INT_P (XEXP (x, 2)))))
7088 return CC_NZmode;
7089
7090 /* A compare with a shifted operand. Because of canonicalization,
7091 the comparison will have to be swapped when we emit the assembly
7092 code. */
7093 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7094 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7095 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
7096 || GET_CODE (x) == LSHIFTRT
7097 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
7098 return CC_SWPmode;
7099
7100 /* Similarly for a negated operand, but we can only do this for
7101 equalities. */
7102 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7103 && (REG_P (y) || GET_CODE (y) == SUBREG)
7104 && (code == EQ || code == NE)
7105 && GET_CODE (x) == NEG)
7106 return CC_Zmode;
7107
7108 /* A test for unsigned overflow. */
7109 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
7110 && code == NE
7111 && GET_CODE (x) == PLUS
7112 && GET_CODE (y) == ZERO_EXTEND)
7113 return CC_Cmode;
7114
7115 /* A test for signed overflow. */
7116 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
7117 && code == NE
7118 && GET_CODE (x) == PLUS
7119 && GET_CODE (y) == SIGN_EXTEND)
7120 return CC_Vmode;
7121
7122 /* For everything else, return CCmode. */
7123 return CCmode;
7124 }
7125
7126 static int
7127 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7128
7129 int
7130 aarch64_get_condition_code (rtx x)
7131 {
7132 machine_mode mode = GET_MODE (XEXP (x, 0));
7133 enum rtx_code comp_code = GET_CODE (x);
7134
7135 if (GET_MODE_CLASS (mode) != MODE_CC)
7136 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7137 return aarch64_get_condition_code_1 (mode, comp_code);
7138 }
7139
7140 static int
7141 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7142 {
7143 switch (mode)
7144 {
7145 case E_CCFPmode:
7146 case E_CCFPEmode:
7147 switch (comp_code)
7148 {
7149 case GE: return AARCH64_GE;
7150 case GT: return AARCH64_GT;
7151 case LE: return AARCH64_LS;
7152 case LT: return AARCH64_MI;
7153 case NE: return AARCH64_NE;
7154 case EQ: return AARCH64_EQ;
7155 case ORDERED: return AARCH64_VC;
7156 case UNORDERED: return AARCH64_VS;
7157 case UNLT: return AARCH64_LT;
7158 case UNLE: return AARCH64_LE;
7159 case UNGT: return AARCH64_HI;
7160 case UNGE: return AARCH64_PL;
7161 default: return -1;
7162 }
7163 break;
7164
7165 case E_CCmode:
7166 switch (comp_code)
7167 {
7168 case NE: return AARCH64_NE;
7169 case EQ: return AARCH64_EQ;
7170 case GE: return AARCH64_GE;
7171 case GT: return AARCH64_GT;
7172 case LE: return AARCH64_LE;
7173 case LT: return AARCH64_LT;
7174 case GEU: return AARCH64_CS;
7175 case GTU: return AARCH64_HI;
7176 case LEU: return AARCH64_LS;
7177 case LTU: return AARCH64_CC;
7178 default: return -1;
7179 }
7180 break;
7181
7182 case E_CC_SWPmode:
7183 switch (comp_code)
7184 {
7185 case NE: return AARCH64_NE;
7186 case EQ: return AARCH64_EQ;
7187 case GE: return AARCH64_LE;
7188 case GT: return AARCH64_LT;
7189 case LE: return AARCH64_GE;
7190 case LT: return AARCH64_GT;
7191 case GEU: return AARCH64_LS;
7192 case GTU: return AARCH64_CC;
7193 case LEU: return AARCH64_CS;
7194 case LTU: return AARCH64_HI;
7195 default: return -1;
7196 }
7197 break;
7198
7199 case E_CC_NZmode:
7200 switch (comp_code)
7201 {
7202 case NE: return AARCH64_NE;
7203 case EQ: return AARCH64_EQ;
7204 case GE: return AARCH64_PL;
7205 case LT: return AARCH64_MI;
7206 default: return -1;
7207 }
7208 break;
7209
7210 case E_CC_Zmode:
7211 switch (comp_code)
7212 {
7213 case NE: return AARCH64_NE;
7214 case EQ: return AARCH64_EQ;
7215 default: return -1;
7216 }
7217 break;
7218
7219 case E_CC_Cmode:
7220 switch (comp_code)
7221 {
7222 case NE: return AARCH64_CS;
7223 case EQ: return AARCH64_CC;
7224 default: return -1;
7225 }
7226 break;
7227
7228 case E_CC_Vmode:
7229 switch (comp_code)
7230 {
7231 case NE: return AARCH64_VS;
7232 case EQ: return AARCH64_VC;
7233 default: return -1;
7234 }
7235 break;
7236
7237 default:
7238 return -1;
7239 }
7240
7241 return -1;
7242 }
7243
7244 bool
7245 aarch64_const_vec_all_same_in_range_p (rtx x,
7246 HOST_WIDE_INT minval,
7247 HOST_WIDE_INT maxval)
7248 {
7249 rtx elt;
7250 return (const_vec_duplicate_p (x, &elt)
7251 && CONST_INT_P (elt)
7252 && IN_RANGE (INTVAL (elt), minval, maxval));
7253 }
7254
7255 bool
7256 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7257 {
7258 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7259 }
7260
7261 /* Return true if VEC is a constant in which every element is in the range
7262 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7263
7264 static bool
7265 aarch64_const_vec_all_in_range_p (rtx vec,
7266 HOST_WIDE_INT minval,
7267 HOST_WIDE_INT maxval)
7268 {
7269 if (GET_CODE (vec) != CONST_VECTOR
7270 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7271 return false;
7272
7273 int nunits;
7274 if (!CONST_VECTOR_STEPPED_P (vec))
7275 nunits = const_vector_encoded_nelts (vec);
7276 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7277 return false;
7278
7279 for (int i = 0; i < nunits; i++)
7280 {
7281 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7282 if (!CONST_INT_P (vec_elem)
7283 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7284 return false;
7285 }
7286 return true;
7287 }
7288
7289 /* N Z C V. */
7290 #define AARCH64_CC_V 1
7291 #define AARCH64_CC_C (1 << 1)
7292 #define AARCH64_CC_Z (1 << 2)
7293 #define AARCH64_CC_N (1 << 3)
7294
7295 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7296 static const int aarch64_nzcv_codes[] =
7297 {
7298 0, /* EQ, Z == 1. */
7299 AARCH64_CC_Z, /* NE, Z == 0. */
7300 0, /* CS, C == 1. */
7301 AARCH64_CC_C, /* CC, C == 0. */
7302 0, /* MI, N == 1. */
7303 AARCH64_CC_N, /* PL, N == 0. */
7304 0, /* VS, V == 1. */
7305 AARCH64_CC_V, /* VC, V == 0. */
7306 0, /* HI, C ==1 && Z == 0. */
7307 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7308 AARCH64_CC_V, /* GE, N == V. */
7309 0, /* LT, N != V. */
7310 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7311 0, /* LE, !(Z == 0 && N == V). */
7312 0, /* AL, Any. */
7313 0 /* NV, Any. */
7314 };
7315
7316 /* Print floating-point vector immediate operand X to F, negating it
7317 first if NEGATE is true. Return true on success, false if it isn't
7318 a constant we can handle. */
7319
7320 static bool
7321 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7322 {
7323 rtx elt;
7324
7325 if (!const_vec_duplicate_p (x, &elt))
7326 return false;
7327
7328 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7329 if (negate)
7330 r = real_value_negate (&r);
7331
7332 /* We only handle the SVE single-bit immediates here. */
7333 if (real_equal (&r, &dconst0))
7334 asm_fprintf (f, "0.0");
7335 else if (real_equal (&r, &dconst1))
7336 asm_fprintf (f, "1.0");
7337 else if (real_equal (&r, &dconsthalf))
7338 asm_fprintf (f, "0.5");
7339 else
7340 return false;
7341
7342 return true;
7343 }
7344
7345 /* Return the equivalent letter for size. */
7346 static char
7347 sizetochar (int size)
7348 {
7349 switch (size)
7350 {
7351 case 64: return 'd';
7352 case 32: return 's';
7353 case 16: return 'h';
7354 case 8 : return 'b';
7355 default: gcc_unreachable ();
7356 }
7357 }
7358
7359 /* Print operand X to file F in a target specific manner according to CODE.
7360 The acceptable formatting commands given by CODE are:
7361 'c': An integer or symbol address without a preceding #
7362 sign.
7363 'C': Take the duplicated element in a vector constant
7364 and print it in hex.
7365 'D': Take the duplicated element in a vector constant
7366 and print it as an unsigned integer, in decimal.
7367 'e': Print the sign/zero-extend size as a character 8->b,
7368 16->h, 32->w.
7369 'p': Prints N such that 2^N == X (X must be power of 2 and
7370 const int).
7371 'P': Print the number of non-zero bits in X (a const_int).
7372 'H': Print the higher numbered register of a pair (TImode)
7373 of regs.
7374 'm': Print a condition (eq, ne, etc).
7375 'M': Same as 'm', but invert condition.
7376 'N': Take the duplicated element in a vector constant
7377 and print the negative of it in decimal.
7378 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7379 'S/T/U/V': Print a FP/SIMD register name for a register list.
7380 The register printed is the FP/SIMD register name
7381 of X + 0/1/2/3 for S/T/U/V.
7382 'R': Print a scalar FP/SIMD register name + 1.
7383 'X': Print bottom 16 bits of integer constant in hex.
7384 'w/x': Print a general register name or the zero register
7385 (32-bit or 64-bit).
7386 '0': Print a normal operand, if it's a general register,
7387 then we assume DImode.
7388 'k': Print NZCV for conditional compare instructions.
7389 'A': Output address constant representing the first
7390 argument of X, specifying a relocation offset
7391 if appropriate.
7392 'L': Output constant address specified by X
7393 with a relocation offset if appropriate.
7394 'G': Prints address of X, specifying a PC relative
7395 relocation mode if appropriate.
7396 'y': Output address of LDP or STP - this is used for
7397 some LDP/STPs which don't use a PARALLEL in their
7398 pattern (so the mode needs to be adjusted).
7399 'z': Output address of a typical LDP or STP. */
7400
7401 static void
7402 aarch64_print_operand (FILE *f, rtx x, int code)
7403 {
7404 rtx elt;
7405 switch (code)
7406 {
7407 case 'c':
7408 switch (GET_CODE (x))
7409 {
7410 case CONST_INT:
7411 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7412 break;
7413
7414 case SYMBOL_REF:
7415 output_addr_const (f, x);
7416 break;
7417
7418 case CONST:
7419 if (GET_CODE (XEXP (x, 0)) == PLUS
7420 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7421 {
7422 output_addr_const (f, x);
7423 break;
7424 }
7425 /* Fall through. */
7426
7427 default:
7428 output_operand_lossage ("unsupported operand for code '%c'", code);
7429 }
7430 break;
7431
7432 case 'e':
7433 {
7434 int n;
7435
7436 if (!CONST_INT_P (x)
7437 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7438 {
7439 output_operand_lossage ("invalid operand for '%%%c'", code);
7440 return;
7441 }
7442
7443 switch (n)
7444 {
7445 case 3:
7446 fputc ('b', f);
7447 break;
7448 case 4:
7449 fputc ('h', f);
7450 break;
7451 case 5:
7452 fputc ('w', f);
7453 break;
7454 default:
7455 output_operand_lossage ("invalid operand for '%%%c'", code);
7456 return;
7457 }
7458 }
7459 break;
7460
7461 case 'p':
7462 {
7463 int n;
7464
7465 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7466 {
7467 output_operand_lossage ("invalid operand for '%%%c'", code);
7468 return;
7469 }
7470
7471 asm_fprintf (f, "%d", n);
7472 }
7473 break;
7474
7475 case 'P':
7476 if (!CONST_INT_P (x))
7477 {
7478 output_operand_lossage ("invalid operand for '%%%c'", code);
7479 return;
7480 }
7481
7482 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7483 break;
7484
7485 case 'H':
7486 if (x == const0_rtx)
7487 {
7488 asm_fprintf (f, "xzr");
7489 break;
7490 }
7491
7492 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7493 {
7494 output_operand_lossage ("invalid operand for '%%%c'", code);
7495 return;
7496 }
7497
7498 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7499 break;
7500
7501 case 'M':
7502 case 'm':
7503 {
7504 int cond_code;
7505 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7506 if (x == const_true_rtx)
7507 {
7508 if (code == 'M')
7509 fputs ("nv", f);
7510 return;
7511 }
7512
7513 if (!COMPARISON_P (x))
7514 {
7515 output_operand_lossage ("invalid operand for '%%%c'", code);
7516 return;
7517 }
7518
7519 cond_code = aarch64_get_condition_code (x);
7520 gcc_assert (cond_code >= 0);
7521 if (code == 'M')
7522 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7523 fputs (aarch64_condition_codes[cond_code], f);
7524 }
7525 break;
7526
7527 case 'N':
7528 if (!const_vec_duplicate_p (x, &elt))
7529 {
7530 output_operand_lossage ("invalid vector constant");
7531 return;
7532 }
7533
7534 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7535 asm_fprintf (f, "%wd", -INTVAL (elt));
7536 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7537 && aarch64_print_vector_float_operand (f, x, true))
7538 ;
7539 else
7540 {
7541 output_operand_lossage ("invalid vector constant");
7542 return;
7543 }
7544 break;
7545
7546 case 'b':
7547 case 'h':
7548 case 's':
7549 case 'd':
7550 case 'q':
7551 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7552 {
7553 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7554 return;
7555 }
7556 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7557 break;
7558
7559 case 'S':
7560 case 'T':
7561 case 'U':
7562 case 'V':
7563 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7564 {
7565 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7566 return;
7567 }
7568 asm_fprintf (f, "%c%d",
7569 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7570 REGNO (x) - V0_REGNUM + (code - 'S'));
7571 break;
7572
7573 case 'R':
7574 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7575 {
7576 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7577 return;
7578 }
7579 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7580 break;
7581
7582 case 'X':
7583 if (!CONST_INT_P (x))
7584 {
7585 output_operand_lossage ("invalid operand for '%%%c'", code);
7586 return;
7587 }
7588 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7589 break;
7590
7591 case 'C':
7592 {
7593 /* Print a replicated constant in hex. */
7594 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7595 {
7596 output_operand_lossage ("invalid operand for '%%%c'", code);
7597 return;
7598 }
7599 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7600 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7601 }
7602 break;
7603
7604 case 'D':
7605 {
7606 /* Print a replicated constant in decimal, treating it as
7607 unsigned. */
7608 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7609 {
7610 output_operand_lossage ("invalid operand for '%%%c'", code);
7611 return;
7612 }
7613 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7614 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7615 }
7616 break;
7617
7618 case 'w':
7619 case 'x':
7620 if (x == const0_rtx
7621 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7622 {
7623 asm_fprintf (f, "%czr", code);
7624 break;
7625 }
7626
7627 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7628 {
7629 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7630 break;
7631 }
7632
7633 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7634 {
7635 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7636 break;
7637 }
7638
7639 /* Fall through */
7640
7641 case 0:
7642 if (x == NULL)
7643 {
7644 output_operand_lossage ("missing operand");
7645 return;
7646 }
7647
7648 switch (GET_CODE (x))
7649 {
7650 case REG:
7651 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7652 {
7653 if (REG_NREGS (x) == 1)
7654 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7655 else
7656 {
7657 char suffix
7658 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7659 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7660 REGNO (x) - V0_REGNUM, suffix,
7661 END_REGNO (x) - V0_REGNUM - 1, suffix);
7662 }
7663 }
7664 else
7665 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7666 break;
7667
7668 case MEM:
7669 output_address (GET_MODE (x), XEXP (x, 0));
7670 break;
7671
7672 case LABEL_REF:
7673 case SYMBOL_REF:
7674 output_addr_const (asm_out_file, x);
7675 break;
7676
7677 case CONST_INT:
7678 asm_fprintf (f, "%wd", INTVAL (x));
7679 break;
7680
7681 case CONST:
7682 if (!VECTOR_MODE_P (GET_MODE (x)))
7683 {
7684 output_addr_const (asm_out_file, x);
7685 break;
7686 }
7687 /* fall through */
7688
7689 case CONST_VECTOR:
7690 if (!const_vec_duplicate_p (x, &elt))
7691 {
7692 output_operand_lossage ("invalid vector constant");
7693 return;
7694 }
7695
7696 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7697 asm_fprintf (f, "%wd", INTVAL (elt));
7698 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7699 && aarch64_print_vector_float_operand (f, x, false))
7700 ;
7701 else
7702 {
7703 output_operand_lossage ("invalid vector constant");
7704 return;
7705 }
7706 break;
7707
7708 case CONST_DOUBLE:
7709 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7710 be getting CONST_DOUBLEs holding integers. */
7711 gcc_assert (GET_MODE (x) != VOIDmode);
7712 if (aarch64_float_const_zero_rtx_p (x))
7713 {
7714 fputc ('0', f);
7715 break;
7716 }
7717 else if (aarch64_float_const_representable_p (x))
7718 {
7719 #define buf_size 20
7720 char float_buf[buf_size] = {'\0'};
7721 real_to_decimal_for_mode (float_buf,
7722 CONST_DOUBLE_REAL_VALUE (x),
7723 buf_size, buf_size,
7724 1, GET_MODE (x));
7725 asm_fprintf (asm_out_file, "%s", float_buf);
7726 break;
7727 #undef buf_size
7728 }
7729 output_operand_lossage ("invalid constant");
7730 return;
7731 default:
7732 output_operand_lossage ("invalid operand");
7733 return;
7734 }
7735 break;
7736
7737 case 'A':
7738 if (GET_CODE (x) == HIGH)
7739 x = XEXP (x, 0);
7740
7741 switch (aarch64_classify_symbolic_expression (x))
7742 {
7743 case SYMBOL_SMALL_GOT_4G:
7744 asm_fprintf (asm_out_file, ":got:");
7745 break;
7746
7747 case SYMBOL_SMALL_TLSGD:
7748 asm_fprintf (asm_out_file, ":tlsgd:");
7749 break;
7750
7751 case SYMBOL_SMALL_TLSDESC:
7752 asm_fprintf (asm_out_file, ":tlsdesc:");
7753 break;
7754
7755 case SYMBOL_SMALL_TLSIE:
7756 asm_fprintf (asm_out_file, ":gottprel:");
7757 break;
7758
7759 case SYMBOL_TLSLE24:
7760 asm_fprintf (asm_out_file, ":tprel:");
7761 break;
7762
7763 case SYMBOL_TINY_GOT:
7764 gcc_unreachable ();
7765 break;
7766
7767 default:
7768 break;
7769 }
7770 output_addr_const (asm_out_file, x);
7771 break;
7772
7773 case 'L':
7774 switch (aarch64_classify_symbolic_expression (x))
7775 {
7776 case SYMBOL_SMALL_GOT_4G:
7777 asm_fprintf (asm_out_file, ":lo12:");
7778 break;
7779
7780 case SYMBOL_SMALL_TLSGD:
7781 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7782 break;
7783
7784 case SYMBOL_SMALL_TLSDESC:
7785 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7786 break;
7787
7788 case SYMBOL_SMALL_TLSIE:
7789 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7790 break;
7791
7792 case SYMBOL_TLSLE12:
7793 asm_fprintf (asm_out_file, ":tprel_lo12:");
7794 break;
7795
7796 case SYMBOL_TLSLE24:
7797 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7798 break;
7799
7800 case SYMBOL_TINY_GOT:
7801 asm_fprintf (asm_out_file, ":got:");
7802 break;
7803
7804 case SYMBOL_TINY_TLSIE:
7805 asm_fprintf (asm_out_file, ":gottprel:");
7806 break;
7807
7808 default:
7809 break;
7810 }
7811 output_addr_const (asm_out_file, x);
7812 break;
7813
7814 case 'G':
7815 switch (aarch64_classify_symbolic_expression (x))
7816 {
7817 case SYMBOL_TLSLE24:
7818 asm_fprintf (asm_out_file, ":tprel_hi12:");
7819 break;
7820 default:
7821 break;
7822 }
7823 output_addr_const (asm_out_file, x);
7824 break;
7825
7826 case 'k':
7827 {
7828 HOST_WIDE_INT cond_code;
7829
7830 if (!CONST_INT_P (x))
7831 {
7832 output_operand_lossage ("invalid operand for '%%%c'", code);
7833 return;
7834 }
7835
7836 cond_code = INTVAL (x);
7837 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7838 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7839 }
7840 break;
7841
7842 case 'y':
7843 case 'z':
7844 {
7845 machine_mode mode = GET_MODE (x);
7846
7847 if (GET_CODE (x) != MEM
7848 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7849 {
7850 output_operand_lossage ("invalid operand for '%%%c'", code);
7851 return;
7852 }
7853
7854 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7855 code == 'y'
7856 ? ADDR_QUERY_LDP_STP_N
7857 : ADDR_QUERY_LDP_STP))
7858 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7859 }
7860 break;
7861
7862 default:
7863 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7864 return;
7865 }
7866 }
7867
7868 /* Print address 'x' of a memory access with mode 'mode'.
7869 'op' is the context required by aarch64_classify_address. It can either be
7870 MEM for a normal memory access or PARALLEL for LDP/STP. */
7871 static bool
7872 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7873 aarch64_addr_query_type type)
7874 {
7875 struct aarch64_address_info addr;
7876 unsigned int size;
7877
7878 /* Check all addresses are Pmode - including ILP32. */
7879 if (GET_MODE (x) != Pmode
7880 && (!CONST_INT_P (x)
7881 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
7882 {
7883 output_operand_lossage ("invalid address mode");
7884 return false;
7885 }
7886
7887 if (aarch64_classify_address (&addr, x, mode, true, type))
7888 switch (addr.type)
7889 {
7890 case ADDRESS_REG_IMM:
7891 if (known_eq (addr.const_offset, 0))
7892 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7893 else if (aarch64_sve_data_mode_p (mode))
7894 {
7895 HOST_WIDE_INT vnum
7896 = exact_div (addr.const_offset,
7897 BYTES_PER_SVE_VECTOR).to_constant ();
7898 asm_fprintf (f, "[%s, #%wd, mul vl]",
7899 reg_names[REGNO (addr.base)], vnum);
7900 }
7901 else if (aarch64_sve_pred_mode_p (mode))
7902 {
7903 HOST_WIDE_INT vnum
7904 = exact_div (addr.const_offset,
7905 BYTES_PER_SVE_PRED).to_constant ();
7906 asm_fprintf (f, "[%s, #%wd, mul vl]",
7907 reg_names[REGNO (addr.base)], vnum);
7908 }
7909 else
7910 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7911 INTVAL (addr.offset));
7912 return true;
7913
7914 case ADDRESS_REG_REG:
7915 if (addr.shift == 0)
7916 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7917 reg_names [REGNO (addr.offset)]);
7918 else
7919 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7920 reg_names [REGNO (addr.offset)], addr.shift);
7921 return true;
7922
7923 case ADDRESS_REG_UXTW:
7924 if (addr.shift == 0)
7925 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7926 REGNO (addr.offset) - R0_REGNUM);
7927 else
7928 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7929 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7930 return true;
7931
7932 case ADDRESS_REG_SXTW:
7933 if (addr.shift == 0)
7934 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7935 REGNO (addr.offset) - R0_REGNUM);
7936 else
7937 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7938 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7939 return true;
7940
7941 case ADDRESS_REG_WB:
7942 /* Writeback is only supported for fixed-width modes. */
7943 size = GET_MODE_SIZE (mode).to_constant ();
7944 switch (GET_CODE (x))
7945 {
7946 case PRE_INC:
7947 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7948 return true;
7949 case POST_INC:
7950 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7951 return true;
7952 case PRE_DEC:
7953 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7954 return true;
7955 case POST_DEC:
7956 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7957 return true;
7958 case PRE_MODIFY:
7959 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7960 INTVAL (addr.offset));
7961 return true;
7962 case POST_MODIFY:
7963 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7964 INTVAL (addr.offset));
7965 return true;
7966 default:
7967 break;
7968 }
7969 break;
7970
7971 case ADDRESS_LO_SUM:
7972 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7973 output_addr_const (f, addr.offset);
7974 asm_fprintf (f, "]");
7975 return true;
7976
7977 case ADDRESS_SYMBOLIC:
7978 output_addr_const (f, x);
7979 return true;
7980 }
7981
7982 return false;
7983 }
7984
7985 /* Print address 'x' of a memory access with mode 'mode'. */
7986 static void
7987 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7988 {
7989 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7990 output_addr_const (f, x);
7991 }
7992
7993 bool
7994 aarch64_label_mentioned_p (rtx x)
7995 {
7996 const char *fmt;
7997 int i;
7998
7999 if (GET_CODE (x) == LABEL_REF)
8000 return true;
8001
8002 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8003 referencing instruction, but they are constant offsets, not
8004 symbols. */
8005 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8006 return false;
8007
8008 fmt = GET_RTX_FORMAT (GET_CODE (x));
8009 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8010 {
8011 if (fmt[i] == 'E')
8012 {
8013 int j;
8014
8015 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8016 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8017 return 1;
8018 }
8019 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8020 return 1;
8021 }
8022
8023 return 0;
8024 }
8025
8026 /* Implement REGNO_REG_CLASS. */
8027
8028 enum reg_class
8029 aarch64_regno_regclass (unsigned regno)
8030 {
8031 if (GP_REGNUM_P (regno))
8032 return GENERAL_REGS;
8033
8034 if (regno == SP_REGNUM)
8035 return STACK_REG;
8036
8037 if (regno == FRAME_POINTER_REGNUM
8038 || regno == ARG_POINTER_REGNUM)
8039 return POINTER_REGS;
8040
8041 if (FP_REGNUM_P (regno))
8042 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
8043
8044 if (PR_REGNUM_P (regno))
8045 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8046
8047 return NO_REGS;
8048 }
8049
8050 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8051 If OFFSET is out of range, return an offset of an anchor point
8052 that is in range. Return 0 otherwise. */
8053
8054 static HOST_WIDE_INT
8055 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8056 machine_mode mode)
8057 {
8058 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8059 if (size > 16)
8060 return (offset + 0x400) & ~0x7f0;
8061
8062 /* For offsets that aren't a multiple of the access size, the limit is
8063 -256...255. */
8064 if (offset & (size - 1))
8065 {
8066 /* BLKmode typically uses LDP of X-registers. */
8067 if (mode == BLKmode)
8068 return (offset + 512) & ~0x3ff;
8069 return (offset + 0x100) & ~0x1ff;
8070 }
8071
8072 /* Small negative offsets are supported. */
8073 if (IN_RANGE (offset, -256, 0))
8074 return 0;
8075
8076 if (mode == TImode || mode == TFmode)
8077 return (offset + 0x100) & ~0x1ff;
8078
8079 /* Use 12-bit offset by access size. */
8080 return offset & (~0xfff * size);
8081 }
8082
8083 static rtx
8084 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8085 {
8086 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8087 where mask is selected by alignment and size of the offset.
8088 We try to pick as large a range for the offset as possible to
8089 maximize the chance of a CSE. However, for aligned addresses
8090 we limit the range to 4k so that structures with different sized
8091 elements are likely to use the same base. We need to be careful
8092 not to split a CONST for some forms of address expression, otherwise
8093 it will generate sub-optimal code. */
8094
8095 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8096 {
8097 rtx base = XEXP (x, 0);
8098 rtx offset_rtx = XEXP (x, 1);
8099 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8100
8101 if (GET_CODE (base) == PLUS)
8102 {
8103 rtx op0 = XEXP (base, 0);
8104 rtx op1 = XEXP (base, 1);
8105
8106 /* Force any scaling into a temp for CSE. */
8107 op0 = force_reg (Pmode, op0);
8108 op1 = force_reg (Pmode, op1);
8109
8110 /* Let the pointer register be in op0. */
8111 if (REG_POINTER (op1))
8112 std::swap (op0, op1);
8113
8114 /* If the pointer is virtual or frame related, then we know that
8115 virtual register instantiation or register elimination is going
8116 to apply a second constant. We want the two constants folded
8117 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8118 if (virt_or_elim_regno_p (REGNO (op0)))
8119 {
8120 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8121 NULL_RTX, true, OPTAB_DIRECT);
8122 return gen_rtx_PLUS (Pmode, base, op1);
8123 }
8124
8125 /* Otherwise, in order to encourage CSE (and thence loop strength
8126 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8127 base = expand_binop (Pmode, add_optab, op0, op1,
8128 NULL_RTX, true, OPTAB_DIRECT);
8129 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8130 }
8131
8132 HOST_WIDE_INT size;
8133 if (GET_MODE_SIZE (mode).is_constant (&size))
8134 {
8135 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8136 mode);
8137 if (base_offset != 0)
8138 {
8139 base = plus_constant (Pmode, base, base_offset);
8140 base = force_operand (base, NULL_RTX);
8141 return plus_constant (Pmode, base, offset - base_offset);
8142 }
8143 }
8144 }
8145
8146 return x;
8147 }
8148
8149 static reg_class_t
8150 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8151 reg_class_t rclass,
8152 machine_mode mode,
8153 secondary_reload_info *sri)
8154 {
8155 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8156 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8157 comment at the head of aarch64-sve.md for more details about the
8158 big-endian handling. */
8159 if (BYTES_BIG_ENDIAN
8160 && reg_class_subset_p (rclass, FP_REGS)
8161 && !((REG_P (x) && HARD_REGISTER_P (x))
8162 || aarch64_simd_valid_immediate (x, NULL))
8163 && aarch64_sve_data_mode_p (mode))
8164 {
8165 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8166 return NO_REGS;
8167 }
8168
8169 /* If we have to disable direct literal pool loads and stores because the
8170 function is too big, then we need a scratch register. */
8171 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8172 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8173 || targetm.vector_mode_supported_p (GET_MODE (x)))
8174 && !aarch64_pcrelative_literal_loads)
8175 {
8176 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8177 return NO_REGS;
8178 }
8179
8180 /* Without the TARGET_SIMD instructions we cannot move a Q register
8181 to a Q register directly. We need a scratch. */
8182 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8183 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8184 && reg_class_subset_p (rclass, FP_REGS))
8185 {
8186 sri->icode = code_for_aarch64_reload_mov (mode);
8187 return NO_REGS;
8188 }
8189
8190 /* A TFmode or TImode memory access should be handled via an FP_REGS
8191 because AArch64 has richer addressing modes for LDR/STR instructions
8192 than LDP/STP instructions. */
8193 if (TARGET_FLOAT && rclass == GENERAL_REGS
8194 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8195 return FP_REGS;
8196
8197 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8198 return GENERAL_REGS;
8199
8200 return NO_REGS;
8201 }
8202
8203 static bool
8204 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8205 {
8206 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8207
8208 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8209 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8210 if (frame_pointer_needed)
8211 return to == HARD_FRAME_POINTER_REGNUM;
8212 return true;
8213 }
8214
8215 poly_int64
8216 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8217 {
8218 if (to == HARD_FRAME_POINTER_REGNUM)
8219 {
8220 if (from == ARG_POINTER_REGNUM)
8221 return cfun->machine->frame.hard_fp_offset;
8222
8223 if (from == FRAME_POINTER_REGNUM)
8224 return cfun->machine->frame.hard_fp_offset
8225 - cfun->machine->frame.locals_offset;
8226 }
8227
8228 if (to == STACK_POINTER_REGNUM)
8229 {
8230 if (from == FRAME_POINTER_REGNUM)
8231 return cfun->machine->frame.frame_size
8232 - cfun->machine->frame.locals_offset;
8233 }
8234
8235 return cfun->machine->frame.frame_size;
8236 }
8237
8238 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8239 previous frame. */
8240
8241 rtx
8242 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8243 {
8244 if (count != 0)
8245 return const0_rtx;
8246 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8247 }
8248
8249
8250 static void
8251 aarch64_asm_trampoline_template (FILE *f)
8252 {
8253 int offset1 = 16;
8254 int offset2 = 20;
8255
8256 if (aarch64_bti_enabled ())
8257 {
8258 asm_fprintf (f, "\thint\t34 // bti c\n");
8259 offset1 -= 4;
8260 offset2 -= 4;
8261 }
8262
8263 if (TARGET_ILP32)
8264 {
8265 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8266 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8267 offset1);
8268 }
8269 else
8270 {
8271 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8272 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8273 offset2);
8274 }
8275 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8276
8277 /* The trampoline needs an extra padding instruction. In case if BTI is
8278 enabled the padding instruction is replaced by the BTI instruction at
8279 the beginning. */
8280 if (!aarch64_bti_enabled ())
8281 assemble_aligned_integer (4, const0_rtx);
8282
8283 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8284 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8285 }
8286
8287 static void
8288 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8289 {
8290 rtx fnaddr, mem, a_tramp;
8291 const int tramp_code_sz = 16;
8292
8293 /* Don't need to copy the trailing D-words, we fill those in below. */
8294 emit_block_move (m_tramp, assemble_trampoline_template (),
8295 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8296 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8297 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8298 if (GET_MODE (fnaddr) != ptr_mode)
8299 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8300 emit_move_insn (mem, fnaddr);
8301
8302 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8303 emit_move_insn (mem, chain_value);
8304
8305 /* XXX We should really define a "clear_cache" pattern and use
8306 gen_clear_cache(). */
8307 a_tramp = XEXP (m_tramp, 0);
8308 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8309 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8310 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8311 ptr_mode);
8312 }
8313
8314 static unsigned char
8315 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8316 {
8317 /* ??? Logically we should only need to provide a value when
8318 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8319 can hold MODE, but at the moment we need to handle all modes.
8320 Just ignore any runtime parts for registers that can't store them. */
8321 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8322 unsigned int nregs;
8323 switch (regclass)
8324 {
8325 case TAILCALL_ADDR_REGS:
8326 case POINTER_REGS:
8327 case GENERAL_REGS:
8328 case ALL_REGS:
8329 case POINTER_AND_FP_REGS:
8330 case FP_REGS:
8331 case FP_LO_REGS:
8332 if (aarch64_sve_data_mode_p (mode)
8333 && constant_multiple_p (GET_MODE_SIZE (mode),
8334 BYTES_PER_SVE_VECTOR, &nregs))
8335 return nregs;
8336 return (aarch64_vector_data_mode_p (mode)
8337 ? CEIL (lowest_size, UNITS_PER_VREG)
8338 : CEIL (lowest_size, UNITS_PER_WORD));
8339 case STACK_REG:
8340 case PR_REGS:
8341 case PR_LO_REGS:
8342 case PR_HI_REGS:
8343 return 1;
8344
8345 case NO_REGS:
8346 return 0;
8347
8348 default:
8349 break;
8350 }
8351 gcc_unreachable ();
8352 }
8353
8354 static reg_class_t
8355 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8356 {
8357 if (regclass == POINTER_REGS)
8358 return GENERAL_REGS;
8359
8360 if (regclass == STACK_REG)
8361 {
8362 if (REG_P(x)
8363 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8364 return regclass;
8365
8366 return NO_REGS;
8367 }
8368
8369 /* Register eliminiation can result in a request for
8370 SP+constant->FP_REGS. We cannot support such operations which
8371 use SP as source and an FP_REG as destination, so reject out
8372 right now. */
8373 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8374 {
8375 rtx lhs = XEXP (x, 0);
8376
8377 /* Look through a possible SUBREG introduced by ILP32. */
8378 if (GET_CODE (lhs) == SUBREG)
8379 lhs = SUBREG_REG (lhs);
8380
8381 gcc_assert (REG_P (lhs));
8382 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8383 POINTER_REGS));
8384 return NO_REGS;
8385 }
8386
8387 return regclass;
8388 }
8389
8390 void
8391 aarch64_asm_output_labelref (FILE* f, const char *name)
8392 {
8393 asm_fprintf (f, "%U%s", name);
8394 }
8395
8396 static void
8397 aarch64_elf_asm_constructor (rtx symbol, int priority)
8398 {
8399 if (priority == DEFAULT_INIT_PRIORITY)
8400 default_ctor_section_asm_out_constructor (symbol, priority);
8401 else
8402 {
8403 section *s;
8404 /* While priority is known to be in range [0, 65535], so 18 bytes
8405 would be enough, the compiler might not know that. To avoid
8406 -Wformat-truncation false positive, use a larger size. */
8407 char buf[23];
8408 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8409 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8410 switch_to_section (s);
8411 assemble_align (POINTER_SIZE);
8412 assemble_aligned_integer (POINTER_BYTES, symbol);
8413 }
8414 }
8415
8416 static void
8417 aarch64_elf_asm_destructor (rtx symbol, int priority)
8418 {
8419 if (priority == DEFAULT_INIT_PRIORITY)
8420 default_dtor_section_asm_out_destructor (symbol, priority);
8421 else
8422 {
8423 section *s;
8424 /* While priority is known to be in range [0, 65535], so 18 bytes
8425 would be enough, the compiler might not know that. To avoid
8426 -Wformat-truncation false positive, use a larger size. */
8427 char buf[23];
8428 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8429 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8430 switch_to_section (s);
8431 assemble_align (POINTER_SIZE);
8432 assemble_aligned_integer (POINTER_BYTES, symbol);
8433 }
8434 }
8435
8436 const char*
8437 aarch64_output_casesi (rtx *operands)
8438 {
8439 char buf[100];
8440 char label[100];
8441 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8442 int index;
8443 static const char *const patterns[4][2] =
8444 {
8445 {
8446 "ldrb\t%w3, [%0,%w1,uxtw]",
8447 "add\t%3, %4, %w3, sxtb #2"
8448 },
8449 {
8450 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8451 "add\t%3, %4, %w3, sxth #2"
8452 },
8453 {
8454 "ldr\t%w3, [%0,%w1,uxtw #2]",
8455 "add\t%3, %4, %w3, sxtw #2"
8456 },
8457 /* We assume that DImode is only generated when not optimizing and
8458 that we don't really need 64-bit address offsets. That would
8459 imply an object file with 8GB of code in a single function! */
8460 {
8461 "ldr\t%w3, [%0,%w1,uxtw #2]",
8462 "add\t%3, %4, %w3, sxtw #2"
8463 }
8464 };
8465
8466 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8467
8468 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8469 index = exact_log2 (GET_MODE_SIZE (mode));
8470
8471 gcc_assert (index >= 0 && index <= 3);
8472
8473 /* Need to implement table size reduction, by chaning the code below. */
8474 output_asm_insn (patterns[index][0], operands);
8475 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8476 snprintf (buf, sizeof (buf),
8477 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8478 output_asm_insn (buf, operands);
8479 output_asm_insn (patterns[index][1], operands);
8480 output_asm_insn ("br\t%3", operands);
8481 assemble_label (asm_out_file, label);
8482 return "";
8483 }
8484
8485
8486 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8487 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8488 operator. */
8489
8490 int
8491 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8492 {
8493 if (shift >= 0 && shift <= 3)
8494 {
8495 int size;
8496 for (size = 8; size <= 32; size *= 2)
8497 {
8498 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8499 if (mask == bits << shift)
8500 return size;
8501 }
8502 }
8503 return 0;
8504 }
8505
8506 /* Constant pools are per function only when PC relative
8507 literal loads are true or we are in the large memory
8508 model. */
8509
8510 static inline bool
8511 aarch64_can_use_per_function_literal_pools_p (void)
8512 {
8513 return (aarch64_pcrelative_literal_loads
8514 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8515 }
8516
8517 static bool
8518 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8519 {
8520 /* We can't use blocks for constants when we're using a per-function
8521 constant pool. */
8522 return !aarch64_can_use_per_function_literal_pools_p ();
8523 }
8524
8525 /* Select appropriate section for constants depending
8526 on where we place literal pools. */
8527
8528 static section *
8529 aarch64_select_rtx_section (machine_mode mode,
8530 rtx x,
8531 unsigned HOST_WIDE_INT align)
8532 {
8533 if (aarch64_can_use_per_function_literal_pools_p ())
8534 return function_section (current_function_decl);
8535
8536 return default_elf_select_rtx_section (mode, x, align);
8537 }
8538
8539 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8540 void
8541 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8542 HOST_WIDE_INT offset)
8543 {
8544 /* When using per-function literal pools, we must ensure that any code
8545 section is aligned to the minimal instruction length, lest we get
8546 errors from the assembler re "unaligned instructions". */
8547 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8548 ASM_OUTPUT_ALIGN (f, 2);
8549 }
8550
8551 /* Costs. */
8552
8553 /* Helper function for rtx cost calculation. Strip a shift expression
8554 from X. Returns the inner operand if successful, or the original
8555 expression on failure. */
8556 static rtx
8557 aarch64_strip_shift (rtx x)
8558 {
8559 rtx op = x;
8560
8561 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8562 we can convert both to ROR during final output. */
8563 if ((GET_CODE (op) == ASHIFT
8564 || GET_CODE (op) == ASHIFTRT
8565 || GET_CODE (op) == LSHIFTRT
8566 || GET_CODE (op) == ROTATERT
8567 || GET_CODE (op) == ROTATE)
8568 && CONST_INT_P (XEXP (op, 1)))
8569 return XEXP (op, 0);
8570
8571 if (GET_CODE (op) == MULT
8572 && CONST_INT_P (XEXP (op, 1))
8573 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8574 return XEXP (op, 0);
8575
8576 return x;
8577 }
8578
8579 /* Helper function for rtx cost calculation. Strip an extend
8580 expression from X. Returns the inner operand if successful, or the
8581 original expression on failure. We deal with a number of possible
8582 canonicalization variations here. If STRIP_SHIFT is true, then
8583 we can strip off a shift also. */
8584 static rtx
8585 aarch64_strip_extend (rtx x, bool strip_shift)
8586 {
8587 scalar_int_mode mode;
8588 rtx op = x;
8589
8590 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8591 return op;
8592
8593 /* Zero and sign extraction of a widened value. */
8594 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8595 && XEXP (op, 2) == const0_rtx
8596 && GET_CODE (XEXP (op, 0)) == MULT
8597 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8598 XEXP (op, 1)))
8599 return XEXP (XEXP (op, 0), 0);
8600
8601 /* It can also be represented (for zero-extend) as an AND with an
8602 immediate. */
8603 if (GET_CODE (op) == AND
8604 && GET_CODE (XEXP (op, 0)) == MULT
8605 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8606 && CONST_INT_P (XEXP (op, 1))
8607 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8608 INTVAL (XEXP (op, 1))) != 0)
8609 return XEXP (XEXP (op, 0), 0);
8610
8611 /* Now handle extended register, as this may also have an optional
8612 left shift by 1..4. */
8613 if (strip_shift
8614 && GET_CODE (op) == ASHIFT
8615 && CONST_INT_P (XEXP (op, 1))
8616 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8617 op = XEXP (op, 0);
8618
8619 if (GET_CODE (op) == ZERO_EXTEND
8620 || GET_CODE (op) == SIGN_EXTEND)
8621 op = XEXP (op, 0);
8622
8623 if (op != x)
8624 return op;
8625
8626 return x;
8627 }
8628
8629 /* Return true iff CODE is a shift supported in combination
8630 with arithmetic instructions. */
8631
8632 static bool
8633 aarch64_shift_p (enum rtx_code code)
8634 {
8635 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8636 }
8637
8638
8639 /* Return true iff X is a cheap shift without a sign extend. */
8640
8641 static bool
8642 aarch64_cheap_mult_shift_p (rtx x)
8643 {
8644 rtx op0, op1;
8645
8646 op0 = XEXP (x, 0);
8647 op1 = XEXP (x, 1);
8648
8649 if (!(aarch64_tune_params.extra_tuning_flags
8650 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8651 return false;
8652
8653 if (GET_CODE (op0) == SIGN_EXTEND)
8654 return false;
8655
8656 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8657 && UINTVAL (op1) <= 4)
8658 return true;
8659
8660 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8661 return false;
8662
8663 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8664
8665 if (l2 > 0 && l2 <= 4)
8666 return true;
8667
8668 return false;
8669 }
8670
8671 /* Helper function for rtx cost calculation. Calculate the cost of
8672 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8673 Return the calculated cost of the expression, recursing manually in to
8674 operands where needed. */
8675
8676 static int
8677 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8678 {
8679 rtx op0, op1;
8680 const struct cpu_cost_table *extra_cost
8681 = aarch64_tune_params.insn_extra_cost;
8682 int cost = 0;
8683 bool compound_p = (outer == PLUS || outer == MINUS);
8684 machine_mode mode = GET_MODE (x);
8685
8686 gcc_checking_assert (code == MULT);
8687
8688 op0 = XEXP (x, 0);
8689 op1 = XEXP (x, 1);
8690
8691 if (VECTOR_MODE_P (mode))
8692 mode = GET_MODE_INNER (mode);
8693
8694 /* Integer multiply/fma. */
8695 if (GET_MODE_CLASS (mode) == MODE_INT)
8696 {
8697 /* The multiply will be canonicalized as a shift, cost it as such. */
8698 if (aarch64_shift_p (GET_CODE (x))
8699 || (CONST_INT_P (op1)
8700 && exact_log2 (INTVAL (op1)) > 0))
8701 {
8702 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8703 || GET_CODE (op0) == SIGN_EXTEND;
8704 if (speed)
8705 {
8706 if (compound_p)
8707 {
8708 /* If the shift is considered cheap,
8709 then don't add any cost. */
8710 if (aarch64_cheap_mult_shift_p (x))
8711 ;
8712 else if (REG_P (op1))
8713 /* ARITH + shift-by-register. */
8714 cost += extra_cost->alu.arith_shift_reg;
8715 else if (is_extend)
8716 /* ARITH + extended register. We don't have a cost field
8717 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8718 cost += extra_cost->alu.extend_arith;
8719 else
8720 /* ARITH + shift-by-immediate. */
8721 cost += extra_cost->alu.arith_shift;
8722 }
8723 else
8724 /* LSL (immediate). */
8725 cost += extra_cost->alu.shift;
8726
8727 }
8728 /* Strip extends as we will have costed them in the case above. */
8729 if (is_extend)
8730 op0 = aarch64_strip_extend (op0, true);
8731
8732 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8733
8734 return cost;
8735 }
8736
8737 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8738 compound and let the below cases handle it. After all, MNEG is a
8739 special-case alias of MSUB. */
8740 if (GET_CODE (op0) == NEG)
8741 {
8742 op0 = XEXP (op0, 0);
8743 compound_p = true;
8744 }
8745
8746 /* Integer multiplies or FMAs have zero/sign extending variants. */
8747 if ((GET_CODE (op0) == ZERO_EXTEND
8748 && GET_CODE (op1) == ZERO_EXTEND)
8749 || (GET_CODE (op0) == SIGN_EXTEND
8750 && GET_CODE (op1) == SIGN_EXTEND))
8751 {
8752 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8753 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8754
8755 if (speed)
8756 {
8757 if (compound_p)
8758 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8759 cost += extra_cost->mult[0].extend_add;
8760 else
8761 /* MUL/SMULL/UMULL. */
8762 cost += extra_cost->mult[0].extend;
8763 }
8764
8765 return cost;
8766 }
8767
8768 /* This is either an integer multiply or a MADD. In both cases
8769 we want to recurse and cost the operands. */
8770 cost += rtx_cost (op0, mode, MULT, 0, speed);
8771 cost += rtx_cost (op1, mode, MULT, 1, speed);
8772
8773 if (speed)
8774 {
8775 if (compound_p)
8776 /* MADD/MSUB. */
8777 cost += extra_cost->mult[mode == DImode].add;
8778 else
8779 /* MUL. */
8780 cost += extra_cost->mult[mode == DImode].simple;
8781 }
8782
8783 return cost;
8784 }
8785 else
8786 {
8787 if (speed)
8788 {
8789 /* Floating-point FMA/FMUL can also support negations of the
8790 operands, unless the rounding mode is upward or downward in
8791 which case FNMUL is different than FMUL with operand negation. */
8792 bool neg0 = GET_CODE (op0) == NEG;
8793 bool neg1 = GET_CODE (op1) == NEG;
8794 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8795 {
8796 if (neg0)
8797 op0 = XEXP (op0, 0);
8798 if (neg1)
8799 op1 = XEXP (op1, 0);
8800 }
8801
8802 if (compound_p)
8803 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8804 cost += extra_cost->fp[mode == DFmode].fma;
8805 else
8806 /* FMUL/FNMUL. */
8807 cost += extra_cost->fp[mode == DFmode].mult;
8808 }
8809
8810 cost += rtx_cost (op0, mode, MULT, 0, speed);
8811 cost += rtx_cost (op1, mode, MULT, 1, speed);
8812 return cost;
8813 }
8814 }
8815
8816 static int
8817 aarch64_address_cost (rtx x,
8818 machine_mode mode,
8819 addr_space_t as ATTRIBUTE_UNUSED,
8820 bool speed)
8821 {
8822 enum rtx_code c = GET_CODE (x);
8823 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8824 struct aarch64_address_info info;
8825 int cost = 0;
8826 info.shift = 0;
8827
8828 if (!aarch64_classify_address (&info, x, mode, false))
8829 {
8830 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8831 {
8832 /* This is a CONST or SYMBOL ref which will be split
8833 in a different way depending on the code model in use.
8834 Cost it through the generic infrastructure. */
8835 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8836 /* Divide through by the cost of one instruction to
8837 bring it to the same units as the address costs. */
8838 cost_symbol_ref /= COSTS_N_INSNS (1);
8839 /* The cost is then the cost of preparing the address,
8840 followed by an immediate (possibly 0) offset. */
8841 return cost_symbol_ref + addr_cost->imm_offset;
8842 }
8843 else
8844 {
8845 /* This is most likely a jump table from a case
8846 statement. */
8847 return addr_cost->register_offset;
8848 }
8849 }
8850
8851 switch (info.type)
8852 {
8853 case ADDRESS_LO_SUM:
8854 case ADDRESS_SYMBOLIC:
8855 case ADDRESS_REG_IMM:
8856 cost += addr_cost->imm_offset;
8857 break;
8858
8859 case ADDRESS_REG_WB:
8860 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8861 cost += addr_cost->pre_modify;
8862 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8863 cost += addr_cost->post_modify;
8864 else
8865 gcc_unreachable ();
8866
8867 break;
8868
8869 case ADDRESS_REG_REG:
8870 cost += addr_cost->register_offset;
8871 break;
8872
8873 case ADDRESS_REG_SXTW:
8874 cost += addr_cost->register_sextend;
8875 break;
8876
8877 case ADDRESS_REG_UXTW:
8878 cost += addr_cost->register_zextend;
8879 break;
8880
8881 default:
8882 gcc_unreachable ();
8883 }
8884
8885
8886 if (info.shift > 0)
8887 {
8888 /* For the sake of calculating the cost of the shifted register
8889 component, we can treat same sized modes in the same way. */
8890 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8891 cost += addr_cost->addr_scale_costs.hi;
8892 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8893 cost += addr_cost->addr_scale_costs.si;
8894 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8895 cost += addr_cost->addr_scale_costs.di;
8896 else
8897 /* We can't tell, or this is a 128-bit vector. */
8898 cost += addr_cost->addr_scale_costs.ti;
8899 }
8900
8901 return cost;
8902 }
8903
8904 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8905 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8906 to be taken. */
8907
8908 int
8909 aarch64_branch_cost (bool speed_p, bool predictable_p)
8910 {
8911 /* When optimizing for speed, use the cost of unpredictable branches. */
8912 const struct cpu_branch_cost *branch_costs =
8913 aarch64_tune_params.branch_costs;
8914
8915 if (!speed_p || predictable_p)
8916 return branch_costs->predictable;
8917 else
8918 return branch_costs->unpredictable;
8919 }
8920
8921 /* Return true if the RTX X in mode MODE is a zero or sign extract
8922 usable in an ADD or SUB (extended register) instruction. */
8923 static bool
8924 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8925 {
8926 /* Catch add with a sign extract.
8927 This is add_<optab><mode>_multp2. */
8928 if (GET_CODE (x) == SIGN_EXTRACT
8929 || GET_CODE (x) == ZERO_EXTRACT)
8930 {
8931 rtx op0 = XEXP (x, 0);
8932 rtx op1 = XEXP (x, 1);
8933 rtx op2 = XEXP (x, 2);
8934
8935 if (GET_CODE (op0) == MULT
8936 && CONST_INT_P (op1)
8937 && op2 == const0_rtx
8938 && CONST_INT_P (XEXP (op0, 1))
8939 && aarch64_is_extend_from_extract (mode,
8940 XEXP (op0, 1),
8941 op1))
8942 {
8943 return true;
8944 }
8945 }
8946 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8947 No shift. */
8948 else if (GET_CODE (x) == SIGN_EXTEND
8949 || GET_CODE (x) == ZERO_EXTEND)
8950 return REG_P (XEXP (x, 0));
8951
8952 return false;
8953 }
8954
8955 static bool
8956 aarch64_frint_unspec_p (unsigned int u)
8957 {
8958 switch (u)
8959 {
8960 case UNSPEC_FRINTZ:
8961 case UNSPEC_FRINTP:
8962 case UNSPEC_FRINTM:
8963 case UNSPEC_FRINTA:
8964 case UNSPEC_FRINTN:
8965 case UNSPEC_FRINTX:
8966 case UNSPEC_FRINTI:
8967 return true;
8968
8969 default:
8970 return false;
8971 }
8972 }
8973
8974 /* Return true iff X is an rtx that will match an extr instruction
8975 i.e. as described in the *extr<mode>5_insn family of patterns.
8976 OP0 and OP1 will be set to the operands of the shifts involved
8977 on success and will be NULL_RTX otherwise. */
8978
8979 static bool
8980 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8981 {
8982 rtx op0, op1;
8983 scalar_int_mode mode;
8984 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8985 return false;
8986
8987 *res_op0 = NULL_RTX;
8988 *res_op1 = NULL_RTX;
8989
8990 if (GET_CODE (x) != IOR)
8991 return false;
8992
8993 op0 = XEXP (x, 0);
8994 op1 = XEXP (x, 1);
8995
8996 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8997 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8998 {
8999 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9000 if (GET_CODE (op1) == ASHIFT)
9001 std::swap (op0, op1);
9002
9003 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9004 return false;
9005
9006 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9007 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9008
9009 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9010 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9011 {
9012 *res_op0 = XEXP (op0, 0);
9013 *res_op1 = XEXP (op1, 0);
9014 return true;
9015 }
9016 }
9017
9018 return false;
9019 }
9020
9021 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9022 storing it in *COST. Result is true if the total cost of the operation
9023 has now been calculated. */
9024 static bool
9025 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9026 {
9027 rtx inner;
9028 rtx comparator;
9029 enum rtx_code cmpcode;
9030
9031 if (COMPARISON_P (op0))
9032 {
9033 inner = XEXP (op0, 0);
9034 comparator = XEXP (op0, 1);
9035 cmpcode = GET_CODE (op0);
9036 }
9037 else
9038 {
9039 inner = op0;
9040 comparator = const0_rtx;
9041 cmpcode = NE;
9042 }
9043
9044 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9045 {
9046 /* Conditional branch. */
9047 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9048 return true;
9049 else
9050 {
9051 if (cmpcode == NE || cmpcode == EQ)
9052 {
9053 if (comparator == const0_rtx)
9054 {
9055 /* TBZ/TBNZ/CBZ/CBNZ. */
9056 if (GET_CODE (inner) == ZERO_EXTRACT)
9057 /* TBZ/TBNZ. */
9058 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9059 ZERO_EXTRACT, 0, speed);
9060 else
9061 /* CBZ/CBNZ. */
9062 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9063
9064 return true;
9065 }
9066 }
9067 else if (cmpcode == LT || cmpcode == GE)
9068 {
9069 /* TBZ/TBNZ. */
9070 if (comparator == const0_rtx)
9071 return true;
9072 }
9073 }
9074 }
9075 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9076 {
9077 /* CCMP. */
9078 if (GET_CODE (op1) == COMPARE)
9079 {
9080 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9081 if (XEXP (op1, 1) == const0_rtx)
9082 *cost += 1;
9083 if (speed)
9084 {
9085 machine_mode mode = GET_MODE (XEXP (op1, 0));
9086 const struct cpu_cost_table *extra_cost
9087 = aarch64_tune_params.insn_extra_cost;
9088
9089 if (GET_MODE_CLASS (mode) == MODE_INT)
9090 *cost += extra_cost->alu.arith;
9091 else
9092 *cost += extra_cost->fp[mode == DFmode].compare;
9093 }
9094 return true;
9095 }
9096
9097 /* It's a conditional operation based on the status flags,
9098 so it must be some flavor of CSEL. */
9099
9100 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9101 if (GET_CODE (op1) == NEG
9102 || GET_CODE (op1) == NOT
9103 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9104 op1 = XEXP (op1, 0);
9105 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9106 {
9107 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9108 op1 = XEXP (op1, 0);
9109 op2 = XEXP (op2, 0);
9110 }
9111
9112 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9113 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9114 return true;
9115 }
9116
9117 /* We don't know what this is, cost all operands. */
9118 return false;
9119 }
9120
9121 /* Check whether X is a bitfield operation of the form shift + extend that
9122 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9123 operand to which the bitfield operation is applied. Otherwise return
9124 NULL_RTX. */
9125
9126 static rtx
9127 aarch64_extend_bitfield_pattern_p (rtx x)
9128 {
9129 rtx_code outer_code = GET_CODE (x);
9130 machine_mode outer_mode = GET_MODE (x);
9131
9132 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9133 && outer_mode != SImode && outer_mode != DImode)
9134 return NULL_RTX;
9135
9136 rtx inner = XEXP (x, 0);
9137 rtx_code inner_code = GET_CODE (inner);
9138 machine_mode inner_mode = GET_MODE (inner);
9139 rtx op = NULL_RTX;
9140
9141 switch (inner_code)
9142 {
9143 case ASHIFT:
9144 if (CONST_INT_P (XEXP (inner, 1))
9145 && (inner_mode == QImode || inner_mode == HImode))
9146 op = XEXP (inner, 0);
9147 break;
9148 case LSHIFTRT:
9149 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9150 && (inner_mode == QImode || inner_mode == HImode))
9151 op = XEXP (inner, 0);
9152 break;
9153 case ASHIFTRT:
9154 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9155 && (inner_mode == QImode || inner_mode == HImode))
9156 op = XEXP (inner, 0);
9157 break;
9158 default:
9159 break;
9160 }
9161
9162 return op;
9163 }
9164
9165 /* Return true if the mask and a shift amount from an RTX of the form
9166 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9167 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9168
9169 bool
9170 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9171 rtx shft_amnt)
9172 {
9173 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9174 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9175 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9176 && (INTVAL (mask)
9177 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9178 }
9179
9180 /* Calculate the cost of calculating X, storing it in *COST. Result
9181 is true if the total cost of the operation has now been calculated. */
9182 static bool
9183 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9184 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9185 {
9186 rtx op0, op1, op2;
9187 const struct cpu_cost_table *extra_cost
9188 = aarch64_tune_params.insn_extra_cost;
9189 int code = GET_CODE (x);
9190 scalar_int_mode int_mode;
9191
9192 /* By default, assume that everything has equivalent cost to the
9193 cheapest instruction. Any additional costs are applied as a delta
9194 above this default. */
9195 *cost = COSTS_N_INSNS (1);
9196
9197 switch (code)
9198 {
9199 case SET:
9200 /* The cost depends entirely on the operands to SET. */
9201 *cost = 0;
9202 op0 = SET_DEST (x);
9203 op1 = SET_SRC (x);
9204
9205 switch (GET_CODE (op0))
9206 {
9207 case MEM:
9208 if (speed)
9209 {
9210 rtx address = XEXP (op0, 0);
9211 if (VECTOR_MODE_P (mode))
9212 *cost += extra_cost->ldst.storev;
9213 else if (GET_MODE_CLASS (mode) == MODE_INT)
9214 *cost += extra_cost->ldst.store;
9215 else if (mode == SFmode)
9216 *cost += extra_cost->ldst.storef;
9217 else if (mode == DFmode)
9218 *cost += extra_cost->ldst.stored;
9219
9220 *cost +=
9221 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9222 0, speed));
9223 }
9224
9225 *cost += rtx_cost (op1, mode, SET, 1, speed);
9226 return true;
9227
9228 case SUBREG:
9229 if (! REG_P (SUBREG_REG (op0)))
9230 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9231
9232 /* Fall through. */
9233 case REG:
9234 /* The cost is one per vector-register copied. */
9235 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9236 {
9237 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9238 *cost = COSTS_N_INSNS (nregs);
9239 }
9240 /* const0_rtx is in general free, but we will use an
9241 instruction to set a register to 0. */
9242 else if (REG_P (op1) || op1 == const0_rtx)
9243 {
9244 /* The cost is 1 per register copied. */
9245 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9246 *cost = COSTS_N_INSNS (nregs);
9247 }
9248 else
9249 /* Cost is just the cost of the RHS of the set. */
9250 *cost += rtx_cost (op1, mode, SET, 1, speed);
9251 return true;
9252
9253 case ZERO_EXTRACT:
9254 case SIGN_EXTRACT:
9255 /* Bit-field insertion. Strip any redundant widening of
9256 the RHS to meet the width of the target. */
9257 if (GET_CODE (op1) == SUBREG)
9258 op1 = SUBREG_REG (op1);
9259 if ((GET_CODE (op1) == ZERO_EXTEND
9260 || GET_CODE (op1) == SIGN_EXTEND)
9261 && CONST_INT_P (XEXP (op0, 1))
9262 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9263 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9264 op1 = XEXP (op1, 0);
9265
9266 if (CONST_INT_P (op1))
9267 {
9268 /* MOV immediate is assumed to always be cheap. */
9269 *cost = COSTS_N_INSNS (1);
9270 }
9271 else
9272 {
9273 /* BFM. */
9274 if (speed)
9275 *cost += extra_cost->alu.bfi;
9276 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9277 }
9278
9279 return true;
9280
9281 default:
9282 /* We can't make sense of this, assume default cost. */
9283 *cost = COSTS_N_INSNS (1);
9284 return false;
9285 }
9286 return false;
9287
9288 case CONST_INT:
9289 /* If an instruction can incorporate a constant within the
9290 instruction, the instruction's expression avoids calling
9291 rtx_cost() on the constant. If rtx_cost() is called on a
9292 constant, then it is usually because the constant must be
9293 moved into a register by one or more instructions.
9294
9295 The exception is constant 0, which can be expressed
9296 as XZR/WZR and is therefore free. The exception to this is
9297 if we have (set (reg) (const0_rtx)) in which case we must cost
9298 the move. However, we can catch that when we cost the SET, so
9299 we don't need to consider that here. */
9300 if (x == const0_rtx)
9301 *cost = 0;
9302 else
9303 {
9304 /* To an approximation, building any other constant is
9305 proportionally expensive to the number of instructions
9306 required to build that constant. This is true whether we
9307 are compiling for SPEED or otherwise. */
9308 if (!is_a <scalar_int_mode> (mode, &int_mode))
9309 int_mode = word_mode;
9310 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9311 (NULL_RTX, x, false, int_mode));
9312 }
9313 return true;
9314
9315 case CONST_DOUBLE:
9316
9317 /* First determine number of instructions to do the move
9318 as an integer constant. */
9319 if (!aarch64_float_const_representable_p (x)
9320 && !aarch64_can_const_movi_rtx_p (x, mode)
9321 && aarch64_float_const_rtx_p (x))
9322 {
9323 unsigned HOST_WIDE_INT ival;
9324 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9325 gcc_assert (succeed);
9326
9327 scalar_int_mode imode = (mode == HFmode
9328 ? SImode
9329 : int_mode_for_mode (mode).require ());
9330 int ncost = aarch64_internal_mov_immediate
9331 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9332 *cost += COSTS_N_INSNS (ncost);
9333 return true;
9334 }
9335
9336 if (speed)
9337 {
9338 /* mov[df,sf]_aarch64. */
9339 if (aarch64_float_const_representable_p (x))
9340 /* FMOV (scalar immediate). */
9341 *cost += extra_cost->fp[mode == DFmode].fpconst;
9342 else if (!aarch64_float_const_zero_rtx_p (x))
9343 {
9344 /* This will be a load from memory. */
9345 if (mode == DFmode)
9346 *cost += extra_cost->ldst.loadd;
9347 else
9348 *cost += extra_cost->ldst.loadf;
9349 }
9350 else
9351 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9352 or MOV v0.s[0], wzr - neither of which are modeled by the
9353 cost tables. Just use the default cost. */
9354 {
9355 }
9356 }
9357
9358 return true;
9359
9360 case MEM:
9361 if (speed)
9362 {
9363 /* For loads we want the base cost of a load, plus an
9364 approximation for the additional cost of the addressing
9365 mode. */
9366 rtx address = XEXP (x, 0);
9367 if (VECTOR_MODE_P (mode))
9368 *cost += extra_cost->ldst.loadv;
9369 else if (GET_MODE_CLASS (mode) == MODE_INT)
9370 *cost += extra_cost->ldst.load;
9371 else if (mode == SFmode)
9372 *cost += extra_cost->ldst.loadf;
9373 else if (mode == DFmode)
9374 *cost += extra_cost->ldst.loadd;
9375
9376 *cost +=
9377 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9378 0, speed));
9379 }
9380
9381 return true;
9382
9383 case NEG:
9384 op0 = XEXP (x, 0);
9385
9386 if (VECTOR_MODE_P (mode))
9387 {
9388 if (speed)
9389 {
9390 /* FNEG. */
9391 *cost += extra_cost->vect.alu;
9392 }
9393 return false;
9394 }
9395
9396 if (GET_MODE_CLASS (mode) == MODE_INT)
9397 {
9398 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9399 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9400 {
9401 /* CSETM. */
9402 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9403 return true;
9404 }
9405
9406 /* Cost this as SUB wzr, X. */
9407 op0 = CONST0_RTX (mode);
9408 op1 = XEXP (x, 0);
9409 goto cost_minus;
9410 }
9411
9412 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9413 {
9414 /* Support (neg(fma...)) as a single instruction only if
9415 sign of zeros is unimportant. This matches the decision
9416 making in aarch64.md. */
9417 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9418 {
9419 /* FNMADD. */
9420 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9421 return true;
9422 }
9423 if (GET_CODE (op0) == MULT)
9424 {
9425 /* FNMUL. */
9426 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9427 return true;
9428 }
9429 if (speed)
9430 /* FNEG. */
9431 *cost += extra_cost->fp[mode == DFmode].neg;
9432 return false;
9433 }
9434
9435 return false;
9436
9437 case CLRSB:
9438 case CLZ:
9439 if (speed)
9440 {
9441 if (VECTOR_MODE_P (mode))
9442 *cost += extra_cost->vect.alu;
9443 else
9444 *cost += extra_cost->alu.clz;
9445 }
9446
9447 return false;
9448
9449 case COMPARE:
9450 op0 = XEXP (x, 0);
9451 op1 = XEXP (x, 1);
9452
9453 if (op1 == const0_rtx
9454 && GET_CODE (op0) == AND)
9455 {
9456 x = op0;
9457 mode = GET_MODE (op0);
9458 goto cost_logic;
9459 }
9460
9461 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9462 {
9463 /* TODO: A write to the CC flags possibly costs extra, this
9464 needs encoding in the cost tables. */
9465
9466 mode = GET_MODE (op0);
9467 /* ANDS. */
9468 if (GET_CODE (op0) == AND)
9469 {
9470 x = op0;
9471 goto cost_logic;
9472 }
9473
9474 if (GET_CODE (op0) == PLUS)
9475 {
9476 /* ADDS (and CMN alias). */
9477 x = op0;
9478 goto cost_plus;
9479 }
9480
9481 if (GET_CODE (op0) == MINUS)
9482 {
9483 /* SUBS. */
9484 x = op0;
9485 goto cost_minus;
9486 }
9487
9488 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9489 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9490 && CONST_INT_P (XEXP (op0, 2)))
9491 {
9492 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9493 Handle it here directly rather than going to cost_logic
9494 since we know the immediate generated for the TST is valid
9495 so we can avoid creating an intermediate rtx for it only
9496 for costing purposes. */
9497 if (speed)
9498 *cost += extra_cost->alu.logical;
9499
9500 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9501 ZERO_EXTRACT, 0, speed);
9502 return true;
9503 }
9504
9505 if (GET_CODE (op1) == NEG)
9506 {
9507 /* CMN. */
9508 if (speed)
9509 *cost += extra_cost->alu.arith;
9510
9511 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9512 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9513 return true;
9514 }
9515
9516 /* CMP.
9517
9518 Compare can freely swap the order of operands, and
9519 canonicalization puts the more complex operation first.
9520 But the integer MINUS logic expects the shift/extend
9521 operation in op1. */
9522 if (! (REG_P (op0)
9523 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9524 {
9525 op0 = XEXP (x, 1);
9526 op1 = XEXP (x, 0);
9527 }
9528 goto cost_minus;
9529 }
9530
9531 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9532 {
9533 /* FCMP. */
9534 if (speed)
9535 *cost += extra_cost->fp[mode == DFmode].compare;
9536
9537 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9538 {
9539 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9540 /* FCMP supports constant 0.0 for no extra cost. */
9541 return true;
9542 }
9543 return false;
9544 }
9545
9546 if (VECTOR_MODE_P (mode))
9547 {
9548 /* Vector compare. */
9549 if (speed)
9550 *cost += extra_cost->vect.alu;
9551
9552 if (aarch64_float_const_zero_rtx_p (op1))
9553 {
9554 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9555 cost. */
9556 return true;
9557 }
9558 return false;
9559 }
9560 return false;
9561
9562 case MINUS:
9563 {
9564 op0 = XEXP (x, 0);
9565 op1 = XEXP (x, 1);
9566
9567 cost_minus:
9568 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9569
9570 /* Detect valid immediates. */
9571 if ((GET_MODE_CLASS (mode) == MODE_INT
9572 || (GET_MODE_CLASS (mode) == MODE_CC
9573 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9574 && CONST_INT_P (op1)
9575 && aarch64_uimm12_shift (INTVAL (op1)))
9576 {
9577 if (speed)
9578 /* SUB(S) (immediate). */
9579 *cost += extra_cost->alu.arith;
9580 return true;
9581 }
9582
9583 /* Look for SUB (extended register). */
9584 if (is_a <scalar_int_mode> (mode, &int_mode)
9585 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9586 {
9587 if (speed)
9588 *cost += extra_cost->alu.extend_arith;
9589
9590 op1 = aarch64_strip_extend (op1, true);
9591 *cost += rtx_cost (op1, VOIDmode,
9592 (enum rtx_code) GET_CODE (op1), 0, speed);
9593 return true;
9594 }
9595
9596 rtx new_op1 = aarch64_strip_extend (op1, false);
9597
9598 /* Cost this as an FMA-alike operation. */
9599 if ((GET_CODE (new_op1) == MULT
9600 || aarch64_shift_p (GET_CODE (new_op1)))
9601 && code != COMPARE)
9602 {
9603 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9604 (enum rtx_code) code,
9605 speed);
9606 return true;
9607 }
9608
9609 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9610
9611 if (speed)
9612 {
9613 if (VECTOR_MODE_P (mode))
9614 {
9615 /* Vector SUB. */
9616 *cost += extra_cost->vect.alu;
9617 }
9618 else if (GET_MODE_CLASS (mode) == MODE_INT)
9619 {
9620 /* SUB(S). */
9621 *cost += extra_cost->alu.arith;
9622 }
9623 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9624 {
9625 /* FSUB. */
9626 *cost += extra_cost->fp[mode == DFmode].addsub;
9627 }
9628 }
9629 return true;
9630 }
9631
9632 case PLUS:
9633 {
9634 rtx new_op0;
9635
9636 op0 = XEXP (x, 0);
9637 op1 = XEXP (x, 1);
9638
9639 cost_plus:
9640 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9641 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9642 {
9643 /* CSINC. */
9644 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9645 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9646 return true;
9647 }
9648
9649 if (GET_MODE_CLASS (mode) == MODE_INT
9650 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9651 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9652 {
9653 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9654
9655 if (speed)
9656 /* ADD (immediate). */
9657 *cost += extra_cost->alu.arith;
9658 return true;
9659 }
9660
9661 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9662
9663 /* Look for ADD (extended register). */
9664 if (is_a <scalar_int_mode> (mode, &int_mode)
9665 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9666 {
9667 if (speed)
9668 *cost += extra_cost->alu.extend_arith;
9669
9670 op0 = aarch64_strip_extend (op0, true);
9671 *cost += rtx_cost (op0, VOIDmode,
9672 (enum rtx_code) GET_CODE (op0), 0, speed);
9673 return true;
9674 }
9675
9676 /* Strip any extend, leave shifts behind as we will
9677 cost them through mult_cost. */
9678 new_op0 = aarch64_strip_extend (op0, false);
9679
9680 if (GET_CODE (new_op0) == MULT
9681 || aarch64_shift_p (GET_CODE (new_op0)))
9682 {
9683 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9684 speed);
9685 return true;
9686 }
9687
9688 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9689
9690 if (speed)
9691 {
9692 if (VECTOR_MODE_P (mode))
9693 {
9694 /* Vector ADD. */
9695 *cost += extra_cost->vect.alu;
9696 }
9697 else if (GET_MODE_CLASS (mode) == MODE_INT)
9698 {
9699 /* ADD. */
9700 *cost += extra_cost->alu.arith;
9701 }
9702 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9703 {
9704 /* FADD. */
9705 *cost += extra_cost->fp[mode == DFmode].addsub;
9706 }
9707 }
9708 return true;
9709 }
9710
9711 case BSWAP:
9712 *cost = COSTS_N_INSNS (1);
9713
9714 if (speed)
9715 {
9716 if (VECTOR_MODE_P (mode))
9717 *cost += extra_cost->vect.alu;
9718 else
9719 *cost += extra_cost->alu.rev;
9720 }
9721 return false;
9722
9723 case IOR:
9724 if (aarch_rev16_p (x))
9725 {
9726 *cost = COSTS_N_INSNS (1);
9727
9728 if (speed)
9729 {
9730 if (VECTOR_MODE_P (mode))
9731 *cost += extra_cost->vect.alu;
9732 else
9733 *cost += extra_cost->alu.rev;
9734 }
9735 return true;
9736 }
9737
9738 if (aarch64_extr_rtx_p (x, &op0, &op1))
9739 {
9740 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9741 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9742 if (speed)
9743 *cost += extra_cost->alu.shift;
9744
9745 return true;
9746 }
9747 /* Fall through. */
9748 case XOR:
9749 case AND:
9750 cost_logic:
9751 op0 = XEXP (x, 0);
9752 op1 = XEXP (x, 1);
9753
9754 if (VECTOR_MODE_P (mode))
9755 {
9756 if (speed)
9757 *cost += extra_cost->vect.alu;
9758 return true;
9759 }
9760
9761 if (code == AND
9762 && GET_CODE (op0) == MULT
9763 && CONST_INT_P (XEXP (op0, 1))
9764 && CONST_INT_P (op1)
9765 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9766 INTVAL (op1)) != 0)
9767 {
9768 /* This is a UBFM/SBFM. */
9769 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9770 if (speed)
9771 *cost += extra_cost->alu.bfx;
9772 return true;
9773 }
9774
9775 if (is_int_mode (mode, &int_mode))
9776 {
9777 if (CONST_INT_P (op1))
9778 {
9779 /* We have a mask + shift version of a UBFIZ
9780 i.e. the *andim_ashift<mode>_bfiz pattern. */
9781 if (GET_CODE (op0) == ASHIFT
9782 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9783 XEXP (op0, 1)))
9784 {
9785 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9786 (enum rtx_code) code, 0, speed);
9787 if (speed)
9788 *cost += extra_cost->alu.bfx;
9789
9790 return true;
9791 }
9792 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9793 {
9794 /* We possibly get the immediate for free, this is not
9795 modelled. */
9796 *cost += rtx_cost (op0, int_mode,
9797 (enum rtx_code) code, 0, speed);
9798 if (speed)
9799 *cost += extra_cost->alu.logical;
9800
9801 return true;
9802 }
9803 }
9804 else
9805 {
9806 rtx new_op0 = op0;
9807
9808 /* Handle ORN, EON, or BIC. */
9809 if (GET_CODE (op0) == NOT)
9810 op0 = XEXP (op0, 0);
9811
9812 new_op0 = aarch64_strip_shift (op0);
9813
9814 /* If we had a shift on op0 then this is a logical-shift-
9815 by-register/immediate operation. Otherwise, this is just
9816 a logical operation. */
9817 if (speed)
9818 {
9819 if (new_op0 != op0)
9820 {
9821 /* Shift by immediate. */
9822 if (CONST_INT_P (XEXP (op0, 1)))
9823 *cost += extra_cost->alu.log_shift;
9824 else
9825 *cost += extra_cost->alu.log_shift_reg;
9826 }
9827 else
9828 *cost += extra_cost->alu.logical;
9829 }
9830
9831 /* In both cases we want to cost both operands. */
9832 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9833 0, speed);
9834 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9835 1, speed);
9836
9837 return true;
9838 }
9839 }
9840 return false;
9841
9842 case NOT:
9843 x = XEXP (x, 0);
9844 op0 = aarch64_strip_shift (x);
9845
9846 if (VECTOR_MODE_P (mode))
9847 {
9848 /* Vector NOT. */
9849 *cost += extra_cost->vect.alu;
9850 return false;
9851 }
9852
9853 /* MVN-shifted-reg. */
9854 if (op0 != x)
9855 {
9856 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9857
9858 if (speed)
9859 *cost += extra_cost->alu.log_shift;
9860
9861 return true;
9862 }
9863 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9864 Handle the second form here taking care that 'a' in the above can
9865 be a shift. */
9866 else if (GET_CODE (op0) == XOR)
9867 {
9868 rtx newop0 = XEXP (op0, 0);
9869 rtx newop1 = XEXP (op0, 1);
9870 rtx op0_stripped = aarch64_strip_shift (newop0);
9871
9872 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9873 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9874
9875 if (speed)
9876 {
9877 if (op0_stripped != newop0)
9878 *cost += extra_cost->alu.log_shift;
9879 else
9880 *cost += extra_cost->alu.logical;
9881 }
9882
9883 return true;
9884 }
9885 /* MVN. */
9886 if (speed)
9887 *cost += extra_cost->alu.logical;
9888
9889 return false;
9890
9891 case ZERO_EXTEND:
9892
9893 op0 = XEXP (x, 0);
9894 /* If a value is written in SI mode, then zero extended to DI
9895 mode, the operation will in general be free as a write to
9896 a 'w' register implicitly zeroes the upper bits of an 'x'
9897 register. However, if this is
9898
9899 (set (reg) (zero_extend (reg)))
9900
9901 we must cost the explicit register move. */
9902 if (mode == DImode
9903 && GET_MODE (op0) == SImode
9904 && outer == SET)
9905 {
9906 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9907
9908 /* If OP_COST is non-zero, then the cost of the zero extend
9909 is effectively the cost of the inner operation. Otherwise
9910 we have a MOV instruction and we take the cost from the MOV
9911 itself. This is true independently of whether we are
9912 optimizing for space or time. */
9913 if (op_cost)
9914 *cost = op_cost;
9915
9916 return true;
9917 }
9918 else if (MEM_P (op0))
9919 {
9920 /* All loads can zero extend to any size for free. */
9921 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9922 return true;
9923 }
9924
9925 op0 = aarch64_extend_bitfield_pattern_p (x);
9926 if (op0)
9927 {
9928 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9929 if (speed)
9930 *cost += extra_cost->alu.bfx;
9931 return true;
9932 }
9933
9934 if (speed)
9935 {
9936 if (VECTOR_MODE_P (mode))
9937 {
9938 /* UMOV. */
9939 *cost += extra_cost->vect.alu;
9940 }
9941 else
9942 {
9943 /* We generate an AND instead of UXTB/UXTH. */
9944 *cost += extra_cost->alu.logical;
9945 }
9946 }
9947 return false;
9948
9949 case SIGN_EXTEND:
9950 if (MEM_P (XEXP (x, 0)))
9951 {
9952 /* LDRSH. */
9953 if (speed)
9954 {
9955 rtx address = XEXP (XEXP (x, 0), 0);
9956 *cost += extra_cost->ldst.load_sign_extend;
9957
9958 *cost +=
9959 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9960 0, speed));
9961 }
9962 return true;
9963 }
9964
9965 op0 = aarch64_extend_bitfield_pattern_p (x);
9966 if (op0)
9967 {
9968 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9969 if (speed)
9970 *cost += extra_cost->alu.bfx;
9971 return true;
9972 }
9973
9974 if (speed)
9975 {
9976 if (VECTOR_MODE_P (mode))
9977 *cost += extra_cost->vect.alu;
9978 else
9979 *cost += extra_cost->alu.extend;
9980 }
9981 return false;
9982
9983 case ASHIFT:
9984 op0 = XEXP (x, 0);
9985 op1 = XEXP (x, 1);
9986
9987 if (CONST_INT_P (op1))
9988 {
9989 if (speed)
9990 {
9991 if (VECTOR_MODE_P (mode))
9992 {
9993 /* Vector shift (immediate). */
9994 *cost += extra_cost->vect.alu;
9995 }
9996 else
9997 {
9998 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
9999 aliases. */
10000 *cost += extra_cost->alu.shift;
10001 }
10002 }
10003
10004 /* We can incorporate zero/sign extend for free. */
10005 if (GET_CODE (op0) == ZERO_EXTEND
10006 || GET_CODE (op0) == SIGN_EXTEND)
10007 op0 = XEXP (op0, 0);
10008
10009 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10010 return true;
10011 }
10012 else
10013 {
10014 if (VECTOR_MODE_P (mode))
10015 {
10016 if (speed)
10017 /* Vector shift (register). */
10018 *cost += extra_cost->vect.alu;
10019 }
10020 else
10021 {
10022 if (speed)
10023 /* LSLV. */
10024 *cost += extra_cost->alu.shift_reg;
10025
10026 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10027 && CONST_INT_P (XEXP (op1, 1))
10028 && known_eq (INTVAL (XEXP (op1, 1)),
10029 GET_MODE_BITSIZE (mode) - 1))
10030 {
10031 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10032 /* We already demanded XEXP (op1, 0) to be REG_P, so
10033 don't recurse into it. */
10034 return true;
10035 }
10036 }
10037 return false; /* All arguments need to be in registers. */
10038 }
10039
10040 case ROTATE:
10041 case ROTATERT:
10042 case LSHIFTRT:
10043 case ASHIFTRT:
10044 op0 = XEXP (x, 0);
10045 op1 = XEXP (x, 1);
10046
10047 if (CONST_INT_P (op1))
10048 {
10049 /* ASR (immediate) and friends. */
10050 if (speed)
10051 {
10052 if (VECTOR_MODE_P (mode))
10053 *cost += extra_cost->vect.alu;
10054 else
10055 *cost += extra_cost->alu.shift;
10056 }
10057
10058 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10059 return true;
10060 }
10061 else
10062 {
10063 if (VECTOR_MODE_P (mode))
10064 {
10065 if (speed)
10066 /* Vector shift (register). */
10067 *cost += extra_cost->vect.alu;
10068 }
10069 else
10070 {
10071 if (speed)
10072 /* ASR (register) and friends. */
10073 *cost += extra_cost->alu.shift_reg;
10074
10075 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10076 && CONST_INT_P (XEXP (op1, 1))
10077 && known_eq (INTVAL (XEXP (op1, 1)),
10078 GET_MODE_BITSIZE (mode) - 1))
10079 {
10080 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10081 /* We already demanded XEXP (op1, 0) to be REG_P, so
10082 don't recurse into it. */
10083 return true;
10084 }
10085 }
10086 return false; /* All arguments need to be in registers. */
10087 }
10088
10089 case SYMBOL_REF:
10090
10091 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10092 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10093 {
10094 /* LDR. */
10095 if (speed)
10096 *cost += extra_cost->ldst.load;
10097 }
10098 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10099 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10100 {
10101 /* ADRP, followed by ADD. */
10102 *cost += COSTS_N_INSNS (1);
10103 if (speed)
10104 *cost += 2 * extra_cost->alu.arith;
10105 }
10106 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10107 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10108 {
10109 /* ADR. */
10110 if (speed)
10111 *cost += extra_cost->alu.arith;
10112 }
10113
10114 if (flag_pic)
10115 {
10116 /* One extra load instruction, after accessing the GOT. */
10117 *cost += COSTS_N_INSNS (1);
10118 if (speed)
10119 *cost += extra_cost->ldst.load;
10120 }
10121 return true;
10122
10123 case HIGH:
10124 case LO_SUM:
10125 /* ADRP/ADD (immediate). */
10126 if (speed)
10127 *cost += extra_cost->alu.arith;
10128 return true;
10129
10130 case ZERO_EXTRACT:
10131 case SIGN_EXTRACT:
10132 /* UBFX/SBFX. */
10133 if (speed)
10134 {
10135 if (VECTOR_MODE_P (mode))
10136 *cost += extra_cost->vect.alu;
10137 else
10138 *cost += extra_cost->alu.bfx;
10139 }
10140
10141 /* We can trust that the immediates used will be correct (there
10142 are no by-register forms), so we need only cost op0. */
10143 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10144 return true;
10145
10146 case MULT:
10147 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10148 /* aarch64_rtx_mult_cost always handles recursion to its
10149 operands. */
10150 return true;
10151
10152 case MOD:
10153 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10154 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
10155 an unconditional negate. This case should only ever be reached through
10156 the set_smod_pow2_cheap check in expmed.c. */
10157 if (CONST_INT_P (XEXP (x, 1))
10158 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10159 && (mode == SImode || mode == DImode))
10160 {
10161 /* We expand to 4 instructions. Reset the baseline. */
10162 *cost = COSTS_N_INSNS (4);
10163
10164 if (speed)
10165 *cost += 2 * extra_cost->alu.logical
10166 + 2 * extra_cost->alu.arith;
10167
10168 return true;
10169 }
10170
10171 /* Fall-through. */
10172 case UMOD:
10173 if (speed)
10174 {
10175 /* Slighly prefer UMOD over SMOD. */
10176 if (VECTOR_MODE_P (mode))
10177 *cost += extra_cost->vect.alu;
10178 else if (GET_MODE_CLASS (mode) == MODE_INT)
10179 *cost += (extra_cost->mult[mode == DImode].add
10180 + extra_cost->mult[mode == DImode].idiv
10181 + (code == MOD ? 1 : 0));
10182 }
10183 return false; /* All arguments need to be in registers. */
10184
10185 case DIV:
10186 case UDIV:
10187 case SQRT:
10188 if (speed)
10189 {
10190 if (VECTOR_MODE_P (mode))
10191 *cost += extra_cost->vect.alu;
10192 else if (GET_MODE_CLASS (mode) == MODE_INT)
10193 /* There is no integer SQRT, so only DIV and UDIV can get
10194 here. */
10195 *cost += (extra_cost->mult[mode == DImode].idiv
10196 /* Slighly prefer UDIV over SDIV. */
10197 + (code == DIV ? 1 : 0));
10198 else
10199 *cost += extra_cost->fp[mode == DFmode].div;
10200 }
10201 return false; /* All arguments need to be in registers. */
10202
10203 case IF_THEN_ELSE:
10204 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10205 XEXP (x, 2), cost, speed);
10206
10207 case EQ:
10208 case NE:
10209 case GT:
10210 case GTU:
10211 case LT:
10212 case LTU:
10213 case GE:
10214 case GEU:
10215 case LE:
10216 case LEU:
10217
10218 return false; /* All arguments must be in registers. */
10219
10220 case FMA:
10221 op0 = XEXP (x, 0);
10222 op1 = XEXP (x, 1);
10223 op2 = XEXP (x, 2);
10224
10225 if (speed)
10226 {
10227 if (VECTOR_MODE_P (mode))
10228 *cost += extra_cost->vect.alu;
10229 else
10230 *cost += extra_cost->fp[mode == DFmode].fma;
10231 }
10232
10233 /* FMSUB, FNMADD, and FNMSUB are free. */
10234 if (GET_CODE (op0) == NEG)
10235 op0 = XEXP (op0, 0);
10236
10237 if (GET_CODE (op2) == NEG)
10238 op2 = XEXP (op2, 0);
10239
10240 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10241 and the by-element operand as operand 0. */
10242 if (GET_CODE (op1) == NEG)
10243 op1 = XEXP (op1, 0);
10244
10245 /* Catch vector-by-element operations. The by-element operand can
10246 either be (vec_duplicate (vec_select (x))) or just
10247 (vec_select (x)), depending on whether we are multiplying by
10248 a vector or a scalar.
10249
10250 Canonicalization is not very good in these cases, FMA4 will put the
10251 by-element operand as operand 0, FNMA4 will have it as operand 1. */
10252 if (GET_CODE (op0) == VEC_DUPLICATE)
10253 op0 = XEXP (op0, 0);
10254 else if (GET_CODE (op1) == VEC_DUPLICATE)
10255 op1 = XEXP (op1, 0);
10256
10257 if (GET_CODE (op0) == VEC_SELECT)
10258 op0 = XEXP (op0, 0);
10259 else if (GET_CODE (op1) == VEC_SELECT)
10260 op1 = XEXP (op1, 0);
10261
10262 /* If the remaining parameters are not registers,
10263 get the cost to put them into registers. */
10264 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10265 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10266 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10267 return true;
10268
10269 case FLOAT:
10270 case UNSIGNED_FLOAT:
10271 if (speed)
10272 *cost += extra_cost->fp[mode == DFmode].fromint;
10273 return false;
10274
10275 case FLOAT_EXTEND:
10276 if (speed)
10277 {
10278 if (VECTOR_MODE_P (mode))
10279 {
10280 /*Vector truncate. */
10281 *cost += extra_cost->vect.alu;
10282 }
10283 else
10284 *cost += extra_cost->fp[mode == DFmode].widen;
10285 }
10286 return false;
10287
10288 case FLOAT_TRUNCATE:
10289 if (speed)
10290 {
10291 if (VECTOR_MODE_P (mode))
10292 {
10293 /*Vector conversion. */
10294 *cost += extra_cost->vect.alu;
10295 }
10296 else
10297 *cost += extra_cost->fp[mode == DFmode].narrow;
10298 }
10299 return false;
10300
10301 case FIX:
10302 case UNSIGNED_FIX:
10303 x = XEXP (x, 0);
10304 /* Strip the rounding part. They will all be implemented
10305 by the fcvt* family of instructions anyway. */
10306 if (GET_CODE (x) == UNSPEC)
10307 {
10308 unsigned int uns_code = XINT (x, 1);
10309
10310 if (uns_code == UNSPEC_FRINTA
10311 || uns_code == UNSPEC_FRINTM
10312 || uns_code == UNSPEC_FRINTN
10313 || uns_code == UNSPEC_FRINTP
10314 || uns_code == UNSPEC_FRINTZ)
10315 x = XVECEXP (x, 0, 0);
10316 }
10317
10318 if (speed)
10319 {
10320 if (VECTOR_MODE_P (mode))
10321 *cost += extra_cost->vect.alu;
10322 else
10323 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10324 }
10325
10326 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10327 fixed-point fcvt. */
10328 if (GET_CODE (x) == MULT
10329 && ((VECTOR_MODE_P (mode)
10330 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10331 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10332 {
10333 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10334 0, speed);
10335 return true;
10336 }
10337
10338 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10339 return true;
10340
10341 case ABS:
10342 if (VECTOR_MODE_P (mode))
10343 {
10344 /* ABS (vector). */
10345 if (speed)
10346 *cost += extra_cost->vect.alu;
10347 }
10348 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10349 {
10350 op0 = XEXP (x, 0);
10351
10352 /* FABD, which is analogous to FADD. */
10353 if (GET_CODE (op0) == MINUS)
10354 {
10355 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10356 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10357 if (speed)
10358 *cost += extra_cost->fp[mode == DFmode].addsub;
10359
10360 return true;
10361 }
10362 /* Simple FABS is analogous to FNEG. */
10363 if (speed)
10364 *cost += extra_cost->fp[mode == DFmode].neg;
10365 }
10366 else
10367 {
10368 /* Integer ABS will either be split to
10369 two arithmetic instructions, or will be an ABS
10370 (scalar), which we don't model. */
10371 *cost = COSTS_N_INSNS (2);
10372 if (speed)
10373 *cost += 2 * extra_cost->alu.arith;
10374 }
10375 return false;
10376
10377 case SMAX:
10378 case SMIN:
10379 if (speed)
10380 {
10381 if (VECTOR_MODE_P (mode))
10382 *cost += extra_cost->vect.alu;
10383 else
10384 {
10385 /* FMAXNM/FMINNM/FMAX/FMIN.
10386 TODO: This may not be accurate for all implementations, but
10387 we do not model this in the cost tables. */
10388 *cost += extra_cost->fp[mode == DFmode].addsub;
10389 }
10390 }
10391 return false;
10392
10393 case UNSPEC:
10394 /* The floating point round to integer frint* instructions. */
10395 if (aarch64_frint_unspec_p (XINT (x, 1)))
10396 {
10397 if (speed)
10398 *cost += extra_cost->fp[mode == DFmode].roundint;
10399
10400 return false;
10401 }
10402
10403 if (XINT (x, 1) == UNSPEC_RBIT)
10404 {
10405 if (speed)
10406 *cost += extra_cost->alu.rev;
10407
10408 return false;
10409 }
10410 break;
10411
10412 case TRUNCATE:
10413
10414 /* Decompose <su>muldi3_highpart. */
10415 if (/* (truncate:DI */
10416 mode == DImode
10417 /* (lshiftrt:TI */
10418 && GET_MODE (XEXP (x, 0)) == TImode
10419 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10420 /* (mult:TI */
10421 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10422 /* (ANY_EXTEND:TI (reg:DI))
10423 (ANY_EXTEND:TI (reg:DI))) */
10424 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10425 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10426 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10427 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10428 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10429 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10430 /* (const_int 64) */
10431 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10432 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10433 {
10434 /* UMULH/SMULH. */
10435 if (speed)
10436 *cost += extra_cost->mult[mode == DImode].extend;
10437 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10438 mode, MULT, 0, speed);
10439 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10440 mode, MULT, 1, speed);
10441 return true;
10442 }
10443
10444 /* Fall through. */
10445 default:
10446 break;
10447 }
10448
10449 if (dump_file
10450 && flag_aarch64_verbose_cost)
10451 fprintf (dump_file,
10452 "\nFailed to cost RTX. Assuming default cost.\n");
10453
10454 return true;
10455 }
10456
10457 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10458 calculated for X. This cost is stored in *COST. Returns true
10459 if the total cost of X was calculated. */
10460 static bool
10461 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10462 int param, int *cost, bool speed)
10463 {
10464 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10465
10466 if (dump_file
10467 && flag_aarch64_verbose_cost)
10468 {
10469 print_rtl_single (dump_file, x);
10470 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10471 speed ? "Hot" : "Cold",
10472 *cost, result ? "final" : "partial");
10473 }
10474
10475 return result;
10476 }
10477
10478 static int
10479 aarch64_register_move_cost (machine_mode mode,
10480 reg_class_t from_i, reg_class_t to_i)
10481 {
10482 enum reg_class from = (enum reg_class) from_i;
10483 enum reg_class to = (enum reg_class) to_i;
10484 const struct cpu_regmove_cost *regmove_cost
10485 = aarch64_tune_params.regmove_cost;
10486
10487 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10488 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10489 to = GENERAL_REGS;
10490
10491 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10492 from = GENERAL_REGS;
10493
10494 /* Moving between GPR and stack cost is the same as GP2GP. */
10495 if ((from == GENERAL_REGS && to == STACK_REG)
10496 || (to == GENERAL_REGS && from == STACK_REG))
10497 return regmove_cost->GP2GP;
10498
10499 /* To/From the stack register, we move via the gprs. */
10500 if (to == STACK_REG || from == STACK_REG)
10501 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10502 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10503
10504 if (known_eq (GET_MODE_SIZE (mode), 16))
10505 {
10506 /* 128-bit operations on general registers require 2 instructions. */
10507 if (from == GENERAL_REGS && to == GENERAL_REGS)
10508 return regmove_cost->GP2GP * 2;
10509 else if (from == GENERAL_REGS)
10510 return regmove_cost->GP2FP * 2;
10511 else if (to == GENERAL_REGS)
10512 return regmove_cost->FP2GP * 2;
10513
10514 /* When AdvSIMD instructions are disabled it is not possible to move
10515 a 128-bit value directly between Q registers. This is handled in
10516 secondary reload. A general register is used as a scratch to move
10517 the upper DI value and the lower DI value is moved directly,
10518 hence the cost is the sum of three moves. */
10519 if (! TARGET_SIMD)
10520 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10521
10522 return regmove_cost->FP2FP;
10523 }
10524
10525 if (from == GENERAL_REGS && to == GENERAL_REGS)
10526 return regmove_cost->GP2GP;
10527 else if (from == GENERAL_REGS)
10528 return regmove_cost->GP2FP;
10529 else if (to == GENERAL_REGS)
10530 return regmove_cost->FP2GP;
10531
10532 return regmove_cost->FP2FP;
10533 }
10534
10535 static int
10536 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10537 reg_class_t rclass ATTRIBUTE_UNUSED,
10538 bool in ATTRIBUTE_UNUSED)
10539 {
10540 return aarch64_tune_params.memmov_cost;
10541 }
10542
10543 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10544 to optimize 1.0/sqrt. */
10545
10546 static bool
10547 use_rsqrt_p (machine_mode mode)
10548 {
10549 return (!flag_trapping_math
10550 && flag_unsafe_math_optimizations
10551 && ((aarch64_tune_params.approx_modes->recip_sqrt
10552 & AARCH64_APPROX_MODE (mode))
10553 || flag_mrecip_low_precision_sqrt));
10554 }
10555
10556 /* Function to decide when to use the approximate reciprocal square root
10557 builtin. */
10558
10559 static tree
10560 aarch64_builtin_reciprocal (tree fndecl)
10561 {
10562 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10563
10564 if (!use_rsqrt_p (mode))
10565 return NULL_TREE;
10566 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10567 }
10568
10569 /* Emit instruction sequence to compute either the approximate square root
10570 or its approximate reciprocal, depending on the flag RECP, and return
10571 whether the sequence was emitted or not. */
10572
10573 bool
10574 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10575 {
10576 machine_mode mode = GET_MODE (dst);
10577
10578 if (GET_MODE_INNER (mode) == HFmode)
10579 {
10580 gcc_assert (!recp);
10581 return false;
10582 }
10583
10584 if (!recp)
10585 {
10586 if (!(flag_mlow_precision_sqrt
10587 || (aarch64_tune_params.approx_modes->sqrt
10588 & AARCH64_APPROX_MODE (mode))))
10589 return false;
10590
10591 if (flag_finite_math_only
10592 || flag_trapping_math
10593 || !flag_unsafe_math_optimizations
10594 || optimize_function_for_size_p (cfun))
10595 return false;
10596 }
10597 else
10598 /* Caller assumes we cannot fail. */
10599 gcc_assert (use_rsqrt_p (mode));
10600
10601 machine_mode mmsk = mode_for_int_vector (mode).require ();
10602 rtx xmsk = gen_reg_rtx (mmsk);
10603 if (!recp)
10604 /* When calculating the approximate square root, compare the
10605 argument with 0.0 and create a mask. */
10606 emit_insn (gen_rtx_SET (xmsk,
10607 gen_rtx_NEG (mmsk,
10608 gen_rtx_EQ (mmsk, src,
10609 CONST0_RTX (mode)))));
10610
10611 /* Estimate the approximate reciprocal square root. */
10612 rtx xdst = gen_reg_rtx (mode);
10613 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10614
10615 /* Iterate over the series twice for SF and thrice for DF. */
10616 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10617
10618 /* Optionally iterate over the series once less for faster performance
10619 while sacrificing the accuracy. */
10620 if ((recp && flag_mrecip_low_precision_sqrt)
10621 || (!recp && flag_mlow_precision_sqrt))
10622 iterations--;
10623
10624 /* Iterate over the series to calculate the approximate reciprocal square
10625 root. */
10626 rtx x1 = gen_reg_rtx (mode);
10627 while (iterations--)
10628 {
10629 rtx x2 = gen_reg_rtx (mode);
10630 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10631
10632 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10633
10634 if (iterations > 0)
10635 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10636 }
10637
10638 if (!recp)
10639 {
10640 /* Qualify the approximate reciprocal square root when the argument is
10641 0.0 by squashing the intermediary result to 0.0. */
10642 rtx xtmp = gen_reg_rtx (mmsk);
10643 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10644 gen_rtx_SUBREG (mmsk, xdst, 0)));
10645 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10646
10647 /* Calculate the approximate square root. */
10648 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10649 }
10650
10651 /* Finalize the approximation. */
10652 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10653
10654 return true;
10655 }
10656
10657 /* Emit the instruction sequence to compute the approximation for the division
10658 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10659
10660 bool
10661 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10662 {
10663 machine_mode mode = GET_MODE (quo);
10664
10665 if (GET_MODE_INNER (mode) == HFmode)
10666 return false;
10667
10668 bool use_approx_division_p = (flag_mlow_precision_div
10669 || (aarch64_tune_params.approx_modes->division
10670 & AARCH64_APPROX_MODE (mode)));
10671
10672 if (!flag_finite_math_only
10673 || flag_trapping_math
10674 || !flag_unsafe_math_optimizations
10675 || optimize_function_for_size_p (cfun)
10676 || !use_approx_division_p)
10677 return false;
10678
10679 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10680 return false;
10681
10682 /* Estimate the approximate reciprocal. */
10683 rtx xrcp = gen_reg_rtx (mode);
10684 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10685
10686 /* Iterate over the series twice for SF and thrice for DF. */
10687 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10688
10689 /* Optionally iterate over the series once less for faster performance,
10690 while sacrificing the accuracy. */
10691 if (flag_mlow_precision_div)
10692 iterations--;
10693
10694 /* Iterate over the series to calculate the approximate reciprocal. */
10695 rtx xtmp = gen_reg_rtx (mode);
10696 while (iterations--)
10697 {
10698 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10699
10700 if (iterations > 0)
10701 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10702 }
10703
10704 if (num != CONST1_RTX (mode))
10705 {
10706 /* As the approximate reciprocal of DEN is already calculated, only
10707 calculate the approximate division when NUM is not 1.0. */
10708 rtx xnum = force_reg (mode, num);
10709 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10710 }
10711
10712 /* Finalize the approximation. */
10713 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10714 return true;
10715 }
10716
10717 /* Return the number of instructions that can be issued per cycle. */
10718 static int
10719 aarch64_sched_issue_rate (void)
10720 {
10721 return aarch64_tune_params.issue_rate;
10722 }
10723
10724 static int
10725 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10726 {
10727 int issue_rate = aarch64_sched_issue_rate ();
10728
10729 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10730 }
10731
10732
10733 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10734 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10735 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10736
10737 static int
10738 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10739 int ready_index)
10740 {
10741 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10742 }
10743
10744
10745 /* Vectorizer cost model target hooks. */
10746
10747 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10748 static int
10749 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10750 tree vectype,
10751 int misalign ATTRIBUTE_UNUSED)
10752 {
10753 unsigned elements;
10754 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10755 bool fp = false;
10756
10757 if (vectype != NULL)
10758 fp = FLOAT_TYPE_P (vectype);
10759
10760 switch (type_of_cost)
10761 {
10762 case scalar_stmt:
10763 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10764
10765 case scalar_load:
10766 return costs->scalar_load_cost;
10767
10768 case scalar_store:
10769 return costs->scalar_store_cost;
10770
10771 case vector_stmt:
10772 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10773
10774 case vector_load:
10775 return costs->vec_align_load_cost;
10776
10777 case vector_store:
10778 return costs->vec_store_cost;
10779
10780 case vec_to_scalar:
10781 return costs->vec_to_scalar_cost;
10782
10783 case scalar_to_vec:
10784 return costs->scalar_to_vec_cost;
10785
10786 case unaligned_load:
10787 case vector_gather_load:
10788 return costs->vec_unalign_load_cost;
10789
10790 case unaligned_store:
10791 case vector_scatter_store:
10792 return costs->vec_unalign_store_cost;
10793
10794 case cond_branch_taken:
10795 return costs->cond_taken_branch_cost;
10796
10797 case cond_branch_not_taken:
10798 return costs->cond_not_taken_branch_cost;
10799
10800 case vec_perm:
10801 return costs->vec_permute_cost;
10802
10803 case vec_promote_demote:
10804 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10805
10806 case vec_construct:
10807 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10808 return elements / 2 + 1;
10809
10810 default:
10811 gcc_unreachable ();
10812 }
10813 }
10814
10815 /* Implement targetm.vectorize.add_stmt_cost. */
10816 static unsigned
10817 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10818 struct _stmt_vec_info *stmt_info, int misalign,
10819 enum vect_cost_model_location where)
10820 {
10821 unsigned *cost = (unsigned *) data;
10822 unsigned retval = 0;
10823
10824 if (flag_vect_cost_model)
10825 {
10826 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10827 int stmt_cost =
10828 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10829
10830 /* Statements in an inner loop relative to the loop being
10831 vectorized are weighted more heavily. The value here is
10832 arbitrary and could potentially be improved with analysis. */
10833 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10834 count *= 50; /* FIXME */
10835
10836 retval = (unsigned) (count * stmt_cost);
10837 cost[where] += retval;
10838 }
10839
10840 return retval;
10841 }
10842
10843 static void initialize_aarch64_code_model (struct gcc_options *);
10844
10845 /* Parse the TO_PARSE string and put the architecture struct that it
10846 selects into RES and the architectural features into ISA_FLAGS.
10847 Return an aarch64_parse_opt_result describing the parse result.
10848 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
10849 When the TO_PARSE string contains an invalid extension,
10850 a copy of the string is created and stored to INVALID_EXTENSION. */
10851
10852 static enum aarch64_parse_opt_result
10853 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10854 unsigned long *isa_flags, std::string *invalid_extension)
10855 {
10856 const char *ext;
10857 const struct processor *arch;
10858 size_t len;
10859
10860 ext = strchr (to_parse, '+');
10861
10862 if (ext != NULL)
10863 len = ext - to_parse;
10864 else
10865 len = strlen (to_parse);
10866
10867 if (len == 0)
10868 return AARCH64_PARSE_MISSING_ARG;
10869
10870
10871 /* Loop through the list of supported ARCHes to find a match. */
10872 for (arch = all_architectures; arch->name != NULL; arch++)
10873 {
10874 if (strlen (arch->name) == len
10875 && strncmp (arch->name, to_parse, len) == 0)
10876 {
10877 unsigned long isa_temp = arch->flags;
10878
10879 if (ext != NULL)
10880 {
10881 /* TO_PARSE string contains at least one extension. */
10882 enum aarch64_parse_opt_result ext_res
10883 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
10884
10885 if (ext_res != AARCH64_PARSE_OK)
10886 return ext_res;
10887 }
10888 /* Extension parsing was successful. Confirm the result
10889 arch and ISA flags. */
10890 *res = arch;
10891 *isa_flags = isa_temp;
10892 return AARCH64_PARSE_OK;
10893 }
10894 }
10895
10896 /* ARCH name not found in list. */
10897 return AARCH64_PARSE_INVALID_ARG;
10898 }
10899
10900 /* Parse the TO_PARSE string and put the result tuning in RES and the
10901 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10902 describing the parse result. If there is an error parsing, RES and
10903 ISA_FLAGS are left unchanged.
10904 When the TO_PARSE string contains an invalid extension,
10905 a copy of the string is created and stored to INVALID_EXTENSION. */
10906
10907 static enum aarch64_parse_opt_result
10908 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10909 unsigned long *isa_flags, std::string *invalid_extension)
10910 {
10911 const char *ext;
10912 const struct processor *cpu;
10913 size_t len;
10914
10915 ext = strchr (to_parse, '+');
10916
10917 if (ext != NULL)
10918 len = ext - to_parse;
10919 else
10920 len = strlen (to_parse);
10921
10922 if (len == 0)
10923 return AARCH64_PARSE_MISSING_ARG;
10924
10925
10926 /* Loop through the list of supported CPUs to find a match. */
10927 for (cpu = all_cores; cpu->name != NULL; cpu++)
10928 {
10929 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
10930 {
10931 unsigned long isa_temp = cpu->flags;
10932
10933
10934 if (ext != NULL)
10935 {
10936 /* TO_PARSE string contains at least one extension. */
10937 enum aarch64_parse_opt_result ext_res
10938 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
10939
10940 if (ext_res != AARCH64_PARSE_OK)
10941 return ext_res;
10942 }
10943 /* Extension parsing was successfull. Confirm the result
10944 cpu and ISA flags. */
10945 *res = cpu;
10946 *isa_flags = isa_temp;
10947 return AARCH64_PARSE_OK;
10948 }
10949 }
10950
10951 /* CPU name not found in list. */
10952 return AARCH64_PARSE_INVALID_ARG;
10953 }
10954
10955 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10956 Return an aarch64_parse_opt_result describing the parse result.
10957 If the parsing fails the RES does not change. */
10958
10959 static enum aarch64_parse_opt_result
10960 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10961 {
10962 const struct processor *cpu;
10963
10964 /* Loop through the list of supported CPUs to find a match. */
10965 for (cpu = all_cores; cpu->name != NULL; cpu++)
10966 {
10967 if (strcmp (cpu->name, to_parse) == 0)
10968 {
10969 *res = cpu;
10970 return AARCH64_PARSE_OK;
10971 }
10972 }
10973
10974 /* CPU name not found in list. */
10975 return AARCH64_PARSE_INVALID_ARG;
10976 }
10977
10978 /* Parse TOKEN, which has length LENGTH to see if it is an option
10979 described in FLAG. If it is, return the index bit for that fusion type.
10980 If not, error (printing OPTION_NAME) and return zero. */
10981
10982 static unsigned int
10983 aarch64_parse_one_option_token (const char *token,
10984 size_t length,
10985 const struct aarch64_flag_desc *flag,
10986 const char *option_name)
10987 {
10988 for (; flag->name != NULL; flag++)
10989 {
10990 if (length == strlen (flag->name)
10991 && !strncmp (flag->name, token, length))
10992 return flag->flag;
10993 }
10994
10995 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10996 return 0;
10997 }
10998
10999 /* Parse OPTION which is a comma-separated list of flags to enable.
11000 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11001 default state we inherit from the CPU tuning structures. OPTION_NAME
11002 gives the top-level option we are parsing in the -moverride string,
11003 for use in error messages. */
11004
11005 static unsigned int
11006 aarch64_parse_boolean_options (const char *option,
11007 const struct aarch64_flag_desc *flags,
11008 unsigned int initial_state,
11009 const char *option_name)
11010 {
11011 const char separator = '.';
11012 const char* specs = option;
11013 const char* ntoken = option;
11014 unsigned int found_flags = initial_state;
11015
11016 while ((ntoken = strchr (specs, separator)))
11017 {
11018 size_t token_length = ntoken - specs;
11019 unsigned token_ops = aarch64_parse_one_option_token (specs,
11020 token_length,
11021 flags,
11022 option_name);
11023 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11024 in the token stream, reset the supported operations. So:
11025
11026 adrp+add.cmp+branch.none.adrp+add
11027
11028 would have the result of turning on only adrp+add fusion. */
11029 if (!token_ops)
11030 found_flags = 0;
11031
11032 found_flags |= token_ops;
11033 specs = ++ntoken;
11034 }
11035
11036 /* We ended with a comma, print something. */
11037 if (!(*specs))
11038 {
11039 error ("%s string ill-formed\n", option_name);
11040 return 0;
11041 }
11042
11043 /* We still have one more token to parse. */
11044 size_t token_length = strlen (specs);
11045 unsigned token_ops = aarch64_parse_one_option_token (specs,
11046 token_length,
11047 flags,
11048 option_name);
11049 if (!token_ops)
11050 found_flags = 0;
11051
11052 found_flags |= token_ops;
11053 return found_flags;
11054 }
11055
11056 /* Support for overriding instruction fusion. */
11057
11058 static void
11059 aarch64_parse_fuse_string (const char *fuse_string,
11060 struct tune_params *tune)
11061 {
11062 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11063 aarch64_fusible_pairs,
11064 tune->fusible_ops,
11065 "fuse=");
11066 }
11067
11068 /* Support for overriding other tuning flags. */
11069
11070 static void
11071 aarch64_parse_tune_string (const char *tune_string,
11072 struct tune_params *tune)
11073 {
11074 tune->extra_tuning_flags
11075 = aarch64_parse_boolean_options (tune_string,
11076 aarch64_tuning_flags,
11077 tune->extra_tuning_flags,
11078 "tune=");
11079 }
11080
11081 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11082 Accept the valid SVE vector widths allowed by
11083 aarch64_sve_vector_bits_enum and use it to override sve_width
11084 in TUNE. */
11085
11086 static void
11087 aarch64_parse_sve_width_string (const char *tune_string,
11088 struct tune_params *tune)
11089 {
11090 int width = -1;
11091
11092 int n = sscanf (tune_string, "%d", &width);
11093 if (n == EOF)
11094 {
11095 error ("invalid format for sve_width");
11096 return;
11097 }
11098 switch (width)
11099 {
11100 case SVE_128:
11101 case SVE_256:
11102 case SVE_512:
11103 case SVE_1024:
11104 case SVE_2048:
11105 break;
11106 default:
11107 error ("invalid sve_width value: %d", width);
11108 }
11109 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11110 }
11111
11112 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
11113 we understand. If it is, extract the option string and handoff to
11114 the appropriate function. */
11115
11116 void
11117 aarch64_parse_one_override_token (const char* token,
11118 size_t length,
11119 struct tune_params *tune)
11120 {
11121 const struct aarch64_tuning_override_function *fn
11122 = aarch64_tuning_override_functions;
11123
11124 const char *option_part = strchr (token, '=');
11125 if (!option_part)
11126 {
11127 error ("tuning string missing in option (%s)", token);
11128 return;
11129 }
11130
11131 /* Get the length of the option name. */
11132 length = option_part - token;
11133 /* Skip the '=' to get to the option string. */
11134 option_part++;
11135
11136 for (; fn->name != NULL; fn++)
11137 {
11138 if (!strncmp (fn->name, token, length))
11139 {
11140 fn->parse_override (option_part, tune);
11141 return;
11142 }
11143 }
11144
11145 error ("unknown tuning option (%s)",token);
11146 return;
11147 }
11148
11149 /* A checking mechanism for the implementation of the tls size. */
11150
11151 static void
11152 initialize_aarch64_tls_size (struct gcc_options *opts)
11153 {
11154 if (aarch64_tls_size == 0)
11155 aarch64_tls_size = 24;
11156
11157 switch (opts->x_aarch64_cmodel_var)
11158 {
11159 case AARCH64_CMODEL_TINY:
11160 /* Both the default and maximum TLS size allowed under tiny is 1M which
11161 needs two instructions to address, so we clamp the size to 24. */
11162 if (aarch64_tls_size > 24)
11163 aarch64_tls_size = 24;
11164 break;
11165 case AARCH64_CMODEL_SMALL:
11166 /* The maximum TLS size allowed under small is 4G. */
11167 if (aarch64_tls_size > 32)
11168 aarch64_tls_size = 32;
11169 break;
11170 case AARCH64_CMODEL_LARGE:
11171 /* The maximum TLS size allowed under large is 16E.
11172 FIXME: 16E should be 64bit, we only support 48bit offset now. */
11173 if (aarch64_tls_size > 48)
11174 aarch64_tls_size = 48;
11175 break;
11176 default:
11177 gcc_unreachable ();
11178 }
11179
11180 return;
11181 }
11182
11183 /* Parse STRING looking for options in the format:
11184 string :: option:string
11185 option :: name=substring
11186 name :: {a-z}
11187 substring :: defined by option. */
11188
11189 static void
11190 aarch64_parse_override_string (const char* input_string,
11191 struct tune_params* tune)
11192 {
11193 const char separator = ':';
11194 size_t string_length = strlen (input_string) + 1;
11195 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11196 char *string = string_root;
11197 strncpy (string, input_string, string_length);
11198 string[string_length - 1] = '\0';
11199
11200 char* ntoken = string;
11201
11202 while ((ntoken = strchr (string, separator)))
11203 {
11204 size_t token_length = ntoken - string;
11205 /* Make this substring look like a string. */
11206 *ntoken = '\0';
11207 aarch64_parse_one_override_token (string, token_length, tune);
11208 string = ++ntoken;
11209 }
11210
11211 /* One last option to parse. */
11212 aarch64_parse_one_override_token (string, strlen (string), tune);
11213 free (string_root);
11214 }
11215
11216
11217 static void
11218 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11219 {
11220 if (accepted_branch_protection_string)
11221 {
11222 opts->x_aarch64_branch_protection_string
11223 = xstrdup (accepted_branch_protection_string);
11224 }
11225
11226 /* PR 70044: We have to be careful about being called multiple times for the
11227 same function. This means all changes should be repeatable. */
11228
11229 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11230 Disable the frame pointer flag so the mid-end will not use a frame
11231 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11232 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11233 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11234 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11235 if (opts->x_flag_omit_frame_pointer == 0)
11236 opts->x_flag_omit_frame_pointer = 2;
11237
11238 /* If not optimizing for size, set the default
11239 alignment to what the target wants. */
11240 if (!opts->x_optimize_size)
11241 {
11242 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11243 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11244 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11245 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11246 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11247 opts->x_str_align_functions = aarch64_tune_params.function_align;
11248 }
11249
11250 /* We default to no pc-relative literal loads. */
11251
11252 aarch64_pcrelative_literal_loads = false;
11253
11254 /* If -mpc-relative-literal-loads is set on the command line, this
11255 implies that the user asked for PC relative literal loads. */
11256 if (opts->x_pcrelative_literal_loads == 1)
11257 aarch64_pcrelative_literal_loads = true;
11258
11259 /* In the tiny memory model it makes no sense to disallow PC relative
11260 literal pool loads. */
11261 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11262 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11263 aarch64_pcrelative_literal_loads = true;
11264
11265 /* When enabling the lower precision Newton series for the square root, also
11266 enable it for the reciprocal square root, since the latter is an
11267 intermediary step for the former. */
11268 if (flag_mlow_precision_sqrt)
11269 flag_mrecip_low_precision_sqrt = true;
11270 }
11271
11272 /* 'Unpack' up the internal tuning structs and update the options
11273 in OPTS. The caller must have set up selected_tune and selected_arch
11274 as all the other target-specific codegen decisions are
11275 derived from them. */
11276
11277 void
11278 aarch64_override_options_internal (struct gcc_options *opts)
11279 {
11280 aarch64_tune_flags = selected_tune->flags;
11281 aarch64_tune = selected_tune->sched_core;
11282 /* Make a copy of the tuning parameters attached to the core, which
11283 we may later overwrite. */
11284 aarch64_tune_params = *(selected_tune->tune);
11285 aarch64_architecture_version = selected_arch->architecture_version;
11286
11287 if (opts->x_aarch64_override_tune_string)
11288 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11289 &aarch64_tune_params);
11290
11291 /* This target defaults to strict volatile bitfields. */
11292 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11293 opts->x_flag_strict_volatile_bitfields = 1;
11294
11295 initialize_aarch64_code_model (opts);
11296 initialize_aarch64_tls_size (opts);
11297
11298 int queue_depth = 0;
11299 switch (aarch64_tune_params.autoprefetcher_model)
11300 {
11301 case tune_params::AUTOPREFETCHER_OFF:
11302 queue_depth = -1;
11303 break;
11304 case tune_params::AUTOPREFETCHER_WEAK:
11305 queue_depth = 0;
11306 break;
11307 case tune_params::AUTOPREFETCHER_STRONG:
11308 queue_depth = max_insn_queue_index + 1;
11309 break;
11310 default:
11311 gcc_unreachable ();
11312 }
11313
11314 /* We don't mind passing in global_options_set here as we don't use
11315 the *options_set structs anyway. */
11316 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11317 queue_depth,
11318 opts->x_param_values,
11319 global_options_set.x_param_values);
11320
11321 /* Set up parameters to be used in prefetching algorithm. Do not
11322 override the defaults unless we are tuning for a core we have
11323 researched values for. */
11324 if (aarch64_tune_params.prefetch->num_slots > 0)
11325 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11326 aarch64_tune_params.prefetch->num_slots,
11327 opts->x_param_values,
11328 global_options_set.x_param_values);
11329 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11330 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11331 aarch64_tune_params.prefetch->l1_cache_size,
11332 opts->x_param_values,
11333 global_options_set.x_param_values);
11334 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11335 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11336 aarch64_tune_params.prefetch->l1_cache_line_size,
11337 opts->x_param_values,
11338 global_options_set.x_param_values);
11339 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11340 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11341 aarch64_tune_params.prefetch->l2_cache_size,
11342 opts->x_param_values,
11343 global_options_set.x_param_values);
11344 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11345 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11346 0,
11347 opts->x_param_values,
11348 global_options_set.x_param_values);
11349 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11350 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11351 aarch64_tune_params.prefetch->minimum_stride,
11352 opts->x_param_values,
11353 global_options_set.x_param_values);
11354
11355 /* Use the alternative scheduling-pressure algorithm by default. */
11356 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11357 opts->x_param_values,
11358 global_options_set.x_param_values);
11359
11360 /* If the user hasn't changed it via configure then set the default to 64 KB
11361 for the backend. */
11362 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11363 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11364 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11365 opts->x_param_values,
11366 global_options_set.x_param_values);
11367
11368 /* Validate the guard size. */
11369 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11370
11371 /* Enforce that interval is the same size as size so the mid-end does the
11372 right thing. */
11373 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11374 guard_size,
11375 opts->x_param_values,
11376 global_options_set.x_param_values);
11377
11378 /* The maybe_set calls won't update the value if the user has explicitly set
11379 one. Which means we need to validate that probing interval and guard size
11380 are equal. */
11381 int probe_interval
11382 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11383 if (guard_size != probe_interval)
11384 error ("stack clash guard size '%d' must be equal to probing interval "
11385 "'%d'", guard_size, probe_interval);
11386
11387 /* Enable sw prefetching at specified optimization level for
11388 CPUS that have prefetch. Lower optimization level threshold by 1
11389 when profiling is enabled. */
11390 if (opts->x_flag_prefetch_loop_arrays < 0
11391 && !opts->x_optimize_size
11392 && aarch64_tune_params.prefetch->default_opt_level >= 0
11393 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11394 opts->x_flag_prefetch_loop_arrays = 1;
11395
11396 if (opts->x_aarch64_arch_string == NULL)
11397 opts->x_aarch64_arch_string = selected_arch->name;
11398 if (opts->x_aarch64_cpu_string == NULL)
11399 opts->x_aarch64_cpu_string = selected_cpu->name;
11400 if (opts->x_aarch64_tune_string == NULL)
11401 opts->x_aarch64_tune_string = selected_tune->name;
11402
11403 aarch64_override_options_after_change_1 (opts);
11404 }
11405
11406 /* Print a hint with a suggestion for a core or architecture name that
11407 most closely resembles what the user passed in STR. ARCH is true if
11408 the user is asking for an architecture name. ARCH is false if the user
11409 is asking for a core name. */
11410
11411 static void
11412 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11413 {
11414 auto_vec<const char *> candidates;
11415 const struct processor *entry = arch ? all_architectures : all_cores;
11416 for (; entry->name != NULL; entry++)
11417 candidates.safe_push (entry->name);
11418
11419 #ifdef HAVE_LOCAL_CPU_DETECT
11420 /* Add also "native" as possible value. */
11421 if (arch)
11422 candidates.safe_push ("native");
11423 #endif
11424
11425 char *s;
11426 const char *hint = candidates_list_and_hint (str, s, candidates);
11427 if (hint)
11428 inform (input_location, "valid arguments are: %s;"
11429 " did you mean %qs?", s, hint);
11430 else
11431 inform (input_location, "valid arguments are: %s", s);
11432
11433 XDELETEVEC (s);
11434 }
11435
11436 /* Print a hint with a suggestion for a core name that most closely resembles
11437 what the user passed in STR. */
11438
11439 inline static void
11440 aarch64_print_hint_for_core (const char *str)
11441 {
11442 aarch64_print_hint_for_core_or_arch (str, false);
11443 }
11444
11445 /* Print a hint with a suggestion for an architecture name that most closely
11446 resembles what the user passed in STR. */
11447
11448 inline static void
11449 aarch64_print_hint_for_arch (const char *str)
11450 {
11451 aarch64_print_hint_for_core_or_arch (str, true);
11452 }
11453
11454
11455 /* Print a hint with a suggestion for an extension name
11456 that most closely resembles what the user passed in STR. */
11457
11458 void
11459 aarch64_print_hint_for_extensions (const std::string &str)
11460 {
11461 auto_vec<const char *> candidates;
11462 aarch64_get_all_extension_candidates (&candidates);
11463 char *s;
11464 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11465 if (hint)
11466 inform (input_location, "valid arguments are: %s;"
11467 " did you mean %qs?", s, hint);
11468 else
11469 inform (input_location, "valid arguments are: %s;", s);
11470
11471 XDELETEVEC (s);
11472 }
11473
11474 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11475 specified in STR and throw errors if appropriate. Put the results if
11476 they are valid in RES and ISA_FLAGS. Return whether the option is
11477 valid. */
11478
11479 static bool
11480 aarch64_validate_mcpu (const char *str, const struct processor **res,
11481 unsigned long *isa_flags)
11482 {
11483 std::string invalid_extension;
11484 enum aarch64_parse_opt_result parse_res
11485 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11486
11487 if (parse_res == AARCH64_PARSE_OK)
11488 return true;
11489
11490 switch (parse_res)
11491 {
11492 case AARCH64_PARSE_MISSING_ARG:
11493 error ("missing cpu name in %<-mcpu=%s%>", str);
11494 break;
11495 case AARCH64_PARSE_INVALID_ARG:
11496 error ("unknown value %qs for -mcpu", str);
11497 aarch64_print_hint_for_core (str);
11498 break;
11499 case AARCH64_PARSE_INVALID_FEATURE:
11500 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11501 invalid_extension.c_str (), str);
11502 aarch64_print_hint_for_extensions (invalid_extension);
11503 break;
11504 default:
11505 gcc_unreachable ();
11506 }
11507
11508 return false;
11509 }
11510
11511 /* Parses CONST_STR for branch protection features specified in
11512 aarch64_branch_protect_types, and set any global variables required. Returns
11513 the parsing result and assigns LAST_STR to the last processed token from
11514 CONST_STR so that it can be used for error reporting. */
11515
11516 static enum
11517 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
11518 char** last_str)
11519 {
11520 char *str_root = xstrdup (const_str);
11521 char* token_save = NULL;
11522 char *str = strtok_r (str_root, "+", &token_save);
11523 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11524 if (!str)
11525 res = AARCH64_PARSE_MISSING_ARG;
11526 else
11527 {
11528 char *next_str = strtok_r (NULL, "+", &token_save);
11529 /* Reset the branch protection features to their defaults. */
11530 aarch64_handle_no_branch_protection (NULL, NULL);
11531
11532 while (str && res == AARCH64_PARSE_OK)
11533 {
11534 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11535 bool found = false;
11536 /* Search for this type. */
11537 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11538 {
11539 if (strcmp (str, type->name) == 0)
11540 {
11541 found = true;
11542 res = type->handler (str, next_str);
11543 str = next_str;
11544 next_str = strtok_r (NULL, "+", &token_save);
11545 }
11546 else
11547 type++;
11548 }
11549 if (found && res == AARCH64_PARSE_OK)
11550 {
11551 bool found_subtype = true;
11552 /* Loop through each token until we find one that isn't a
11553 subtype. */
11554 while (found_subtype)
11555 {
11556 found_subtype = false;
11557 const aarch64_branch_protect_type *subtype = type->subtypes;
11558 /* Search for the subtype. */
11559 while (str && subtype && subtype->name && !found_subtype
11560 && res == AARCH64_PARSE_OK)
11561 {
11562 if (strcmp (str, subtype->name) == 0)
11563 {
11564 found_subtype = true;
11565 res = subtype->handler (str, next_str);
11566 str = next_str;
11567 next_str = strtok_r (NULL, "+", &token_save);
11568 }
11569 else
11570 subtype++;
11571 }
11572 }
11573 }
11574 else if (!found)
11575 res = AARCH64_PARSE_INVALID_ARG;
11576 }
11577 }
11578 /* Copy the last processed token into the argument to pass it back.
11579 Used by option and attribute validation to print the offending token. */
11580 if (last_str)
11581 {
11582 if (str) strcpy (*last_str, str);
11583 else *last_str = NULL;
11584 }
11585 if (res == AARCH64_PARSE_OK)
11586 {
11587 /* If needed, alloc the accepted string then copy in const_str.
11588 Used by override_option_after_change_1. */
11589 if (!accepted_branch_protection_string)
11590 accepted_branch_protection_string = (char *) xmalloc (
11591 BRANCH_PROTECT_STR_MAX
11592 + 1);
11593 strncpy (accepted_branch_protection_string, const_str,
11594 BRANCH_PROTECT_STR_MAX + 1);
11595 /* Forcibly null-terminate. */
11596 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11597 }
11598 return res;
11599 }
11600
11601 static bool
11602 aarch64_validate_mbranch_protection (const char *const_str)
11603 {
11604 char *str = (char *) xmalloc (strlen (const_str));
11605 enum aarch64_parse_opt_result res =
11606 aarch64_parse_branch_protection (const_str, &str);
11607 if (res == AARCH64_PARSE_INVALID_ARG)
11608 error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str);
11609 else if (res == AARCH64_PARSE_MISSING_ARG)
11610 error ("missing arg for %<-mbranch-protection=%>");
11611 free (str);
11612 return res == AARCH64_PARSE_OK;
11613 }
11614
11615 /* Validate a command-line -march option. Parse the arch and extensions
11616 (if any) specified in STR and throw errors if appropriate. Put the
11617 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11618 option is valid. */
11619
11620 static bool
11621 aarch64_validate_march (const char *str, const struct processor **res,
11622 unsigned long *isa_flags)
11623 {
11624 std::string invalid_extension;
11625 enum aarch64_parse_opt_result parse_res
11626 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11627
11628 if (parse_res == AARCH64_PARSE_OK)
11629 return true;
11630
11631 switch (parse_res)
11632 {
11633 case AARCH64_PARSE_MISSING_ARG:
11634 error ("missing arch name in %<-march=%s%>", str);
11635 break;
11636 case AARCH64_PARSE_INVALID_ARG:
11637 error ("unknown value %qs for -march", str);
11638 aarch64_print_hint_for_arch (str);
11639 break;
11640 case AARCH64_PARSE_INVALID_FEATURE:
11641 error ("invalid feature modifier %qs in %<-march=%s%>",
11642 invalid_extension.c_str (), str);
11643 aarch64_print_hint_for_extensions (invalid_extension);
11644 break;
11645 default:
11646 gcc_unreachable ();
11647 }
11648
11649 return false;
11650 }
11651
11652 /* Validate a command-line -mtune option. Parse the cpu
11653 specified in STR and throw errors if appropriate. Put the
11654 result, if it is valid, in RES. Return whether the option is
11655 valid. */
11656
11657 static bool
11658 aarch64_validate_mtune (const char *str, const struct processor **res)
11659 {
11660 enum aarch64_parse_opt_result parse_res
11661 = aarch64_parse_tune (str, res);
11662
11663 if (parse_res == AARCH64_PARSE_OK)
11664 return true;
11665
11666 switch (parse_res)
11667 {
11668 case AARCH64_PARSE_MISSING_ARG:
11669 error ("missing cpu name in %<-mtune=%s%>", str);
11670 break;
11671 case AARCH64_PARSE_INVALID_ARG:
11672 error ("unknown value %qs for -mtune", str);
11673 aarch64_print_hint_for_core (str);
11674 break;
11675 default:
11676 gcc_unreachable ();
11677 }
11678 return false;
11679 }
11680
11681 /* Return the CPU corresponding to the enum CPU.
11682 If it doesn't specify a cpu, return the default. */
11683
11684 static const struct processor *
11685 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11686 {
11687 if (cpu != aarch64_none)
11688 return &all_cores[cpu];
11689
11690 /* The & 0x3f is to extract the bottom 6 bits that encode the
11691 default cpu as selected by the --with-cpu GCC configure option
11692 in config.gcc.
11693 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11694 flags mechanism should be reworked to make it more sane. */
11695 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11696 }
11697
11698 /* Return the architecture corresponding to the enum ARCH.
11699 If it doesn't specify a valid architecture, return the default. */
11700
11701 static const struct processor *
11702 aarch64_get_arch (enum aarch64_arch arch)
11703 {
11704 if (arch != aarch64_no_arch)
11705 return &all_architectures[arch];
11706
11707 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11708
11709 return &all_architectures[cpu->arch];
11710 }
11711
11712 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
11713
11714 static poly_uint16
11715 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
11716 {
11717 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11718 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11719 deciding which .md file patterns to use and when deciding whether
11720 something is a legitimate address or constant. */
11721 if (value == SVE_SCALABLE || value == SVE_128)
11722 return poly_uint16 (2, 2);
11723 else
11724 return (int) value / 64;
11725 }
11726
11727 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
11728 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11729 tuning structs. In particular it must set selected_tune and
11730 aarch64_isa_flags that define the available ISA features and tuning
11731 decisions. It must also set selected_arch as this will be used to
11732 output the .arch asm tags for each function. */
11733
11734 static void
11735 aarch64_override_options (void)
11736 {
11737 unsigned long cpu_isa = 0;
11738 unsigned long arch_isa = 0;
11739 aarch64_isa_flags = 0;
11740
11741 bool valid_cpu = true;
11742 bool valid_tune = true;
11743 bool valid_arch = true;
11744
11745 selected_cpu = NULL;
11746 selected_arch = NULL;
11747 selected_tune = NULL;
11748
11749 if (aarch64_branch_protection_string)
11750 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
11751
11752 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
11753 If either of -march or -mtune is given, they override their
11754 respective component of -mcpu. */
11755 if (aarch64_cpu_string)
11756 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
11757 &cpu_isa);
11758
11759 if (aarch64_arch_string)
11760 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
11761 &arch_isa);
11762
11763 if (aarch64_tune_string)
11764 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
11765
11766 #ifdef SUBTARGET_OVERRIDE_OPTIONS
11767 SUBTARGET_OVERRIDE_OPTIONS;
11768 #endif
11769
11770 /* If the user did not specify a processor, choose the default
11771 one for them. This will be the CPU set during configuration using
11772 --with-cpu, otherwise it is "generic". */
11773 if (!selected_cpu)
11774 {
11775 if (selected_arch)
11776 {
11777 selected_cpu = &all_cores[selected_arch->ident];
11778 aarch64_isa_flags = arch_isa;
11779 explicit_arch = selected_arch->arch;
11780 }
11781 else
11782 {
11783 /* Get default configure-time CPU. */
11784 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
11785 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
11786 }
11787
11788 if (selected_tune)
11789 explicit_tune_core = selected_tune->ident;
11790 }
11791 /* If both -mcpu and -march are specified check that they are architecturally
11792 compatible, warn if they're not and prefer the -march ISA flags. */
11793 else if (selected_arch)
11794 {
11795 if (selected_arch->arch != selected_cpu->arch)
11796 {
11797 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
11798 all_architectures[selected_cpu->arch].name,
11799 selected_arch->name);
11800 }
11801 aarch64_isa_flags = arch_isa;
11802 explicit_arch = selected_arch->arch;
11803 explicit_tune_core = selected_tune ? selected_tune->ident
11804 : selected_cpu->ident;
11805 }
11806 else
11807 {
11808 /* -mcpu but no -march. */
11809 aarch64_isa_flags = cpu_isa;
11810 explicit_tune_core = selected_tune ? selected_tune->ident
11811 : selected_cpu->ident;
11812 gcc_assert (selected_cpu);
11813 selected_arch = &all_architectures[selected_cpu->arch];
11814 explicit_arch = selected_arch->arch;
11815 }
11816
11817 /* Set the arch as well as we will need it when outputing
11818 the .arch directive in assembly. */
11819 if (!selected_arch)
11820 {
11821 gcc_assert (selected_cpu);
11822 selected_arch = &all_architectures[selected_cpu->arch];
11823 }
11824
11825 if (!selected_tune)
11826 selected_tune = selected_cpu;
11827
11828 if (aarch64_enable_bti == 2)
11829 {
11830 #ifdef TARGET_ENABLE_BTI
11831 aarch64_enable_bti = 1;
11832 #else
11833 aarch64_enable_bti = 0;
11834 #endif
11835 }
11836
11837 /* Return address signing is currently not supported for ILP32 targets. For
11838 LP64 targets use the configured option in the absence of a command-line
11839 option for -mbranch-protection. */
11840 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
11841 {
11842 #ifdef TARGET_ENABLE_PAC_RET
11843 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
11844 aarch64_ra_sign_key = AARCH64_KEY_A;
11845 #else
11846 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
11847 #endif
11848 }
11849
11850 #ifndef HAVE_AS_MABI_OPTION
11851 /* The compiler may have been configured with 2.23.* binutils, which does
11852 not have support for ILP32. */
11853 if (TARGET_ILP32)
11854 error ("assembler does not support -mabi=ilp32");
11855 #endif
11856
11857 /* Convert -msve-vector-bits to a VG count. */
11858 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
11859
11860 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
11861 sorry ("return address signing is only supported for -mabi=lp64");
11862
11863 /* Make sure we properly set up the explicit options. */
11864 if ((aarch64_cpu_string && valid_cpu)
11865 || (aarch64_tune_string && valid_tune))
11866 gcc_assert (explicit_tune_core != aarch64_none);
11867
11868 if ((aarch64_cpu_string && valid_cpu)
11869 || (aarch64_arch_string && valid_arch))
11870 gcc_assert (explicit_arch != aarch64_no_arch);
11871
11872 /* The pass to insert speculation tracking runs before
11873 shrink-wrapping and the latter does not know how to update the
11874 tracking status. So disable it in this case. */
11875 if (aarch64_track_speculation)
11876 flag_shrink_wrap = 0;
11877
11878 aarch64_override_options_internal (&global_options);
11879
11880 /* Save these options as the default ones in case we push and pop them later
11881 while processing functions with potential target attributes. */
11882 target_option_default_node = target_option_current_node
11883 = build_target_option_node (&global_options);
11884 }
11885
11886 /* Implement targetm.override_options_after_change. */
11887
11888 static void
11889 aarch64_override_options_after_change (void)
11890 {
11891 aarch64_override_options_after_change_1 (&global_options);
11892 }
11893
11894 static struct machine_function *
11895 aarch64_init_machine_status (void)
11896 {
11897 struct machine_function *machine;
11898 machine = ggc_cleared_alloc<machine_function> ();
11899 return machine;
11900 }
11901
11902 void
11903 aarch64_init_expanders (void)
11904 {
11905 init_machine_status = aarch64_init_machine_status;
11906 }
11907
11908 /* A checking mechanism for the implementation of the various code models. */
11909 static void
11910 initialize_aarch64_code_model (struct gcc_options *opts)
11911 {
11912 if (opts->x_flag_pic)
11913 {
11914 switch (opts->x_aarch64_cmodel_var)
11915 {
11916 case AARCH64_CMODEL_TINY:
11917 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11918 break;
11919 case AARCH64_CMODEL_SMALL:
11920 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11921 aarch64_cmodel = (flag_pic == 2
11922 ? AARCH64_CMODEL_SMALL_PIC
11923 : AARCH64_CMODEL_SMALL_SPIC);
11924 #else
11925 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11926 #endif
11927 break;
11928 case AARCH64_CMODEL_LARGE:
11929 sorry ("code model %qs with -f%s", "large",
11930 opts->x_flag_pic > 1 ? "PIC" : "pic");
11931 break;
11932 default:
11933 gcc_unreachable ();
11934 }
11935 }
11936 else
11937 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11938 }
11939
11940 /* Implement TARGET_OPTION_SAVE. */
11941
11942 static void
11943 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11944 {
11945 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11946 ptr->x_aarch64_branch_protection_string
11947 = opts->x_aarch64_branch_protection_string;
11948 }
11949
11950 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11951 using the information saved in PTR. */
11952
11953 static void
11954 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11955 {
11956 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11957 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11958 opts->x_explicit_arch = ptr->x_explicit_arch;
11959 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11960 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11961 opts->x_aarch64_branch_protection_string
11962 = ptr->x_aarch64_branch_protection_string;
11963 if (opts->x_aarch64_branch_protection_string)
11964 {
11965 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
11966 NULL);
11967 }
11968
11969 aarch64_override_options_internal (opts);
11970 }
11971
11972 /* Implement TARGET_OPTION_PRINT. */
11973
11974 static void
11975 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11976 {
11977 const struct processor *cpu
11978 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11979 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11980 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11981 std::string extension
11982 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11983
11984 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11985 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11986 arch->name, extension.c_str ());
11987 }
11988
11989 static GTY(()) tree aarch64_previous_fndecl;
11990
11991 void
11992 aarch64_reset_previous_fndecl (void)
11993 {
11994 aarch64_previous_fndecl = NULL;
11995 }
11996
11997 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11998 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11999 make sure optab availability predicates are recomputed when necessary. */
12000
12001 void
12002 aarch64_save_restore_target_globals (tree new_tree)
12003 {
12004 if (TREE_TARGET_GLOBALS (new_tree))
12005 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12006 else if (new_tree == target_option_default_node)
12007 restore_target_globals (&default_target_globals);
12008 else
12009 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12010 }
12011
12012 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12013 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12014 of the function, if such exists. This function may be called multiple
12015 times on a single function so use aarch64_previous_fndecl to avoid
12016 setting up identical state. */
12017
12018 static void
12019 aarch64_set_current_function (tree fndecl)
12020 {
12021 if (!fndecl || fndecl == aarch64_previous_fndecl)
12022 return;
12023
12024 tree old_tree = (aarch64_previous_fndecl
12025 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12026 : NULL_TREE);
12027
12028 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12029
12030 /* If current function has no attributes but the previous one did,
12031 use the default node. */
12032 if (!new_tree && old_tree)
12033 new_tree = target_option_default_node;
12034
12035 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12036 the default have been handled by aarch64_save_restore_target_globals from
12037 aarch64_pragma_target_parse. */
12038 if (old_tree == new_tree)
12039 return;
12040
12041 aarch64_previous_fndecl = fndecl;
12042
12043 /* First set the target options. */
12044 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12045
12046 aarch64_save_restore_target_globals (new_tree);
12047 }
12048
12049 /* Enum describing the various ways we can handle attributes.
12050 In many cases we can reuse the generic option handling machinery. */
12051
12052 enum aarch64_attr_opt_type
12053 {
12054 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12055 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12056 aarch64_attr_enum, /* Attribute sets an enum variable. */
12057 aarch64_attr_custom /* Attribute requires a custom handling function. */
12058 };
12059
12060 /* All the information needed to handle a target attribute.
12061 NAME is the name of the attribute.
12062 ATTR_TYPE specifies the type of behavior of the attribute as described
12063 in the definition of enum aarch64_attr_opt_type.
12064 ALLOW_NEG is true if the attribute supports a "no-" form.
12065 HANDLER is the function that takes the attribute string as an argument
12066 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12067 OPT_NUM is the enum specifying the option that the attribute modifies.
12068 This is needed for attributes that mirror the behavior of a command-line
12069 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12070 aarch64_attr_enum. */
12071
12072 struct aarch64_attribute_info
12073 {
12074 const char *name;
12075 enum aarch64_attr_opt_type attr_type;
12076 bool allow_neg;
12077 bool (*handler) (const char *);
12078 enum opt_code opt_num;
12079 };
12080
12081 /* Handle the ARCH_STR argument to the arch= target attribute. */
12082
12083 static bool
12084 aarch64_handle_attr_arch (const char *str)
12085 {
12086 const struct processor *tmp_arch = NULL;
12087 std::string invalid_extension;
12088 enum aarch64_parse_opt_result parse_res
12089 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12090
12091 if (parse_res == AARCH64_PARSE_OK)
12092 {
12093 gcc_assert (tmp_arch);
12094 selected_arch = tmp_arch;
12095 explicit_arch = selected_arch->arch;
12096 return true;
12097 }
12098
12099 switch (parse_res)
12100 {
12101 case AARCH64_PARSE_MISSING_ARG:
12102 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12103 break;
12104 case AARCH64_PARSE_INVALID_ARG:
12105 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12106 aarch64_print_hint_for_arch (str);
12107 break;
12108 case AARCH64_PARSE_INVALID_FEATURE:
12109 error ("invalid feature modifier %s of value (\"%s\") in "
12110 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12111 aarch64_print_hint_for_extensions (invalid_extension);
12112 break;
12113 default:
12114 gcc_unreachable ();
12115 }
12116
12117 return false;
12118 }
12119
12120 /* Handle the argument CPU_STR to the cpu= target attribute. */
12121
12122 static bool
12123 aarch64_handle_attr_cpu (const char *str)
12124 {
12125 const struct processor *tmp_cpu = NULL;
12126 std::string invalid_extension;
12127 enum aarch64_parse_opt_result parse_res
12128 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12129
12130 if (parse_res == AARCH64_PARSE_OK)
12131 {
12132 gcc_assert (tmp_cpu);
12133 selected_tune = tmp_cpu;
12134 explicit_tune_core = selected_tune->ident;
12135
12136 selected_arch = &all_architectures[tmp_cpu->arch];
12137 explicit_arch = selected_arch->arch;
12138 return true;
12139 }
12140
12141 switch (parse_res)
12142 {
12143 case AARCH64_PARSE_MISSING_ARG:
12144 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12145 break;
12146 case AARCH64_PARSE_INVALID_ARG:
12147 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12148 aarch64_print_hint_for_core (str);
12149 break;
12150 case AARCH64_PARSE_INVALID_FEATURE:
12151 error ("invalid feature modifier %s of value (\"%s\") in "
12152 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12153 aarch64_print_hint_for_extensions (invalid_extension);
12154 break;
12155 default:
12156 gcc_unreachable ();
12157 }
12158
12159 return false;
12160 }
12161
12162 /* Handle the argument STR to the branch-protection= attribute. */
12163
12164 static bool
12165 aarch64_handle_attr_branch_protection (const char* str)
12166 {
12167 char *err_str = (char *) xmalloc (strlen (str));
12168 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12169 &err_str);
12170 bool success = false;
12171 switch (res)
12172 {
12173 case AARCH64_PARSE_MISSING_ARG:
12174 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12175 " attribute");
12176 break;
12177 case AARCH64_PARSE_INVALID_ARG:
12178 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12179 "=\")%> pragma or attribute", err_str);
12180 break;
12181 case AARCH64_PARSE_OK:
12182 success = true;
12183 /* Fall through. */
12184 case AARCH64_PARSE_INVALID_FEATURE:
12185 break;
12186 default:
12187 gcc_unreachable ();
12188 }
12189 free (err_str);
12190 return success;
12191 }
12192
12193 /* Handle the argument STR to the tune= target attribute. */
12194
12195 static bool
12196 aarch64_handle_attr_tune (const char *str)
12197 {
12198 const struct processor *tmp_tune = NULL;
12199 enum aarch64_parse_opt_result parse_res
12200 = aarch64_parse_tune (str, &tmp_tune);
12201
12202 if (parse_res == AARCH64_PARSE_OK)
12203 {
12204 gcc_assert (tmp_tune);
12205 selected_tune = tmp_tune;
12206 explicit_tune_core = selected_tune->ident;
12207 return true;
12208 }
12209
12210 switch (parse_res)
12211 {
12212 case AARCH64_PARSE_INVALID_ARG:
12213 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12214 aarch64_print_hint_for_core (str);
12215 break;
12216 default:
12217 gcc_unreachable ();
12218 }
12219
12220 return false;
12221 }
12222
12223 /* Parse an architecture extensions target attribute string specified in STR.
12224 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12225 if successful. Update aarch64_isa_flags to reflect the ISA features
12226 modified. */
12227
12228 static bool
12229 aarch64_handle_attr_isa_flags (char *str)
12230 {
12231 enum aarch64_parse_opt_result parse_res;
12232 unsigned long isa_flags = aarch64_isa_flags;
12233
12234 /* We allow "+nothing" in the beginning to clear out all architectural
12235 features if the user wants to handpick specific features. */
12236 if (strncmp ("+nothing", str, 8) == 0)
12237 {
12238 isa_flags = 0;
12239 str += 8;
12240 }
12241
12242 std::string invalid_extension;
12243 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12244
12245 if (parse_res == AARCH64_PARSE_OK)
12246 {
12247 aarch64_isa_flags = isa_flags;
12248 return true;
12249 }
12250
12251 switch (parse_res)
12252 {
12253 case AARCH64_PARSE_MISSING_ARG:
12254 error ("missing value in %<target()%> pragma or attribute");
12255 break;
12256
12257 case AARCH64_PARSE_INVALID_FEATURE:
12258 error ("invalid feature modifier %s of value (\"%s\") in "
12259 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12260 break;
12261
12262 default:
12263 gcc_unreachable ();
12264 }
12265
12266 return false;
12267 }
12268
12269 /* The target attributes that we support. On top of these we also support just
12270 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12271 handled explicitly in aarch64_process_one_target_attr. */
12272
12273 static const struct aarch64_attribute_info aarch64_attributes[] =
12274 {
12275 { "general-regs-only", aarch64_attr_mask, false, NULL,
12276 OPT_mgeneral_regs_only },
12277 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12278 OPT_mfix_cortex_a53_835769 },
12279 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12280 OPT_mfix_cortex_a53_843419 },
12281 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12282 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12283 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12284 OPT_momit_leaf_frame_pointer },
12285 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12286 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12287 OPT_march_ },
12288 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12289 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12290 OPT_mtune_ },
12291 { "branch-protection", aarch64_attr_custom, false,
12292 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12293 { "sign-return-address", aarch64_attr_enum, false, NULL,
12294 OPT_msign_return_address_ },
12295 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12296 };
12297
12298 /* Parse ARG_STR which contains the definition of one target attribute.
12299 Show appropriate errors if any or return true if the attribute is valid. */
12300
12301 static bool
12302 aarch64_process_one_target_attr (char *arg_str)
12303 {
12304 bool invert = false;
12305
12306 size_t len = strlen (arg_str);
12307
12308 if (len == 0)
12309 {
12310 error ("malformed %<target()%> pragma or attribute");
12311 return false;
12312 }
12313
12314 char *str_to_check = (char *) alloca (len + 1);
12315 strcpy (str_to_check, arg_str);
12316
12317 /* Skip leading whitespace. */
12318 while (*str_to_check == ' ' || *str_to_check == '\t')
12319 str_to_check++;
12320
12321 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12322 It is easier to detect and handle it explicitly here rather than going
12323 through the machinery for the rest of the target attributes in this
12324 function. */
12325 if (*str_to_check == '+')
12326 return aarch64_handle_attr_isa_flags (str_to_check);
12327
12328 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12329 {
12330 invert = true;
12331 str_to_check += 3;
12332 }
12333 char *arg = strchr (str_to_check, '=');
12334
12335 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12336 and point ARG to "foo". */
12337 if (arg)
12338 {
12339 *arg = '\0';
12340 arg++;
12341 }
12342 const struct aarch64_attribute_info *p_attr;
12343 bool found = false;
12344 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12345 {
12346 /* If the names don't match up, or the user has given an argument
12347 to an attribute that doesn't accept one, or didn't give an argument
12348 to an attribute that expects one, fail to match. */
12349 if (strcmp (str_to_check, p_attr->name) != 0)
12350 continue;
12351
12352 found = true;
12353 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12354 || p_attr->attr_type == aarch64_attr_enum;
12355
12356 if (attr_need_arg_p ^ (arg != NULL))
12357 {
12358 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12359 return false;
12360 }
12361
12362 /* If the name matches but the attribute does not allow "no-" versions
12363 then we can't match. */
12364 if (invert && !p_attr->allow_neg)
12365 {
12366 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12367 return false;
12368 }
12369
12370 switch (p_attr->attr_type)
12371 {
12372 /* Has a custom handler registered.
12373 For example, cpu=, arch=, tune=. */
12374 case aarch64_attr_custom:
12375 gcc_assert (p_attr->handler);
12376 if (!p_attr->handler (arg))
12377 return false;
12378 break;
12379
12380 /* Either set or unset a boolean option. */
12381 case aarch64_attr_bool:
12382 {
12383 struct cl_decoded_option decoded;
12384
12385 generate_option (p_attr->opt_num, NULL, !invert,
12386 CL_TARGET, &decoded);
12387 aarch64_handle_option (&global_options, &global_options_set,
12388 &decoded, input_location);
12389 break;
12390 }
12391 /* Set or unset a bit in the target_flags. aarch64_handle_option
12392 should know what mask to apply given the option number. */
12393 case aarch64_attr_mask:
12394 {
12395 struct cl_decoded_option decoded;
12396 /* We only need to specify the option number.
12397 aarch64_handle_option will know which mask to apply. */
12398 decoded.opt_index = p_attr->opt_num;
12399 decoded.value = !invert;
12400 aarch64_handle_option (&global_options, &global_options_set,
12401 &decoded, input_location);
12402 break;
12403 }
12404 /* Use the option setting machinery to set an option to an enum. */
12405 case aarch64_attr_enum:
12406 {
12407 gcc_assert (arg);
12408 bool valid;
12409 int value;
12410 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12411 &value, CL_TARGET);
12412 if (valid)
12413 {
12414 set_option (&global_options, NULL, p_attr->opt_num, value,
12415 NULL, DK_UNSPECIFIED, input_location,
12416 global_dc);
12417 }
12418 else
12419 {
12420 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12421 }
12422 break;
12423 }
12424 default:
12425 gcc_unreachable ();
12426 }
12427 }
12428
12429 /* If we reached here we either have found an attribute and validated
12430 it or didn't match any. If we matched an attribute but its arguments
12431 were malformed we will have returned false already. */
12432 return found;
12433 }
12434
12435 /* Count how many times the character C appears in
12436 NULL-terminated string STR. */
12437
12438 static unsigned int
12439 num_occurences_in_str (char c, char *str)
12440 {
12441 unsigned int res = 0;
12442 while (*str != '\0')
12443 {
12444 if (*str == c)
12445 res++;
12446
12447 str++;
12448 }
12449
12450 return res;
12451 }
12452
12453 /* Parse the tree in ARGS that contains the target attribute information
12454 and update the global target options space. */
12455
12456 bool
12457 aarch64_process_target_attr (tree args)
12458 {
12459 if (TREE_CODE (args) == TREE_LIST)
12460 {
12461 do
12462 {
12463 tree head = TREE_VALUE (args);
12464 if (head)
12465 {
12466 if (!aarch64_process_target_attr (head))
12467 return false;
12468 }
12469 args = TREE_CHAIN (args);
12470 } while (args);
12471
12472 return true;
12473 }
12474
12475 if (TREE_CODE (args) != STRING_CST)
12476 {
12477 error ("attribute %<target%> argument not a string");
12478 return false;
12479 }
12480
12481 size_t len = strlen (TREE_STRING_POINTER (args));
12482 char *str_to_check = (char *) alloca (len + 1);
12483 strcpy (str_to_check, TREE_STRING_POINTER (args));
12484
12485 if (len == 0)
12486 {
12487 error ("malformed %<target()%> pragma or attribute");
12488 return false;
12489 }
12490
12491 /* Used to catch empty spaces between commas i.e.
12492 attribute ((target ("attr1,,attr2"))). */
12493 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12494
12495 /* Handle multiple target attributes separated by ','. */
12496 char *token = strtok_r (str_to_check, ",", &str_to_check);
12497
12498 unsigned int num_attrs = 0;
12499 while (token)
12500 {
12501 num_attrs++;
12502 if (!aarch64_process_one_target_attr (token))
12503 {
12504 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12505 return false;
12506 }
12507
12508 token = strtok_r (NULL, ",", &str_to_check);
12509 }
12510
12511 if (num_attrs != num_commas + 1)
12512 {
12513 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12514 return false;
12515 }
12516
12517 return true;
12518 }
12519
12520 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12521 process attribute ((target ("..."))). */
12522
12523 static bool
12524 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12525 {
12526 struct cl_target_option cur_target;
12527 bool ret;
12528 tree old_optimize;
12529 tree new_target, new_optimize;
12530 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12531
12532 /* If what we're processing is the current pragma string then the
12533 target option node is already stored in target_option_current_node
12534 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12535 having to re-parse the string. This is especially useful to keep
12536 arm_neon.h compile times down since that header contains a lot
12537 of intrinsics enclosed in pragmas. */
12538 if (!existing_target && args == current_target_pragma)
12539 {
12540 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12541 return true;
12542 }
12543 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12544
12545 old_optimize = build_optimization_node (&global_options);
12546 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12547
12548 /* If the function changed the optimization levels as well as setting
12549 target options, start with the optimizations specified. */
12550 if (func_optimize && func_optimize != old_optimize)
12551 cl_optimization_restore (&global_options,
12552 TREE_OPTIMIZATION (func_optimize));
12553
12554 /* Save the current target options to restore at the end. */
12555 cl_target_option_save (&cur_target, &global_options);
12556
12557 /* If fndecl already has some target attributes applied to it, unpack
12558 them so that we add this attribute on top of them, rather than
12559 overwriting them. */
12560 if (existing_target)
12561 {
12562 struct cl_target_option *existing_options
12563 = TREE_TARGET_OPTION (existing_target);
12564
12565 if (existing_options)
12566 cl_target_option_restore (&global_options, existing_options);
12567 }
12568 else
12569 cl_target_option_restore (&global_options,
12570 TREE_TARGET_OPTION (target_option_current_node));
12571
12572 ret = aarch64_process_target_attr (args);
12573
12574 /* Set up any additional state. */
12575 if (ret)
12576 {
12577 aarch64_override_options_internal (&global_options);
12578 /* Initialize SIMD builtins if we haven't already.
12579 Set current_target_pragma to NULL for the duration so that
12580 the builtin initialization code doesn't try to tag the functions
12581 being built with the attributes specified by any current pragma, thus
12582 going into an infinite recursion. */
12583 if (TARGET_SIMD)
12584 {
12585 tree saved_current_target_pragma = current_target_pragma;
12586 current_target_pragma = NULL;
12587 aarch64_init_simd_builtins ();
12588 current_target_pragma = saved_current_target_pragma;
12589 }
12590 new_target = build_target_option_node (&global_options);
12591 }
12592 else
12593 new_target = NULL;
12594
12595 new_optimize = build_optimization_node (&global_options);
12596
12597 if (fndecl && ret)
12598 {
12599 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12600
12601 if (old_optimize != new_optimize)
12602 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12603 }
12604
12605 cl_target_option_restore (&global_options, &cur_target);
12606
12607 if (old_optimize != new_optimize)
12608 cl_optimization_restore (&global_options,
12609 TREE_OPTIMIZATION (old_optimize));
12610 return ret;
12611 }
12612
12613 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
12614 tri-bool options (yes, no, don't care) and the default value is
12615 DEF, determine whether to reject inlining. */
12616
12617 static bool
12618 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12619 int dont_care, int def)
12620 {
12621 /* If the callee doesn't care, always allow inlining. */
12622 if (callee == dont_care)
12623 return true;
12624
12625 /* If the caller doesn't care, always allow inlining. */
12626 if (caller == dont_care)
12627 return true;
12628
12629 /* Otherwise, allow inlining if either the callee and caller values
12630 agree, or if the callee is using the default value. */
12631 return (callee == caller || callee == def);
12632 }
12633
12634 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12635 to inline CALLEE into CALLER based on target-specific info.
12636 Make sure that the caller and callee have compatible architectural
12637 features. Then go through the other possible target attributes
12638 and see if they can block inlining. Try not to reject always_inline
12639 callees unless they are incompatible architecturally. */
12640
12641 static bool
12642 aarch64_can_inline_p (tree caller, tree callee)
12643 {
12644 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12645 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12646
12647 struct cl_target_option *caller_opts
12648 = TREE_TARGET_OPTION (caller_tree ? caller_tree
12649 : target_option_default_node);
12650
12651 struct cl_target_option *callee_opts
12652 = TREE_TARGET_OPTION (callee_tree ? callee_tree
12653 : target_option_default_node);
12654
12655 /* Callee's ISA flags should be a subset of the caller's. */
12656 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12657 != callee_opts->x_aarch64_isa_flags)
12658 return false;
12659
12660 /* Allow non-strict aligned functions inlining into strict
12661 aligned ones. */
12662 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12663 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12664 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12665 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12666 return false;
12667
12668 bool always_inline = lookup_attribute ("always_inline",
12669 DECL_ATTRIBUTES (callee));
12670
12671 /* If the architectural features match up and the callee is always_inline
12672 then the other attributes don't matter. */
12673 if (always_inline)
12674 return true;
12675
12676 if (caller_opts->x_aarch64_cmodel_var
12677 != callee_opts->x_aarch64_cmodel_var)
12678 return false;
12679
12680 if (caller_opts->x_aarch64_tls_dialect
12681 != callee_opts->x_aarch64_tls_dialect)
12682 return false;
12683
12684 /* Honour explicit requests to workaround errata. */
12685 if (!aarch64_tribools_ok_for_inlining_p (
12686 caller_opts->x_aarch64_fix_a53_err835769,
12687 callee_opts->x_aarch64_fix_a53_err835769,
12688 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12689 return false;
12690
12691 if (!aarch64_tribools_ok_for_inlining_p (
12692 caller_opts->x_aarch64_fix_a53_err843419,
12693 callee_opts->x_aarch64_fix_a53_err843419,
12694 2, TARGET_FIX_ERR_A53_843419))
12695 return false;
12696
12697 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12698 caller and calle and they don't match up, reject inlining. */
12699 if (!aarch64_tribools_ok_for_inlining_p (
12700 caller_opts->x_flag_omit_leaf_frame_pointer,
12701 callee_opts->x_flag_omit_leaf_frame_pointer,
12702 2, 1))
12703 return false;
12704
12705 /* If the callee has specific tuning overrides, respect them. */
12706 if (callee_opts->x_aarch64_override_tune_string != NULL
12707 && caller_opts->x_aarch64_override_tune_string == NULL)
12708 return false;
12709
12710 /* If the user specified tuning override strings for the
12711 caller and callee and they don't match up, reject inlining.
12712 We just do a string compare here, we don't analyze the meaning
12713 of the string, as it would be too costly for little gain. */
12714 if (callee_opts->x_aarch64_override_tune_string
12715 && caller_opts->x_aarch64_override_tune_string
12716 && (strcmp (callee_opts->x_aarch64_override_tune_string,
12717 caller_opts->x_aarch64_override_tune_string) != 0))
12718 return false;
12719
12720 return true;
12721 }
12722
12723 /* Return true if SYMBOL_REF X binds locally. */
12724
12725 static bool
12726 aarch64_symbol_binds_local_p (const_rtx x)
12727 {
12728 return (SYMBOL_REF_DECL (x)
12729 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
12730 : SYMBOL_REF_LOCAL_P (x));
12731 }
12732
12733 /* Return true if SYMBOL_REF X is thread local */
12734 static bool
12735 aarch64_tls_symbol_p (rtx x)
12736 {
12737 if (! TARGET_HAVE_TLS)
12738 return false;
12739
12740 if (GET_CODE (x) != SYMBOL_REF)
12741 return false;
12742
12743 return SYMBOL_REF_TLS_MODEL (x) != 0;
12744 }
12745
12746 /* Classify a TLS symbol into one of the TLS kinds. */
12747 enum aarch64_symbol_type
12748 aarch64_classify_tls_symbol (rtx x)
12749 {
12750 enum tls_model tls_kind = tls_symbolic_operand_type (x);
12751
12752 switch (tls_kind)
12753 {
12754 case TLS_MODEL_GLOBAL_DYNAMIC:
12755 case TLS_MODEL_LOCAL_DYNAMIC:
12756 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
12757
12758 case TLS_MODEL_INITIAL_EXEC:
12759 switch (aarch64_cmodel)
12760 {
12761 case AARCH64_CMODEL_TINY:
12762 case AARCH64_CMODEL_TINY_PIC:
12763 return SYMBOL_TINY_TLSIE;
12764 default:
12765 return SYMBOL_SMALL_TLSIE;
12766 }
12767
12768 case TLS_MODEL_LOCAL_EXEC:
12769 if (aarch64_tls_size == 12)
12770 return SYMBOL_TLSLE12;
12771 else if (aarch64_tls_size == 24)
12772 return SYMBOL_TLSLE24;
12773 else if (aarch64_tls_size == 32)
12774 return SYMBOL_TLSLE32;
12775 else if (aarch64_tls_size == 48)
12776 return SYMBOL_TLSLE48;
12777 else
12778 gcc_unreachable ();
12779
12780 case TLS_MODEL_EMULATED:
12781 case TLS_MODEL_NONE:
12782 return SYMBOL_FORCE_TO_MEM;
12783
12784 default:
12785 gcc_unreachable ();
12786 }
12787 }
12788
12789 /* Return the correct method for accessing X + OFFSET, where X is either
12790 a SYMBOL_REF or LABEL_REF. */
12791
12792 enum aarch64_symbol_type
12793 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
12794 {
12795 if (GET_CODE (x) == LABEL_REF)
12796 {
12797 switch (aarch64_cmodel)
12798 {
12799 case AARCH64_CMODEL_LARGE:
12800 return SYMBOL_FORCE_TO_MEM;
12801
12802 case AARCH64_CMODEL_TINY_PIC:
12803 case AARCH64_CMODEL_TINY:
12804 return SYMBOL_TINY_ABSOLUTE;
12805
12806 case AARCH64_CMODEL_SMALL_SPIC:
12807 case AARCH64_CMODEL_SMALL_PIC:
12808 case AARCH64_CMODEL_SMALL:
12809 return SYMBOL_SMALL_ABSOLUTE;
12810
12811 default:
12812 gcc_unreachable ();
12813 }
12814 }
12815
12816 if (GET_CODE (x) == SYMBOL_REF)
12817 {
12818 if (aarch64_tls_symbol_p (x))
12819 return aarch64_classify_tls_symbol (x);
12820
12821 switch (aarch64_cmodel)
12822 {
12823 case AARCH64_CMODEL_TINY:
12824 /* When we retrieve symbol + offset address, we have to make sure
12825 the offset does not cause overflow of the final address. But
12826 we have no way of knowing the address of symbol at compile time
12827 so we can't accurately say if the distance between the PC and
12828 symbol + offset is outside the addressible range of +/-1M in the
12829 TINY code model. So we rely on images not being greater than
12830 1M and cap the offset at 1M and anything beyond 1M will have to
12831 be loaded using an alternative mechanism. Furthermore if the
12832 symbol is a weak reference to something that isn't known to
12833 resolve to a symbol in this module, then force to memory. */
12834 if ((SYMBOL_REF_WEAK (x)
12835 && !aarch64_symbol_binds_local_p (x))
12836 || !IN_RANGE (offset, -1048575, 1048575))
12837 return SYMBOL_FORCE_TO_MEM;
12838 return SYMBOL_TINY_ABSOLUTE;
12839
12840 case AARCH64_CMODEL_SMALL:
12841 /* Same reasoning as the tiny code model, but the offset cap here is
12842 4G. */
12843 if ((SYMBOL_REF_WEAK (x)
12844 && !aarch64_symbol_binds_local_p (x))
12845 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
12846 HOST_WIDE_INT_C (4294967264)))
12847 return SYMBOL_FORCE_TO_MEM;
12848 return SYMBOL_SMALL_ABSOLUTE;
12849
12850 case AARCH64_CMODEL_TINY_PIC:
12851 if (!aarch64_symbol_binds_local_p (x))
12852 return SYMBOL_TINY_GOT;
12853 return SYMBOL_TINY_ABSOLUTE;
12854
12855 case AARCH64_CMODEL_SMALL_SPIC:
12856 case AARCH64_CMODEL_SMALL_PIC:
12857 if (!aarch64_symbol_binds_local_p (x))
12858 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
12859 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
12860 return SYMBOL_SMALL_ABSOLUTE;
12861
12862 case AARCH64_CMODEL_LARGE:
12863 /* This is alright even in PIC code as the constant
12864 pool reference is always PC relative and within
12865 the same translation unit. */
12866 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
12867 return SYMBOL_SMALL_ABSOLUTE;
12868 else
12869 return SYMBOL_FORCE_TO_MEM;
12870
12871 default:
12872 gcc_unreachable ();
12873 }
12874 }
12875
12876 /* By default push everything into the constant pool. */
12877 return SYMBOL_FORCE_TO_MEM;
12878 }
12879
12880 bool
12881 aarch64_constant_address_p (rtx x)
12882 {
12883 return (CONSTANT_P (x) && memory_address_p (DImode, x));
12884 }
12885
12886 bool
12887 aarch64_legitimate_pic_operand_p (rtx x)
12888 {
12889 if (GET_CODE (x) == SYMBOL_REF
12890 || (GET_CODE (x) == CONST
12891 && GET_CODE (XEXP (x, 0)) == PLUS
12892 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
12893 return false;
12894
12895 return true;
12896 }
12897
12898 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
12899 that should be rematerialized rather than spilled. */
12900
12901 static bool
12902 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
12903 {
12904 /* Support CSE and rematerialization of common constants. */
12905 if (CONST_INT_P (x)
12906 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
12907 || GET_CODE (x) == CONST_VECTOR)
12908 return true;
12909
12910 /* Do not allow vector struct mode constants for Advanced SIMD.
12911 We could support 0 and -1 easily, but they need support in
12912 aarch64-simd.md. */
12913 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12914 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12915 return false;
12916
12917 /* Only accept variable-length vector constants if they can be
12918 handled directly.
12919
12920 ??? It would be possible to handle rematerialization of other
12921 constants via secondary reloads. */
12922 if (vec_flags & VEC_ANY_SVE)
12923 return aarch64_simd_valid_immediate (x, NULL);
12924
12925 if (GET_CODE (x) == HIGH)
12926 x = XEXP (x, 0);
12927
12928 /* Accept polynomial constants that can be calculated by using the
12929 destination of a move as the sole temporary. Constants that
12930 require a second temporary cannot be rematerialized (they can't be
12931 forced to memory and also aren't legitimate constants). */
12932 poly_int64 offset;
12933 if (poly_int_rtx_p (x, &offset))
12934 return aarch64_offset_temporaries (false, offset) <= 1;
12935
12936 /* If an offset is being added to something else, we need to allow the
12937 base to be moved into the destination register, meaning that there
12938 are no free temporaries for the offset. */
12939 x = strip_offset (x, &offset);
12940 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12941 return false;
12942
12943 /* Do not allow const (plus (anchor_symbol, const_int)). */
12944 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12945 return false;
12946
12947 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12948 so spilling them is better than rematerialization. */
12949 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12950 return true;
12951
12952 /* Label references are always constant. */
12953 if (GET_CODE (x) == LABEL_REF)
12954 return true;
12955
12956 return false;
12957 }
12958
12959 rtx
12960 aarch64_load_tp (rtx target)
12961 {
12962 if (!target
12963 || GET_MODE (target) != Pmode
12964 || !register_operand (target, Pmode))
12965 target = gen_reg_rtx (Pmode);
12966
12967 /* Can return in any reg. */
12968 emit_insn (gen_aarch64_load_tp_hard (target));
12969 return target;
12970 }
12971
12972 /* On AAPCS systems, this is the "struct __va_list". */
12973 static GTY(()) tree va_list_type;
12974
12975 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12976 Return the type to use as __builtin_va_list.
12977
12978 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12979
12980 struct __va_list
12981 {
12982 void *__stack;
12983 void *__gr_top;
12984 void *__vr_top;
12985 int __gr_offs;
12986 int __vr_offs;
12987 }; */
12988
12989 static tree
12990 aarch64_build_builtin_va_list (void)
12991 {
12992 tree va_list_name;
12993 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12994
12995 /* Create the type. */
12996 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12997 /* Give it the required name. */
12998 va_list_name = build_decl (BUILTINS_LOCATION,
12999 TYPE_DECL,
13000 get_identifier ("__va_list"),
13001 va_list_type);
13002 DECL_ARTIFICIAL (va_list_name) = 1;
13003 TYPE_NAME (va_list_type) = va_list_name;
13004 TYPE_STUB_DECL (va_list_type) = va_list_name;
13005
13006 /* Create the fields. */
13007 f_stack = build_decl (BUILTINS_LOCATION,
13008 FIELD_DECL, get_identifier ("__stack"),
13009 ptr_type_node);
13010 f_grtop = build_decl (BUILTINS_LOCATION,
13011 FIELD_DECL, get_identifier ("__gr_top"),
13012 ptr_type_node);
13013 f_vrtop = build_decl (BUILTINS_LOCATION,
13014 FIELD_DECL, get_identifier ("__vr_top"),
13015 ptr_type_node);
13016 f_groff = build_decl (BUILTINS_LOCATION,
13017 FIELD_DECL, get_identifier ("__gr_offs"),
13018 integer_type_node);
13019 f_vroff = build_decl (BUILTINS_LOCATION,
13020 FIELD_DECL, get_identifier ("__vr_offs"),
13021 integer_type_node);
13022
13023 /* Tell tree-stdarg pass about our internal offset fields.
13024 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
13025 purpose to identify whether the code is updating va_list internal
13026 offset fields through irregular way. */
13027 va_list_gpr_counter_field = f_groff;
13028 va_list_fpr_counter_field = f_vroff;
13029
13030 DECL_ARTIFICIAL (f_stack) = 1;
13031 DECL_ARTIFICIAL (f_grtop) = 1;
13032 DECL_ARTIFICIAL (f_vrtop) = 1;
13033 DECL_ARTIFICIAL (f_groff) = 1;
13034 DECL_ARTIFICIAL (f_vroff) = 1;
13035
13036 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13037 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13038 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13039 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13040 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13041
13042 TYPE_FIELDS (va_list_type) = f_stack;
13043 DECL_CHAIN (f_stack) = f_grtop;
13044 DECL_CHAIN (f_grtop) = f_vrtop;
13045 DECL_CHAIN (f_vrtop) = f_groff;
13046 DECL_CHAIN (f_groff) = f_vroff;
13047
13048 /* Compute its layout. */
13049 layout_type (va_list_type);
13050
13051 return va_list_type;
13052 }
13053
13054 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13055 static void
13056 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13057 {
13058 const CUMULATIVE_ARGS *cum;
13059 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13060 tree stack, grtop, vrtop, groff, vroff;
13061 tree t;
13062 int gr_save_area_size = cfun->va_list_gpr_size;
13063 int vr_save_area_size = cfun->va_list_fpr_size;
13064 int vr_offset;
13065
13066 cum = &crtl->args.info;
13067 if (cfun->va_list_gpr_size)
13068 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13069 cfun->va_list_gpr_size);
13070 if (cfun->va_list_fpr_size)
13071 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13072 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13073
13074 if (!TARGET_FLOAT)
13075 {
13076 gcc_assert (cum->aapcs_nvrn == 0);
13077 vr_save_area_size = 0;
13078 }
13079
13080 f_stack = TYPE_FIELDS (va_list_type_node);
13081 f_grtop = DECL_CHAIN (f_stack);
13082 f_vrtop = DECL_CHAIN (f_grtop);
13083 f_groff = DECL_CHAIN (f_vrtop);
13084 f_vroff = DECL_CHAIN (f_groff);
13085
13086 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13087 NULL_TREE);
13088 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13089 NULL_TREE);
13090 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13091 NULL_TREE);
13092 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13093 NULL_TREE);
13094 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13095 NULL_TREE);
13096
13097 /* Emit code to initialize STACK, which points to the next varargs stack
13098 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13099 by named arguments. STACK is 8-byte aligned. */
13100 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13101 if (cum->aapcs_stack_size > 0)
13102 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13103 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13104 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13105
13106 /* Emit code to initialize GRTOP, the top of the GR save area.
13107 virtual_incoming_args_rtx should have been 16 byte aligned. */
13108 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13109 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13110 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13111
13112 /* Emit code to initialize VRTOP, the top of the VR save area.
13113 This address is gr_save_area_bytes below GRTOP, rounded
13114 down to the next 16-byte boundary. */
13115 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13116 vr_offset = ROUND_UP (gr_save_area_size,
13117 STACK_BOUNDARY / BITS_PER_UNIT);
13118
13119 if (vr_offset)
13120 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13121 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13122 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13123
13124 /* Emit code to initialize GROFF, the offset from GRTOP of the
13125 next GPR argument. */
13126 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13127 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13128 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13129
13130 /* Likewise emit code to initialize VROFF, the offset from FTOP
13131 of the next VR argument. */
13132 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13133 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13134 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13135 }
13136
13137 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13138
13139 static tree
13140 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13141 gimple_seq *post_p ATTRIBUTE_UNUSED)
13142 {
13143 tree addr;
13144 bool indirect_p;
13145 bool is_ha; /* is HFA or HVA. */
13146 bool dw_align; /* double-word align. */
13147 machine_mode ag_mode = VOIDmode;
13148 int nregs;
13149 machine_mode mode;
13150
13151 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13152 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13153 HOST_WIDE_INT size, rsize, adjust, align;
13154 tree t, u, cond1, cond2;
13155
13156 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13157 if (indirect_p)
13158 type = build_pointer_type (type);
13159
13160 mode = TYPE_MODE (type);
13161
13162 f_stack = TYPE_FIELDS (va_list_type_node);
13163 f_grtop = DECL_CHAIN (f_stack);
13164 f_vrtop = DECL_CHAIN (f_grtop);
13165 f_groff = DECL_CHAIN (f_vrtop);
13166 f_vroff = DECL_CHAIN (f_groff);
13167
13168 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13169 f_stack, NULL_TREE);
13170 size = int_size_in_bytes (type);
13171 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
13172
13173 dw_align = false;
13174 adjust = 0;
13175 if (aarch64_vfp_is_call_or_return_candidate (mode,
13176 type,
13177 &ag_mode,
13178 &nregs,
13179 &is_ha))
13180 {
13181 /* No frontends can create types with variable-sized modes, so we
13182 shouldn't be asked to pass or return them. */
13183 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13184
13185 /* TYPE passed in fp/simd registers. */
13186 if (!TARGET_FLOAT)
13187 aarch64_err_no_fpadvsimd (mode);
13188
13189 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13190 unshare_expr (valist), f_vrtop, NULL_TREE);
13191 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13192 unshare_expr (valist), f_vroff, NULL_TREE);
13193
13194 rsize = nregs * UNITS_PER_VREG;
13195
13196 if (is_ha)
13197 {
13198 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13199 adjust = UNITS_PER_VREG - ag_size;
13200 }
13201 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13202 && size < UNITS_PER_VREG)
13203 {
13204 adjust = UNITS_PER_VREG - size;
13205 }
13206 }
13207 else
13208 {
13209 /* TYPE passed in general registers. */
13210 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13211 unshare_expr (valist), f_grtop, NULL_TREE);
13212 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13213 unshare_expr (valist), f_groff, NULL_TREE);
13214 rsize = ROUND_UP (size, UNITS_PER_WORD);
13215 nregs = rsize / UNITS_PER_WORD;
13216
13217 if (align > 8)
13218 dw_align = true;
13219
13220 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13221 && size < UNITS_PER_WORD)
13222 {
13223 adjust = UNITS_PER_WORD - size;
13224 }
13225 }
13226
13227 /* Get a local temporary for the field value. */
13228 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13229
13230 /* Emit code to branch if off >= 0. */
13231 t = build2 (GE_EXPR, boolean_type_node, off,
13232 build_int_cst (TREE_TYPE (off), 0));
13233 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13234
13235 if (dw_align)
13236 {
13237 /* Emit: offs = (offs + 15) & -16. */
13238 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13239 build_int_cst (TREE_TYPE (off), 15));
13240 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13241 build_int_cst (TREE_TYPE (off), -16));
13242 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13243 }
13244 else
13245 roundup = NULL;
13246
13247 /* Update ap.__[g|v]r_offs */
13248 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13249 build_int_cst (TREE_TYPE (off), rsize));
13250 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13251
13252 /* String up. */
13253 if (roundup)
13254 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13255
13256 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13257 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13258 build_int_cst (TREE_TYPE (f_off), 0));
13259 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13260
13261 /* String up: make sure the assignment happens before the use. */
13262 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13263 COND_EXPR_ELSE (cond1) = t;
13264
13265 /* Prepare the trees handling the argument that is passed on the stack;
13266 the top level node will store in ON_STACK. */
13267 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13268 if (align > 8)
13269 {
13270 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13271 t = fold_build_pointer_plus_hwi (arg, 15);
13272 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13273 build_int_cst (TREE_TYPE (t), -16));
13274 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13275 }
13276 else
13277 roundup = NULL;
13278 /* Advance ap.__stack */
13279 t = fold_build_pointer_plus_hwi (arg, size + 7);
13280 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13281 build_int_cst (TREE_TYPE (t), -8));
13282 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13283 /* String up roundup and advance. */
13284 if (roundup)
13285 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13286 /* String up with arg */
13287 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13288 /* Big-endianness related address adjustment. */
13289 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13290 && size < UNITS_PER_WORD)
13291 {
13292 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13293 size_int (UNITS_PER_WORD - size));
13294 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13295 }
13296
13297 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13298 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13299
13300 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13301 t = off;
13302 if (adjust)
13303 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13304 build_int_cst (TREE_TYPE (off), adjust));
13305
13306 t = fold_convert (sizetype, t);
13307 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13308
13309 if (is_ha)
13310 {
13311 /* type ha; // treat as "struct {ftype field[n];}"
13312 ... [computing offs]
13313 for (i = 0; i <nregs; ++i, offs += 16)
13314 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13315 return ha; */
13316 int i;
13317 tree tmp_ha, field_t, field_ptr_t;
13318
13319 /* Declare a local variable. */
13320 tmp_ha = create_tmp_var_raw (type, "ha");
13321 gimple_add_tmp_var (tmp_ha);
13322
13323 /* Establish the base type. */
13324 switch (ag_mode)
13325 {
13326 case E_SFmode:
13327 field_t = float_type_node;
13328 field_ptr_t = float_ptr_type_node;
13329 break;
13330 case E_DFmode:
13331 field_t = double_type_node;
13332 field_ptr_t = double_ptr_type_node;
13333 break;
13334 case E_TFmode:
13335 field_t = long_double_type_node;
13336 field_ptr_t = long_double_ptr_type_node;
13337 break;
13338 case E_HFmode:
13339 field_t = aarch64_fp16_type_node;
13340 field_ptr_t = aarch64_fp16_ptr_type_node;
13341 break;
13342 case E_V2SImode:
13343 case E_V4SImode:
13344 {
13345 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13346 field_t = build_vector_type_for_mode (innertype, ag_mode);
13347 field_ptr_t = build_pointer_type (field_t);
13348 }
13349 break;
13350 default:
13351 gcc_assert (0);
13352 }
13353
13354 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
13355 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13356 addr = t;
13357 t = fold_convert (field_ptr_t, addr);
13358 t = build2 (MODIFY_EXPR, field_t,
13359 build1 (INDIRECT_REF, field_t, tmp_ha),
13360 build1 (INDIRECT_REF, field_t, t));
13361
13362 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13363 for (i = 1; i < nregs; ++i)
13364 {
13365 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13366 u = fold_convert (field_ptr_t, addr);
13367 u = build2 (MODIFY_EXPR, field_t,
13368 build2 (MEM_REF, field_t, tmp_ha,
13369 build_int_cst (field_ptr_t,
13370 (i *
13371 int_size_in_bytes (field_t)))),
13372 build1 (INDIRECT_REF, field_t, u));
13373 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13374 }
13375
13376 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13377 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13378 }
13379
13380 COND_EXPR_ELSE (cond2) = t;
13381 addr = fold_convert (build_pointer_type (type), cond1);
13382 addr = build_va_arg_indirect_ref (addr);
13383
13384 if (indirect_p)
13385 addr = build_va_arg_indirect_ref (addr);
13386
13387 return addr;
13388 }
13389
13390 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13391
13392 static void
13393 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13394 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13395 int no_rtl)
13396 {
13397 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13398 CUMULATIVE_ARGS local_cum;
13399 int gr_saved = cfun->va_list_gpr_size;
13400 int vr_saved = cfun->va_list_fpr_size;
13401
13402 /* The caller has advanced CUM up to, but not beyond, the last named
13403 argument. Advance a local copy of CUM past the last "real" named
13404 argument, to find out how many registers are left over. */
13405 local_cum = *cum;
13406 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
13407
13408 /* Found out how many registers we need to save.
13409 Honor tree-stdvar analysis results. */
13410 if (cfun->va_list_gpr_size)
13411 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13412 cfun->va_list_gpr_size / UNITS_PER_WORD);
13413 if (cfun->va_list_fpr_size)
13414 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13415 cfun->va_list_fpr_size / UNITS_PER_VREG);
13416
13417 if (!TARGET_FLOAT)
13418 {
13419 gcc_assert (local_cum.aapcs_nvrn == 0);
13420 vr_saved = 0;
13421 }
13422
13423 if (!no_rtl)
13424 {
13425 if (gr_saved > 0)
13426 {
13427 rtx ptr, mem;
13428
13429 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13430 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13431 - gr_saved * UNITS_PER_WORD);
13432 mem = gen_frame_mem (BLKmode, ptr);
13433 set_mem_alias_set (mem, get_varargs_alias_set ());
13434
13435 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13436 mem, gr_saved);
13437 }
13438 if (vr_saved > 0)
13439 {
13440 /* We can't use move_block_from_reg, because it will use
13441 the wrong mode, storing D regs only. */
13442 machine_mode mode = TImode;
13443 int off, i, vr_start;
13444
13445 /* Set OFF to the offset from virtual_incoming_args_rtx of
13446 the first vector register. The VR save area lies below
13447 the GR one, and is aligned to 16 bytes. */
13448 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13449 STACK_BOUNDARY / BITS_PER_UNIT);
13450 off -= vr_saved * UNITS_PER_VREG;
13451
13452 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13453 for (i = 0; i < vr_saved; ++i)
13454 {
13455 rtx ptr, mem;
13456
13457 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13458 mem = gen_frame_mem (mode, ptr);
13459 set_mem_alias_set (mem, get_varargs_alias_set ());
13460 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13461 off += UNITS_PER_VREG;
13462 }
13463 }
13464 }
13465
13466 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13467 any complication of having crtl->args.pretend_args_size changed. */
13468 cfun->machine->frame.saved_varargs_size
13469 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13470 STACK_BOUNDARY / BITS_PER_UNIT)
13471 + vr_saved * UNITS_PER_VREG);
13472 }
13473
13474 static void
13475 aarch64_conditional_register_usage (void)
13476 {
13477 int i;
13478 if (!TARGET_FLOAT)
13479 {
13480 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13481 {
13482 fixed_regs[i] = 1;
13483 call_used_regs[i] = 1;
13484 }
13485 }
13486 if (!TARGET_SVE)
13487 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13488 {
13489 fixed_regs[i] = 1;
13490 call_used_regs[i] = 1;
13491 }
13492
13493 /* When tracking speculation, we need a couple of call-clobbered registers
13494 to track the speculation state. It would be nice to just use
13495 IP0 and IP1, but currently there are numerous places that just
13496 assume these registers are free for other uses (eg pointer
13497 authentication). */
13498 if (aarch64_track_speculation)
13499 {
13500 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13501 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13502 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13503 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13504 }
13505 }
13506
13507 /* Walk down the type tree of TYPE counting consecutive base elements.
13508 If *MODEP is VOIDmode, then set it to the first valid floating point
13509 type. If a non-floating point type is found, or if a floating point
13510 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13511 otherwise return the count in the sub-tree. */
13512 static int
13513 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13514 {
13515 machine_mode mode;
13516 HOST_WIDE_INT size;
13517
13518 switch (TREE_CODE (type))
13519 {
13520 case REAL_TYPE:
13521 mode = TYPE_MODE (type);
13522 if (mode != DFmode && mode != SFmode
13523 && mode != TFmode && mode != HFmode)
13524 return -1;
13525
13526 if (*modep == VOIDmode)
13527 *modep = mode;
13528
13529 if (*modep == mode)
13530 return 1;
13531
13532 break;
13533
13534 case COMPLEX_TYPE:
13535 mode = TYPE_MODE (TREE_TYPE (type));
13536 if (mode != DFmode && mode != SFmode
13537 && mode != TFmode && mode != HFmode)
13538 return -1;
13539
13540 if (*modep == VOIDmode)
13541 *modep = mode;
13542
13543 if (*modep == mode)
13544 return 2;
13545
13546 break;
13547
13548 case VECTOR_TYPE:
13549 /* Use V2SImode and V4SImode as representatives of all 64-bit
13550 and 128-bit vector types. */
13551 size = int_size_in_bytes (type);
13552 switch (size)
13553 {
13554 case 8:
13555 mode = V2SImode;
13556 break;
13557 case 16:
13558 mode = V4SImode;
13559 break;
13560 default:
13561 return -1;
13562 }
13563
13564 if (*modep == VOIDmode)
13565 *modep = mode;
13566
13567 /* Vector modes are considered to be opaque: two vectors are
13568 equivalent for the purposes of being homogeneous aggregates
13569 if they are the same size. */
13570 if (*modep == mode)
13571 return 1;
13572
13573 break;
13574
13575 case ARRAY_TYPE:
13576 {
13577 int count;
13578 tree index = TYPE_DOMAIN (type);
13579
13580 /* Can't handle incomplete types nor sizes that are not
13581 fixed. */
13582 if (!COMPLETE_TYPE_P (type)
13583 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13584 return -1;
13585
13586 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13587 if (count == -1
13588 || !index
13589 || !TYPE_MAX_VALUE (index)
13590 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13591 || !TYPE_MIN_VALUE (index)
13592 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13593 || count < 0)
13594 return -1;
13595
13596 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13597 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13598
13599 /* There must be no padding. */
13600 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13601 count * GET_MODE_BITSIZE (*modep)))
13602 return -1;
13603
13604 return count;
13605 }
13606
13607 case RECORD_TYPE:
13608 {
13609 int count = 0;
13610 int sub_count;
13611 tree field;
13612
13613 /* Can't handle incomplete types nor sizes that are not
13614 fixed. */
13615 if (!COMPLETE_TYPE_P (type)
13616 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13617 return -1;
13618
13619 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13620 {
13621 if (TREE_CODE (field) != FIELD_DECL)
13622 continue;
13623
13624 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13625 if (sub_count < 0)
13626 return -1;
13627 count += sub_count;
13628 }
13629
13630 /* There must be no padding. */
13631 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13632 count * GET_MODE_BITSIZE (*modep)))
13633 return -1;
13634
13635 return count;
13636 }
13637
13638 case UNION_TYPE:
13639 case QUAL_UNION_TYPE:
13640 {
13641 /* These aren't very interesting except in a degenerate case. */
13642 int count = 0;
13643 int sub_count;
13644 tree field;
13645
13646 /* Can't handle incomplete types nor sizes that are not
13647 fixed. */
13648 if (!COMPLETE_TYPE_P (type)
13649 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13650 return -1;
13651
13652 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13653 {
13654 if (TREE_CODE (field) != FIELD_DECL)
13655 continue;
13656
13657 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13658 if (sub_count < 0)
13659 return -1;
13660 count = count > sub_count ? count : sub_count;
13661 }
13662
13663 /* There must be no padding. */
13664 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13665 count * GET_MODE_BITSIZE (*modep)))
13666 return -1;
13667
13668 return count;
13669 }
13670
13671 default:
13672 break;
13673 }
13674
13675 return -1;
13676 }
13677
13678 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13679 type as described in AAPCS64 \S 4.1.2.
13680
13681 See the comment above aarch64_composite_type_p for the notes on MODE. */
13682
13683 static bool
13684 aarch64_short_vector_p (const_tree type,
13685 machine_mode mode)
13686 {
13687 poly_int64 size = -1;
13688
13689 if (type && TREE_CODE (type) == VECTOR_TYPE)
13690 size = int_size_in_bytes (type);
13691 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13692 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13693 size = GET_MODE_SIZE (mode);
13694
13695 return known_eq (size, 8) || known_eq (size, 16);
13696 }
13697
13698 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13699 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
13700 array types. The C99 floating-point complex types are also considered
13701 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
13702 types, which are GCC extensions and out of the scope of AAPCS64, are
13703 treated as composite types here as well.
13704
13705 Note that MODE itself is not sufficient in determining whether a type
13706 is such a composite type or not. This is because
13707 stor-layout.c:compute_record_mode may have already changed the MODE
13708 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
13709 structure with only one field may have its MODE set to the mode of the
13710 field. Also an integer mode whose size matches the size of the
13711 RECORD_TYPE type may be used to substitute the original mode
13712 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
13713 solely relied on. */
13714
13715 static bool
13716 aarch64_composite_type_p (const_tree type,
13717 machine_mode mode)
13718 {
13719 if (aarch64_short_vector_p (type, mode))
13720 return false;
13721
13722 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
13723 return true;
13724
13725 if (mode == BLKmode
13726 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
13727 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
13728 return true;
13729
13730 return false;
13731 }
13732
13733 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13734 shall be passed or returned in simd/fp register(s) (providing these
13735 parameter passing registers are available).
13736
13737 Upon successful return, *COUNT returns the number of needed registers,
13738 *BASE_MODE returns the mode of the individual register and when IS_HAF
13739 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
13740 floating-point aggregate or a homogeneous short-vector aggregate. */
13741
13742 static bool
13743 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
13744 const_tree type,
13745 machine_mode *base_mode,
13746 int *count,
13747 bool *is_ha)
13748 {
13749 machine_mode new_mode = VOIDmode;
13750 bool composite_p = aarch64_composite_type_p (type, mode);
13751
13752 if (is_ha != NULL) *is_ha = false;
13753
13754 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
13755 || aarch64_short_vector_p (type, mode))
13756 {
13757 *count = 1;
13758 new_mode = mode;
13759 }
13760 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
13761 {
13762 if (is_ha != NULL) *is_ha = true;
13763 *count = 2;
13764 new_mode = GET_MODE_INNER (mode);
13765 }
13766 else if (type && composite_p)
13767 {
13768 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
13769
13770 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
13771 {
13772 if (is_ha != NULL) *is_ha = true;
13773 *count = ag_count;
13774 }
13775 else
13776 return false;
13777 }
13778 else
13779 return false;
13780
13781 *base_mode = new_mode;
13782 return true;
13783 }
13784
13785 /* Implement TARGET_STRUCT_VALUE_RTX. */
13786
13787 static rtx
13788 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
13789 int incoming ATTRIBUTE_UNUSED)
13790 {
13791 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
13792 }
13793
13794 /* Implements target hook vector_mode_supported_p. */
13795 static bool
13796 aarch64_vector_mode_supported_p (machine_mode mode)
13797 {
13798 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13799 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
13800 }
13801
13802 /* Return appropriate SIMD container
13803 for MODE within a vector of WIDTH bits. */
13804 static machine_mode
13805 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
13806 {
13807 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
13808 switch (mode)
13809 {
13810 case E_DFmode:
13811 return VNx2DFmode;
13812 case E_SFmode:
13813 return VNx4SFmode;
13814 case E_HFmode:
13815 return VNx8HFmode;
13816 case E_DImode:
13817 return VNx2DImode;
13818 case E_SImode:
13819 return VNx4SImode;
13820 case E_HImode:
13821 return VNx8HImode;
13822 case E_QImode:
13823 return VNx16QImode;
13824 default:
13825 return word_mode;
13826 }
13827
13828 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
13829 if (TARGET_SIMD)
13830 {
13831 if (known_eq (width, 128))
13832 switch (mode)
13833 {
13834 case E_DFmode:
13835 return V2DFmode;
13836 case E_SFmode:
13837 return V4SFmode;
13838 case E_HFmode:
13839 return V8HFmode;
13840 case E_SImode:
13841 return V4SImode;
13842 case E_HImode:
13843 return V8HImode;
13844 case E_QImode:
13845 return V16QImode;
13846 case E_DImode:
13847 return V2DImode;
13848 default:
13849 break;
13850 }
13851 else
13852 switch (mode)
13853 {
13854 case E_SFmode:
13855 return V2SFmode;
13856 case E_HFmode:
13857 return V4HFmode;
13858 case E_SImode:
13859 return V2SImode;
13860 case E_HImode:
13861 return V4HImode;
13862 case E_QImode:
13863 return V8QImode;
13864 default:
13865 break;
13866 }
13867 }
13868 return word_mode;
13869 }
13870
13871 /* Return 128-bit container as the preferred SIMD mode for MODE. */
13872 static machine_mode
13873 aarch64_preferred_simd_mode (scalar_mode mode)
13874 {
13875 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
13876 return aarch64_simd_container_mode (mode, bits);
13877 }
13878
13879 /* Return a list of possible vector sizes for the vectorizer
13880 to iterate over. */
13881 static void
13882 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
13883 {
13884 if (TARGET_SVE)
13885 sizes->safe_push (BYTES_PER_SVE_VECTOR);
13886 sizes->safe_push (16);
13887 sizes->safe_push (8);
13888 }
13889
13890 /* Implement TARGET_MANGLE_TYPE. */
13891
13892 static const char *
13893 aarch64_mangle_type (const_tree type)
13894 {
13895 /* The AArch64 ABI documents say that "__va_list" has to be
13896 mangled as if it is in the "std" namespace. */
13897 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
13898 return "St9__va_list";
13899
13900 /* Half-precision float. */
13901 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
13902 return "Dh";
13903
13904 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
13905 builtin types. */
13906 if (TYPE_NAME (type) != NULL)
13907 return aarch64_mangle_builtin_type (type);
13908
13909 /* Use the default mangling. */
13910 return NULL;
13911 }
13912
13913 /* Find the first rtx_insn before insn that will generate an assembly
13914 instruction. */
13915
13916 static rtx_insn *
13917 aarch64_prev_real_insn (rtx_insn *insn)
13918 {
13919 if (!insn)
13920 return NULL;
13921
13922 do
13923 {
13924 insn = prev_real_insn (insn);
13925 }
13926 while (insn && recog_memoized (insn) < 0);
13927
13928 return insn;
13929 }
13930
13931 static bool
13932 is_madd_op (enum attr_type t1)
13933 {
13934 unsigned int i;
13935 /* A number of these may be AArch32 only. */
13936 enum attr_type mlatypes[] = {
13937 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
13938 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
13939 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
13940 };
13941
13942 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13943 {
13944 if (t1 == mlatypes[i])
13945 return true;
13946 }
13947
13948 return false;
13949 }
13950
13951 /* Check if there is a register dependency between a load and the insn
13952 for which we hold recog_data. */
13953
13954 static bool
13955 dep_between_memop_and_curr (rtx memop)
13956 {
13957 rtx load_reg;
13958 int opno;
13959
13960 gcc_assert (GET_CODE (memop) == SET);
13961
13962 if (!REG_P (SET_DEST (memop)))
13963 return false;
13964
13965 load_reg = SET_DEST (memop);
13966 for (opno = 1; opno < recog_data.n_operands; opno++)
13967 {
13968 rtx operand = recog_data.operand[opno];
13969 if (REG_P (operand)
13970 && reg_overlap_mentioned_p (load_reg, operand))
13971 return true;
13972
13973 }
13974 return false;
13975 }
13976
13977
13978 /* When working around the Cortex-A53 erratum 835769,
13979 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13980 instruction and has a preceding memory instruction such that a NOP
13981 should be inserted between them. */
13982
13983 bool
13984 aarch64_madd_needs_nop (rtx_insn* insn)
13985 {
13986 enum attr_type attr_type;
13987 rtx_insn *prev;
13988 rtx body;
13989
13990 if (!TARGET_FIX_ERR_A53_835769)
13991 return false;
13992
13993 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13994 return false;
13995
13996 attr_type = get_attr_type (insn);
13997 if (!is_madd_op (attr_type))
13998 return false;
13999
14000 prev = aarch64_prev_real_insn (insn);
14001 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14002 Restore recog state to INSN to avoid state corruption. */
14003 extract_constrain_insn_cached (insn);
14004
14005 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14006 return false;
14007
14008 body = single_set (prev);
14009
14010 /* If the previous insn is a memory op and there is no dependency between
14011 it and the DImode madd, emit a NOP between them. If body is NULL then we
14012 have a complex memory operation, probably a load/store pair.
14013 Be conservative for now and emit a NOP. */
14014 if (GET_MODE (recog_data.operand[0]) == DImode
14015 && (!body || !dep_between_memop_and_curr (body)))
14016 return true;
14017
14018 return false;
14019
14020 }
14021
14022
14023 /* Implement FINAL_PRESCAN_INSN. */
14024
14025 void
14026 aarch64_final_prescan_insn (rtx_insn *insn)
14027 {
14028 if (aarch64_madd_needs_nop (insn))
14029 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14030 }
14031
14032
14033 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14034 instruction. */
14035
14036 bool
14037 aarch64_sve_index_immediate_p (rtx base_or_step)
14038 {
14039 return (CONST_INT_P (base_or_step)
14040 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14041 }
14042
14043 /* Return true if X is a valid immediate for the SVE ADD and SUB
14044 instructions. Negate X first if NEGATE_P is true. */
14045
14046 bool
14047 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14048 {
14049 rtx elt;
14050
14051 if (!const_vec_duplicate_p (x, &elt)
14052 || !CONST_INT_P (elt))
14053 return false;
14054
14055 HOST_WIDE_INT val = INTVAL (elt);
14056 if (negate_p)
14057 val = -val;
14058 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14059
14060 if (val & 0xff)
14061 return IN_RANGE (val, 0, 0xff);
14062 return IN_RANGE (val, 0, 0xff00);
14063 }
14064
14065 /* Return true if X is a valid immediate operand for an SVE logical
14066 instruction such as AND. */
14067
14068 bool
14069 aarch64_sve_bitmask_immediate_p (rtx x)
14070 {
14071 rtx elt;
14072
14073 return (const_vec_duplicate_p (x, &elt)
14074 && CONST_INT_P (elt)
14075 && aarch64_bitmask_imm (INTVAL (elt),
14076 GET_MODE_INNER (GET_MODE (x))));
14077 }
14078
14079 /* Return true if X is a valid immediate for the SVE DUP and CPY
14080 instructions. */
14081
14082 bool
14083 aarch64_sve_dup_immediate_p (rtx x)
14084 {
14085 rtx elt;
14086
14087 if (!const_vec_duplicate_p (x, &elt)
14088 || !CONST_INT_P (elt))
14089 return false;
14090
14091 HOST_WIDE_INT val = INTVAL (elt);
14092 if (val & 0xff)
14093 return IN_RANGE (val, -0x80, 0x7f);
14094 return IN_RANGE (val, -0x8000, 0x7f00);
14095 }
14096
14097 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14098 SIGNED_P says whether the operand is signed rather than unsigned. */
14099
14100 bool
14101 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14102 {
14103 rtx elt;
14104
14105 return (const_vec_duplicate_p (x, &elt)
14106 && CONST_INT_P (elt)
14107 && (signed_p
14108 ? IN_RANGE (INTVAL (elt), -16, 15)
14109 : IN_RANGE (INTVAL (elt), 0, 127)));
14110 }
14111
14112 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14113 instruction. Negate X first if NEGATE_P is true. */
14114
14115 bool
14116 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14117 {
14118 rtx elt;
14119 REAL_VALUE_TYPE r;
14120
14121 if (!const_vec_duplicate_p (x, &elt)
14122 || GET_CODE (elt) != CONST_DOUBLE)
14123 return false;
14124
14125 r = *CONST_DOUBLE_REAL_VALUE (elt);
14126
14127 if (negate_p)
14128 r = real_value_negate (&r);
14129
14130 if (real_equal (&r, &dconst1))
14131 return true;
14132 if (real_equal (&r, &dconsthalf))
14133 return true;
14134 return false;
14135 }
14136
14137 /* Return true if X is a valid immediate operand for an SVE FMUL
14138 instruction. */
14139
14140 bool
14141 aarch64_sve_float_mul_immediate_p (rtx x)
14142 {
14143 rtx elt;
14144
14145 /* GCC will never generate a multiply with an immediate of 2, so there is no
14146 point testing for it (even though it is a valid constant). */
14147 return (const_vec_duplicate_p (x, &elt)
14148 && GET_CODE (elt) == CONST_DOUBLE
14149 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14150 }
14151
14152 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14153 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14154 is nonnull, use it to describe valid immediates. */
14155 static bool
14156 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14157 simd_immediate_info *info,
14158 enum simd_immediate_check which,
14159 simd_immediate_info::insn_type insn)
14160 {
14161 /* Try a 4-byte immediate with LSL. */
14162 for (unsigned int shift = 0; shift < 32; shift += 8)
14163 if ((val32 & (0xff << shift)) == val32)
14164 {
14165 if (info)
14166 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14167 simd_immediate_info::LSL, shift);
14168 return true;
14169 }
14170
14171 /* Try a 2-byte immediate with LSL. */
14172 unsigned int imm16 = val32 & 0xffff;
14173 if (imm16 == (val32 >> 16))
14174 for (unsigned int shift = 0; shift < 16; shift += 8)
14175 if ((imm16 & (0xff << shift)) == imm16)
14176 {
14177 if (info)
14178 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14179 simd_immediate_info::LSL, shift);
14180 return true;
14181 }
14182
14183 /* Try a 4-byte immediate with MSL, except for cases that MVN
14184 can handle. */
14185 if (which == AARCH64_CHECK_MOV)
14186 for (unsigned int shift = 8; shift < 24; shift += 8)
14187 {
14188 unsigned int low = (1 << shift) - 1;
14189 if (((val32 & (0xff << shift)) | low) == val32)
14190 {
14191 if (info)
14192 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14193 simd_immediate_info::MSL, shift);
14194 return true;
14195 }
14196 }
14197
14198 return false;
14199 }
14200
14201 /* Return true if replicating VAL64 is a valid immediate for the
14202 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14203 use it to describe valid immediates. */
14204 static bool
14205 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14206 simd_immediate_info *info,
14207 enum simd_immediate_check which)
14208 {
14209 unsigned int val32 = val64 & 0xffffffff;
14210 unsigned int val16 = val64 & 0xffff;
14211 unsigned int val8 = val64 & 0xff;
14212
14213 if (val32 == (val64 >> 32))
14214 {
14215 if ((which & AARCH64_CHECK_ORR) != 0
14216 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14217 simd_immediate_info::MOV))
14218 return true;
14219
14220 if ((which & AARCH64_CHECK_BIC) != 0
14221 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14222 simd_immediate_info::MVN))
14223 return true;
14224
14225 /* Try using a replicated byte. */
14226 if (which == AARCH64_CHECK_MOV
14227 && val16 == (val32 >> 16)
14228 && val8 == (val16 >> 8))
14229 {
14230 if (info)
14231 *info = simd_immediate_info (QImode, val8);
14232 return true;
14233 }
14234 }
14235
14236 /* Try using a bit-to-bytemask. */
14237 if (which == AARCH64_CHECK_MOV)
14238 {
14239 unsigned int i;
14240 for (i = 0; i < 64; i += 8)
14241 {
14242 unsigned char byte = (val64 >> i) & 0xff;
14243 if (byte != 0 && byte != 0xff)
14244 break;
14245 }
14246 if (i == 64)
14247 {
14248 if (info)
14249 *info = simd_immediate_info (DImode, val64);
14250 return true;
14251 }
14252 }
14253 return false;
14254 }
14255
14256 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14257 instruction. If INFO is nonnull, use it to describe valid immediates. */
14258
14259 static bool
14260 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14261 simd_immediate_info *info)
14262 {
14263 scalar_int_mode mode = DImode;
14264 unsigned int val32 = val64 & 0xffffffff;
14265 if (val32 == (val64 >> 32))
14266 {
14267 mode = SImode;
14268 unsigned int val16 = val32 & 0xffff;
14269 if (val16 == (val32 >> 16))
14270 {
14271 mode = HImode;
14272 unsigned int val8 = val16 & 0xff;
14273 if (val8 == (val16 >> 8))
14274 mode = QImode;
14275 }
14276 }
14277 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14278 if (IN_RANGE (val, -0x80, 0x7f))
14279 {
14280 /* DUP with no shift. */
14281 if (info)
14282 *info = simd_immediate_info (mode, val);
14283 return true;
14284 }
14285 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14286 {
14287 /* DUP with LSL #8. */
14288 if (info)
14289 *info = simd_immediate_info (mode, val);
14290 return true;
14291 }
14292 if (aarch64_bitmask_imm (val64, mode))
14293 {
14294 /* DUPM. */
14295 if (info)
14296 *info = simd_immediate_info (mode, val);
14297 return true;
14298 }
14299 return false;
14300 }
14301
14302 /* Return true if OP is a valid SIMD immediate for the operation
14303 described by WHICH. If INFO is nonnull, use it to describe valid
14304 immediates. */
14305 bool
14306 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14307 enum simd_immediate_check which)
14308 {
14309 machine_mode mode = GET_MODE (op);
14310 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14311 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14312 return false;
14313
14314 scalar_mode elt_mode = GET_MODE_INNER (mode);
14315 rtx base, step;
14316 unsigned int n_elts;
14317 if (GET_CODE (op) == CONST_VECTOR
14318 && CONST_VECTOR_DUPLICATE_P (op))
14319 n_elts = CONST_VECTOR_NPATTERNS (op);
14320 else if ((vec_flags & VEC_SVE_DATA)
14321 && const_vec_series_p (op, &base, &step))
14322 {
14323 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14324 if (!aarch64_sve_index_immediate_p (base)
14325 || !aarch64_sve_index_immediate_p (step))
14326 return false;
14327
14328 if (info)
14329 *info = simd_immediate_info (elt_mode, base, step);
14330 return true;
14331 }
14332 else if (GET_CODE (op) == CONST_VECTOR
14333 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14334 /* N_ELTS set above. */;
14335 else
14336 return false;
14337
14338 /* Handle PFALSE and PTRUE. */
14339 if (vec_flags & VEC_SVE_PRED)
14340 return (op == CONST0_RTX (mode)
14341 || op == CONSTM1_RTX (mode));
14342
14343 scalar_float_mode elt_float_mode;
14344 if (n_elts == 1
14345 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14346 {
14347 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14348 if (aarch64_float_const_zero_rtx_p (elt)
14349 || aarch64_float_const_representable_p (elt))
14350 {
14351 if (info)
14352 *info = simd_immediate_info (elt_float_mode, elt);
14353 return true;
14354 }
14355 }
14356
14357 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14358 if (elt_size > 8)
14359 return false;
14360
14361 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14362
14363 /* Expand the vector constant out into a byte vector, with the least
14364 significant byte of the register first. */
14365 auto_vec<unsigned char, 16> bytes;
14366 bytes.reserve (n_elts * elt_size);
14367 for (unsigned int i = 0; i < n_elts; i++)
14368 {
14369 /* The vector is provided in gcc endian-neutral fashion.
14370 For aarch64_be Advanced SIMD, it must be laid out in the vector
14371 register in reverse order. */
14372 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14373 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14374
14375 if (elt_mode != elt_int_mode)
14376 elt = gen_lowpart (elt_int_mode, elt);
14377
14378 if (!CONST_INT_P (elt))
14379 return false;
14380
14381 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14382 for (unsigned int byte = 0; byte < elt_size; byte++)
14383 {
14384 bytes.quick_push (elt_val & 0xff);
14385 elt_val >>= BITS_PER_UNIT;
14386 }
14387 }
14388
14389 /* The immediate must repeat every eight bytes. */
14390 unsigned int nbytes = bytes.length ();
14391 for (unsigned i = 8; i < nbytes; ++i)
14392 if (bytes[i] != bytes[i - 8])
14393 return false;
14394
14395 /* Get the repeating 8-byte value as an integer. No endian correction
14396 is needed here because bytes is already in lsb-first order. */
14397 unsigned HOST_WIDE_INT val64 = 0;
14398 for (unsigned int i = 0; i < 8; i++)
14399 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14400 << (i * BITS_PER_UNIT));
14401
14402 if (vec_flags & VEC_SVE_DATA)
14403 return aarch64_sve_valid_immediate (val64, info);
14404 else
14405 return aarch64_advsimd_valid_immediate (val64, info, which);
14406 }
14407
14408 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14409 has a step in the range of INDEX. Return the index expression if so,
14410 otherwise return null. */
14411 rtx
14412 aarch64_check_zero_based_sve_index_immediate (rtx x)
14413 {
14414 rtx base, step;
14415 if (const_vec_series_p (x, &base, &step)
14416 && base == const0_rtx
14417 && aarch64_sve_index_immediate_p (step))
14418 return step;
14419 return NULL_RTX;
14420 }
14421
14422 /* Check of immediate shift constants are within range. */
14423 bool
14424 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14425 {
14426 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14427 if (left)
14428 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14429 else
14430 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14431 }
14432
14433 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14434 operation of width WIDTH at bit position POS. */
14435
14436 rtx
14437 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14438 {
14439 gcc_assert (CONST_INT_P (width));
14440 gcc_assert (CONST_INT_P (pos));
14441
14442 unsigned HOST_WIDE_INT mask
14443 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14444 return GEN_INT (mask << UINTVAL (pos));
14445 }
14446
14447 bool
14448 aarch64_mov_operand_p (rtx x, machine_mode mode)
14449 {
14450 if (GET_CODE (x) == HIGH
14451 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14452 return true;
14453
14454 if (CONST_INT_P (x))
14455 return true;
14456
14457 if (VECTOR_MODE_P (GET_MODE (x)))
14458 return aarch64_simd_valid_immediate (x, NULL);
14459
14460 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14461 return true;
14462
14463 if (aarch64_sve_cnt_immediate_p (x))
14464 return true;
14465
14466 return aarch64_classify_symbolic_expression (x)
14467 == SYMBOL_TINY_ABSOLUTE;
14468 }
14469
14470 /* Return a const_int vector of VAL. */
14471 rtx
14472 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14473 {
14474 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14475 return gen_const_vec_duplicate (mode, c);
14476 }
14477
14478 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14479
14480 bool
14481 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14482 {
14483 machine_mode vmode;
14484
14485 vmode = aarch64_simd_container_mode (mode, 64);
14486 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14487 return aarch64_simd_valid_immediate (op_v, NULL);
14488 }
14489
14490 /* Construct and return a PARALLEL RTX vector with elements numbering the
14491 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14492 the vector - from the perspective of the architecture. This does not
14493 line up with GCC's perspective on lane numbers, so we end up with
14494 different masks depending on our target endian-ness. The diagram
14495 below may help. We must draw the distinction when building masks
14496 which select one half of the vector. An instruction selecting
14497 architectural low-lanes for a big-endian target, must be described using
14498 a mask selecting GCC high-lanes.
14499
14500 Big-Endian Little-Endian
14501
14502 GCC 0 1 2 3 3 2 1 0
14503 | x | x | x | x | | x | x | x | x |
14504 Architecture 3 2 1 0 3 2 1 0
14505
14506 Low Mask: { 2, 3 } { 0, 1 }
14507 High Mask: { 0, 1 } { 2, 3 }
14508
14509 MODE Is the mode of the vector and NUNITS is the number of units in it. */
14510
14511 rtx
14512 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14513 {
14514 rtvec v = rtvec_alloc (nunits / 2);
14515 int high_base = nunits / 2;
14516 int low_base = 0;
14517 int base;
14518 rtx t1;
14519 int i;
14520
14521 if (BYTES_BIG_ENDIAN)
14522 base = high ? low_base : high_base;
14523 else
14524 base = high ? high_base : low_base;
14525
14526 for (i = 0; i < nunits / 2; i++)
14527 RTVEC_ELT (v, i) = GEN_INT (base + i);
14528
14529 t1 = gen_rtx_PARALLEL (mode, v);
14530 return t1;
14531 }
14532
14533 /* Check OP for validity as a PARALLEL RTX vector with elements
14534 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14535 from the perspective of the architecture. See the diagram above
14536 aarch64_simd_vect_par_cnst_half for more details. */
14537
14538 bool
14539 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14540 bool high)
14541 {
14542 int nelts;
14543 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14544 return false;
14545
14546 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14547 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14548 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14549 int i = 0;
14550
14551 if (count_op != count_ideal)
14552 return false;
14553
14554 for (i = 0; i < count_ideal; i++)
14555 {
14556 rtx elt_op = XVECEXP (op, 0, i);
14557 rtx elt_ideal = XVECEXP (ideal, 0, i);
14558
14559 if (!CONST_INT_P (elt_op)
14560 || INTVAL (elt_ideal) != INTVAL (elt_op))
14561 return false;
14562 }
14563 return true;
14564 }
14565
14566 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14567 HIGH (exclusive). */
14568 void
14569 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14570 const_tree exp)
14571 {
14572 HOST_WIDE_INT lane;
14573 gcc_assert (CONST_INT_P (operand));
14574 lane = INTVAL (operand);
14575
14576 if (lane < low || lane >= high)
14577 {
14578 if (exp)
14579 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14580 else
14581 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14582 }
14583 }
14584
14585 /* Peform endian correction on lane number N, which indexes a vector
14586 of mode MODE, and return the result as an SImode rtx. */
14587
14588 rtx
14589 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14590 {
14591 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14592 }
14593
14594 /* Return TRUE if OP is a valid vector addressing mode. */
14595
14596 bool
14597 aarch64_simd_mem_operand_p (rtx op)
14598 {
14599 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14600 || REG_P (XEXP (op, 0)));
14601 }
14602
14603 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14604
14605 bool
14606 aarch64_sve_ld1r_operand_p (rtx op)
14607 {
14608 struct aarch64_address_info addr;
14609 scalar_mode mode;
14610
14611 return (MEM_P (op)
14612 && is_a <scalar_mode> (GET_MODE (op), &mode)
14613 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14614 && addr.type == ADDRESS_REG_IMM
14615 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14616 }
14617
14618 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14619 The conditions for STR are the same. */
14620 bool
14621 aarch64_sve_ldr_operand_p (rtx op)
14622 {
14623 struct aarch64_address_info addr;
14624
14625 return (MEM_P (op)
14626 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14627 false, ADDR_QUERY_ANY)
14628 && addr.type == ADDRESS_REG_IMM);
14629 }
14630
14631 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14632 We need to be able to access the individual pieces, so the range
14633 is different from LD[234] and ST[234]. */
14634 bool
14635 aarch64_sve_struct_memory_operand_p (rtx op)
14636 {
14637 if (!MEM_P (op))
14638 return false;
14639
14640 machine_mode mode = GET_MODE (op);
14641 struct aarch64_address_info addr;
14642 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14643 ADDR_QUERY_ANY)
14644 || addr.type != ADDRESS_REG_IMM)
14645 return false;
14646
14647 poly_int64 first = addr.const_offset;
14648 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14649 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14650 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14651 }
14652
14653 /* Emit a register copy from operand to operand, taking care not to
14654 early-clobber source registers in the process.
14655
14656 COUNT is the number of components into which the copy needs to be
14657 decomposed. */
14658 void
14659 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14660 unsigned int count)
14661 {
14662 unsigned int i;
14663 int rdest = REGNO (operands[0]);
14664 int rsrc = REGNO (operands[1]);
14665
14666 if (!reg_overlap_mentioned_p (operands[0], operands[1])
14667 || rdest < rsrc)
14668 for (i = 0; i < count; i++)
14669 emit_move_insn (gen_rtx_REG (mode, rdest + i),
14670 gen_rtx_REG (mode, rsrc + i));
14671 else
14672 for (i = 0; i < count; i++)
14673 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14674 gen_rtx_REG (mode, rsrc + count - i - 1));
14675 }
14676
14677 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14678 one of VSTRUCT modes: OI, CI, or XI. */
14679 int
14680 aarch64_simd_attr_length_rglist (machine_mode mode)
14681 {
14682 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14683 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14684 }
14685
14686 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14687 alignment of a vector to 128 bits. SVE predicates have an alignment of
14688 16 bits. */
14689 static HOST_WIDE_INT
14690 aarch64_simd_vector_alignment (const_tree type)
14691 {
14692 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14693 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14694 be set for non-predicate vectors of booleans. Modes are the most
14695 direct way we have of identifying real SVE predicate types. */
14696 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
14697 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
14698 return MIN (align, 128);
14699 }
14700
14701 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
14702 static poly_uint64
14703 aarch64_vectorize_preferred_vector_alignment (const_tree type)
14704 {
14705 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
14706 {
14707 /* If the length of the vector is fixed, try to align to that length,
14708 otherwise don't try to align at all. */
14709 HOST_WIDE_INT result;
14710 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
14711 result = TYPE_ALIGN (TREE_TYPE (type));
14712 return result;
14713 }
14714 return TYPE_ALIGN (type);
14715 }
14716
14717 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
14718 static bool
14719 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
14720 {
14721 if (is_packed)
14722 return false;
14723
14724 /* For fixed-length vectors, check that the vectorizer will aim for
14725 full-vector alignment. This isn't true for generic GCC vectors
14726 that are wider than the ABI maximum of 128 bits. */
14727 poly_uint64 preferred_alignment =
14728 aarch64_vectorize_preferred_vector_alignment (type);
14729 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
14730 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
14731 preferred_alignment))
14732 return false;
14733
14734 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
14735 return true;
14736 }
14737
14738 /* Return true if the vector misalignment factor is supported by the
14739 target. */
14740 static bool
14741 aarch64_builtin_support_vector_misalignment (machine_mode mode,
14742 const_tree type, int misalignment,
14743 bool is_packed)
14744 {
14745 if (TARGET_SIMD && STRICT_ALIGNMENT)
14746 {
14747 /* Return if movmisalign pattern is not supported for this mode. */
14748 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
14749 return false;
14750
14751 /* Misalignment factor is unknown at compile time. */
14752 if (misalignment == -1)
14753 return false;
14754 }
14755 return default_builtin_support_vector_misalignment (mode, type, misalignment,
14756 is_packed);
14757 }
14758
14759 /* If VALS is a vector constant that can be loaded into a register
14760 using DUP, generate instructions to do so and return an RTX to
14761 assign to the register. Otherwise return NULL_RTX. */
14762 static rtx
14763 aarch64_simd_dup_constant (rtx vals)
14764 {
14765 machine_mode mode = GET_MODE (vals);
14766 machine_mode inner_mode = GET_MODE_INNER (mode);
14767 rtx x;
14768
14769 if (!const_vec_duplicate_p (vals, &x))
14770 return NULL_RTX;
14771
14772 /* We can load this constant by using DUP and a constant in a
14773 single ARM register. This will be cheaper than a vector
14774 load. */
14775 x = copy_to_mode_reg (inner_mode, x);
14776 return gen_vec_duplicate (mode, x);
14777 }
14778
14779
14780 /* Generate code to load VALS, which is a PARALLEL containing only
14781 constants (for vec_init) or CONST_VECTOR, efficiently into a
14782 register. Returns an RTX to copy into the register, or NULL_RTX
14783 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
14784 static rtx
14785 aarch64_simd_make_constant (rtx vals)
14786 {
14787 machine_mode mode = GET_MODE (vals);
14788 rtx const_dup;
14789 rtx const_vec = NULL_RTX;
14790 int n_const = 0;
14791 int i;
14792
14793 if (GET_CODE (vals) == CONST_VECTOR)
14794 const_vec = vals;
14795 else if (GET_CODE (vals) == PARALLEL)
14796 {
14797 /* A CONST_VECTOR must contain only CONST_INTs and
14798 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
14799 Only store valid constants in a CONST_VECTOR. */
14800 int n_elts = XVECLEN (vals, 0);
14801 for (i = 0; i < n_elts; ++i)
14802 {
14803 rtx x = XVECEXP (vals, 0, i);
14804 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14805 n_const++;
14806 }
14807 if (n_const == n_elts)
14808 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
14809 }
14810 else
14811 gcc_unreachable ();
14812
14813 if (const_vec != NULL_RTX
14814 && aarch64_simd_valid_immediate (const_vec, NULL))
14815 /* Load using MOVI/MVNI. */
14816 return const_vec;
14817 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
14818 /* Loaded using DUP. */
14819 return const_dup;
14820 else if (const_vec != NULL_RTX)
14821 /* Load from constant pool. We cannot take advantage of single-cycle
14822 LD1 because we need a PC-relative addressing mode. */
14823 return const_vec;
14824 else
14825 /* A PARALLEL containing something not valid inside CONST_VECTOR.
14826 We cannot construct an initializer. */
14827 return NULL_RTX;
14828 }
14829
14830 /* Expand a vector initialisation sequence, such that TARGET is
14831 initialised to contain VALS. */
14832
14833 void
14834 aarch64_expand_vector_init (rtx target, rtx vals)
14835 {
14836 machine_mode mode = GET_MODE (target);
14837 scalar_mode inner_mode = GET_MODE_INNER (mode);
14838 /* The number of vector elements. */
14839 int n_elts = XVECLEN (vals, 0);
14840 /* The number of vector elements which are not constant. */
14841 int n_var = 0;
14842 rtx any_const = NULL_RTX;
14843 /* The first element of vals. */
14844 rtx v0 = XVECEXP (vals, 0, 0);
14845 bool all_same = true;
14846
14847 /* Count the number of variable elements to initialise. */
14848 for (int i = 0; i < n_elts; ++i)
14849 {
14850 rtx x = XVECEXP (vals, 0, i);
14851 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
14852 ++n_var;
14853 else
14854 any_const = x;
14855
14856 all_same &= rtx_equal_p (x, v0);
14857 }
14858
14859 /* No variable elements, hand off to aarch64_simd_make_constant which knows
14860 how best to handle this. */
14861 if (n_var == 0)
14862 {
14863 rtx constant = aarch64_simd_make_constant (vals);
14864 if (constant != NULL_RTX)
14865 {
14866 emit_move_insn (target, constant);
14867 return;
14868 }
14869 }
14870
14871 /* Splat a single non-constant element if we can. */
14872 if (all_same)
14873 {
14874 rtx x = copy_to_mode_reg (inner_mode, v0);
14875 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14876 return;
14877 }
14878
14879 enum insn_code icode = optab_handler (vec_set_optab, mode);
14880 gcc_assert (icode != CODE_FOR_nothing);
14881
14882 /* If there are only variable elements, try to optimize
14883 the insertion using dup for the most common element
14884 followed by insertions. */
14885
14886 /* The algorithm will fill matches[*][0] with the earliest matching element,
14887 and matches[X][1] with the count of duplicate elements (if X is the
14888 earliest element which has duplicates). */
14889
14890 if (n_var == n_elts && n_elts <= 16)
14891 {
14892 int matches[16][2] = {0};
14893 for (int i = 0; i < n_elts; i++)
14894 {
14895 for (int j = 0; j <= i; j++)
14896 {
14897 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
14898 {
14899 matches[i][0] = j;
14900 matches[j][1]++;
14901 break;
14902 }
14903 }
14904 }
14905 int maxelement = 0;
14906 int maxv = 0;
14907 for (int i = 0; i < n_elts; i++)
14908 if (matches[i][1] > maxv)
14909 {
14910 maxelement = i;
14911 maxv = matches[i][1];
14912 }
14913
14914 /* Create a duplicate of the most common element, unless all elements
14915 are equally useless to us, in which case just immediately set the
14916 vector register using the first element. */
14917
14918 if (maxv == 1)
14919 {
14920 /* For vectors of two 64-bit elements, we can do even better. */
14921 if (n_elts == 2
14922 && (inner_mode == E_DImode
14923 || inner_mode == E_DFmode))
14924
14925 {
14926 rtx x0 = XVECEXP (vals, 0, 0);
14927 rtx x1 = XVECEXP (vals, 0, 1);
14928 /* Combine can pick up this case, but handling it directly
14929 here leaves clearer RTL.
14930
14931 This is load_pair_lanes<mode>, and also gives us a clean-up
14932 for store_pair_lanes<mode>. */
14933 if (memory_operand (x0, inner_mode)
14934 && memory_operand (x1, inner_mode)
14935 && !STRICT_ALIGNMENT
14936 && rtx_equal_p (XEXP (x1, 0),
14937 plus_constant (Pmode,
14938 XEXP (x0, 0),
14939 GET_MODE_SIZE (inner_mode))))
14940 {
14941 rtx t;
14942 if (inner_mode == DFmode)
14943 t = gen_load_pair_lanesdf (target, x0, x1);
14944 else
14945 t = gen_load_pair_lanesdi (target, x0, x1);
14946 emit_insn (t);
14947 return;
14948 }
14949 }
14950 /* The subreg-move sequence below will move into lane zero of the
14951 vector register. For big-endian we want that position to hold
14952 the last element of VALS. */
14953 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14954 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14955 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14956 }
14957 else
14958 {
14959 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14960 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14961 }
14962
14963 /* Insert the rest. */
14964 for (int i = 0; i < n_elts; i++)
14965 {
14966 rtx x = XVECEXP (vals, 0, i);
14967 if (matches[i][0] == maxelement)
14968 continue;
14969 x = copy_to_mode_reg (inner_mode, x);
14970 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14971 }
14972 return;
14973 }
14974
14975 /* Initialise a vector which is part-variable. We want to first try
14976 to build those lanes which are constant in the most efficient way we
14977 can. */
14978 if (n_var != n_elts)
14979 {
14980 rtx copy = copy_rtx (vals);
14981
14982 /* Load constant part of vector. We really don't care what goes into the
14983 parts we will overwrite, but we're more likely to be able to load the
14984 constant efficiently if it has fewer, larger, repeating parts
14985 (see aarch64_simd_valid_immediate). */
14986 for (int i = 0; i < n_elts; i++)
14987 {
14988 rtx x = XVECEXP (vals, 0, i);
14989 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14990 continue;
14991 rtx subst = any_const;
14992 for (int bit = n_elts / 2; bit > 0; bit /= 2)
14993 {
14994 /* Look in the copied vector, as more elements are const. */
14995 rtx test = XVECEXP (copy, 0, i ^ bit);
14996 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14997 {
14998 subst = test;
14999 break;
15000 }
15001 }
15002 XVECEXP (copy, 0, i) = subst;
15003 }
15004 aarch64_expand_vector_init (target, copy);
15005 }
15006
15007 /* Insert the variable lanes directly. */
15008 for (int i = 0; i < n_elts; i++)
15009 {
15010 rtx x = XVECEXP (vals, 0, i);
15011 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15012 continue;
15013 x = copy_to_mode_reg (inner_mode, x);
15014 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15015 }
15016 }
15017
15018 static unsigned HOST_WIDE_INT
15019 aarch64_shift_truncation_mask (machine_mode mode)
15020 {
15021 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15022 return 0;
15023 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15024 }
15025
15026 /* Select a format to encode pointers in exception handling data. */
15027 int
15028 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15029 {
15030 int type;
15031 switch (aarch64_cmodel)
15032 {
15033 case AARCH64_CMODEL_TINY:
15034 case AARCH64_CMODEL_TINY_PIC:
15035 case AARCH64_CMODEL_SMALL:
15036 case AARCH64_CMODEL_SMALL_PIC:
15037 case AARCH64_CMODEL_SMALL_SPIC:
15038 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15039 for everything. */
15040 type = DW_EH_PE_sdata4;
15041 break;
15042 default:
15043 /* No assumptions here. 8-byte relocs required. */
15044 type = DW_EH_PE_sdata8;
15045 break;
15046 }
15047 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15048 }
15049
15050 /* The last .arch and .tune assembly strings that we printed. */
15051 static std::string aarch64_last_printed_arch_string;
15052 static std::string aarch64_last_printed_tune_string;
15053
15054 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15055 by the function fndecl. */
15056
15057 void
15058 aarch64_declare_function_name (FILE *stream, const char* name,
15059 tree fndecl)
15060 {
15061 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15062
15063 struct cl_target_option *targ_options;
15064 if (target_parts)
15065 targ_options = TREE_TARGET_OPTION (target_parts);
15066 else
15067 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15068 gcc_assert (targ_options);
15069
15070 const struct processor *this_arch
15071 = aarch64_get_arch (targ_options->x_explicit_arch);
15072
15073 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
15074 std::string extension
15075 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15076 this_arch->flags);
15077 /* Only update the assembler .arch string if it is distinct from the last
15078 such string we printed. */
15079 std::string to_print = this_arch->name + extension;
15080 if (to_print != aarch64_last_printed_arch_string)
15081 {
15082 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15083 aarch64_last_printed_arch_string = to_print;
15084 }
15085
15086 /* Print the cpu name we're tuning for in the comments, might be
15087 useful to readers of the generated asm. Do it only when it changes
15088 from function to function and verbose assembly is requested. */
15089 const struct processor *this_tune
15090 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15091
15092 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15093 {
15094 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15095 this_tune->name);
15096 aarch64_last_printed_tune_string = this_tune->name;
15097 }
15098
15099 /* Don't forget the type directive for ELF. */
15100 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15101 ASM_OUTPUT_LABEL (stream, name);
15102 }
15103
15104 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15105
15106 static void
15107 aarch64_start_file (void)
15108 {
15109 struct cl_target_option *default_options
15110 = TREE_TARGET_OPTION (target_option_default_node);
15111
15112 const struct processor *default_arch
15113 = aarch64_get_arch (default_options->x_explicit_arch);
15114 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
15115 std::string extension
15116 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15117 default_arch->flags);
15118
15119 aarch64_last_printed_arch_string = default_arch->name + extension;
15120 aarch64_last_printed_tune_string = "";
15121 asm_fprintf (asm_out_file, "\t.arch %s\n",
15122 aarch64_last_printed_arch_string.c_str ());
15123
15124 default_file_start ();
15125 }
15126
15127 /* Emit load exclusive. */
15128
15129 static void
15130 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15131 rtx mem, rtx model_rtx)
15132 {
15133 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15134 }
15135
15136 /* Emit store exclusive. */
15137
15138 static void
15139 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15140 rtx rval, rtx mem, rtx model_rtx)
15141 {
15142 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
15143 }
15144
15145 /* Mark the previous jump instruction as unlikely. */
15146
15147 static void
15148 aarch64_emit_unlikely_jump (rtx insn)
15149 {
15150 rtx_insn *jump = emit_jump_insn (insn);
15151 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15152 }
15153
15154 /* Expand a compare and swap pattern. */
15155
15156 void
15157 aarch64_expand_compare_and_swap (rtx operands[])
15158 {
15159 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15160 machine_mode mode, r_mode;
15161
15162 bval = operands[0];
15163 rval = operands[1];
15164 mem = operands[2];
15165 oldval = operands[3];
15166 newval = operands[4];
15167 is_weak = operands[5];
15168 mod_s = operands[6];
15169 mod_f = operands[7];
15170 mode = GET_MODE (mem);
15171
15172 /* Normally the succ memory model must be stronger than fail, but in the
15173 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15174 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15175 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15176 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15177 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15178
15179 r_mode = mode;
15180 if (mode == QImode || mode == HImode)
15181 {
15182 r_mode = SImode;
15183 rval = gen_reg_rtx (r_mode);
15184 }
15185
15186 if (TARGET_LSE)
15187 {
15188 /* The CAS insn requires oldval and rval overlap, but we need to
15189 have a copy of oldval saved across the operation to tell if
15190 the operation is successful. */
15191 if (reg_overlap_mentioned_p (rval, oldval))
15192 rval = copy_to_mode_reg (r_mode, oldval);
15193 else
15194 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15195
15196 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15197 newval, mod_s));
15198 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15199 }
15200 else
15201 {
15202 /* The oldval predicate varies by mode. Test it and force to reg. */
15203 insn_code code = code_for_aarch64_compare_and_swap (mode);
15204 if (!insn_data[code].operand[2].predicate (oldval, mode))
15205 oldval = force_reg (mode, oldval);
15206
15207 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15208 is_weak, mod_s, mod_f));
15209 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15210 }
15211
15212 if (r_mode != mode)
15213 rval = gen_lowpart (mode, rval);
15214 emit_move_insn (operands[1], rval);
15215
15216 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15217 emit_insn (gen_rtx_SET (bval, x));
15218 }
15219
15220 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15221 sequence implementing an atomic operation. */
15222
15223 static void
15224 aarch64_emit_post_barrier (enum memmodel model)
15225 {
15226 const enum memmodel base_model = memmodel_base (model);
15227
15228 if (is_mm_sync (model)
15229 && (base_model == MEMMODEL_ACQUIRE
15230 || base_model == MEMMODEL_ACQ_REL
15231 || base_model == MEMMODEL_SEQ_CST))
15232 {
15233 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15234 }
15235 }
15236
15237 /* Split a compare and swap pattern. */
15238
15239 void
15240 aarch64_split_compare_and_swap (rtx operands[])
15241 {
15242 rtx rval, mem, oldval, newval, scratch;
15243 machine_mode mode;
15244 bool is_weak;
15245 rtx_code_label *label1, *label2;
15246 rtx x, cond;
15247 enum memmodel model;
15248 rtx model_rtx;
15249
15250 rval = operands[0];
15251 mem = operands[1];
15252 oldval = operands[2];
15253 newval = operands[3];
15254 is_weak = (operands[4] != const0_rtx);
15255 model_rtx = operands[5];
15256 scratch = operands[7];
15257 mode = GET_MODE (mem);
15258 model = memmodel_from_int (INTVAL (model_rtx));
15259
15260 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15261 loop:
15262 .label1:
15263 LD[A]XR rval, [mem]
15264 CBNZ rval, .label2
15265 ST[L]XR scratch, newval, [mem]
15266 CBNZ scratch, .label1
15267 .label2:
15268 CMP rval, 0. */
15269 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15270
15271 label1 = NULL;
15272 if (!is_weak)
15273 {
15274 label1 = gen_label_rtx ();
15275 emit_label (label1);
15276 }
15277 label2 = gen_label_rtx ();
15278
15279 /* The initial load can be relaxed for a __sync operation since a final
15280 barrier will be emitted to stop code hoisting. */
15281 if (is_mm_sync (model))
15282 aarch64_emit_load_exclusive (mode, rval, mem,
15283 GEN_INT (MEMMODEL_RELAXED));
15284 else
15285 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15286
15287 if (strong_zero_p)
15288 {
15289 if (aarch64_track_speculation)
15290 {
15291 /* Emit an explicit compare instruction, so that we can correctly
15292 track the condition codes. */
15293 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15294 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15295 }
15296 else
15297 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15298
15299 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15300 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15301 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15302 }
15303 else
15304 {
15305 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15306 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15307 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15308 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15309 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15310 }
15311
15312 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15313
15314 if (!is_weak)
15315 {
15316 if (aarch64_track_speculation)
15317 {
15318 /* Emit an explicit compare instruction, so that we can correctly
15319 track the condition codes. */
15320 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15321 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15322 }
15323 else
15324 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15325
15326 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15327 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15328 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15329 }
15330 else
15331 {
15332 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15333 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15334 emit_insn (gen_rtx_SET (cond, x));
15335 }
15336
15337 emit_label (label2);
15338 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
15339 to set the condition flags. If this is not used it will be removed by
15340 later passes. */
15341 if (strong_zero_p)
15342 {
15343 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15344 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15345 emit_insn (gen_rtx_SET (cond, x));
15346 }
15347 /* Emit any final barrier needed for a __sync operation. */
15348 if (is_mm_sync (model))
15349 aarch64_emit_post_barrier (model);
15350 }
15351
15352 /* Split an atomic operation. */
15353
15354 void
15355 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
15356 rtx value, rtx model_rtx, rtx cond)
15357 {
15358 machine_mode mode = GET_MODE (mem);
15359 machine_mode wmode = (mode == DImode ? DImode : SImode);
15360 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
15361 const bool is_sync = is_mm_sync (model);
15362 rtx_code_label *label;
15363 rtx x;
15364
15365 /* Split the atomic operation into a sequence. */
15366 label = gen_label_rtx ();
15367 emit_label (label);
15368
15369 if (new_out)
15370 new_out = gen_lowpart (wmode, new_out);
15371 if (old_out)
15372 old_out = gen_lowpart (wmode, old_out);
15373 else
15374 old_out = new_out;
15375 value = simplify_gen_subreg (wmode, value, mode, 0);
15376
15377 /* The initial load can be relaxed for a __sync operation since a final
15378 barrier will be emitted to stop code hoisting. */
15379 if (is_sync)
15380 aarch64_emit_load_exclusive (mode, old_out, mem,
15381 GEN_INT (MEMMODEL_RELAXED));
15382 else
15383 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
15384
15385 switch (code)
15386 {
15387 case SET:
15388 new_out = value;
15389 break;
15390
15391 case NOT:
15392 x = gen_rtx_AND (wmode, old_out, value);
15393 emit_insn (gen_rtx_SET (new_out, x));
15394 x = gen_rtx_NOT (wmode, new_out);
15395 emit_insn (gen_rtx_SET (new_out, x));
15396 break;
15397
15398 case MINUS:
15399 if (CONST_INT_P (value))
15400 {
15401 value = GEN_INT (-INTVAL (value));
15402 code = PLUS;
15403 }
15404 /* Fall through. */
15405
15406 default:
15407 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
15408 emit_insn (gen_rtx_SET (new_out, x));
15409 break;
15410 }
15411
15412 aarch64_emit_store_exclusive (mode, cond, mem,
15413 gen_lowpart (mode, new_out), model_rtx);
15414
15415 if (aarch64_track_speculation)
15416 {
15417 /* Emit an explicit compare instruction, so that we can correctly
15418 track the condition codes. */
15419 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
15420 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15421 }
15422 else
15423 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15424
15425 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15426 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
15427 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15428
15429 /* Emit any final barrier needed for a __sync operation. */
15430 if (is_sync)
15431 aarch64_emit_post_barrier (model);
15432 }
15433
15434 static void
15435 aarch64_init_libfuncs (void)
15436 {
15437 /* Half-precision float operations. The compiler handles all operations
15438 with NULL libfuncs by converting to SFmode. */
15439
15440 /* Conversions. */
15441 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
15442 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
15443
15444 /* Arithmetic. */
15445 set_optab_libfunc (add_optab, HFmode, NULL);
15446 set_optab_libfunc (sdiv_optab, HFmode, NULL);
15447 set_optab_libfunc (smul_optab, HFmode, NULL);
15448 set_optab_libfunc (neg_optab, HFmode, NULL);
15449 set_optab_libfunc (sub_optab, HFmode, NULL);
15450
15451 /* Comparisons. */
15452 set_optab_libfunc (eq_optab, HFmode, NULL);
15453 set_optab_libfunc (ne_optab, HFmode, NULL);
15454 set_optab_libfunc (lt_optab, HFmode, NULL);
15455 set_optab_libfunc (le_optab, HFmode, NULL);
15456 set_optab_libfunc (ge_optab, HFmode, NULL);
15457 set_optab_libfunc (gt_optab, HFmode, NULL);
15458 set_optab_libfunc (unord_optab, HFmode, NULL);
15459 }
15460
15461 /* Target hook for c_mode_for_suffix. */
15462 static machine_mode
15463 aarch64_c_mode_for_suffix (char suffix)
15464 {
15465 if (suffix == 'q')
15466 return TFmode;
15467
15468 return VOIDmode;
15469 }
15470
15471 /* We can only represent floating point constants which will fit in
15472 "quarter-precision" values. These values are characterised by
15473 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
15474 by:
15475
15476 (-1)^s * (n/16) * 2^r
15477
15478 Where:
15479 's' is the sign bit.
15480 'n' is an integer in the range 16 <= n <= 31.
15481 'r' is an integer in the range -3 <= r <= 4. */
15482
15483 /* Return true iff X can be represented by a quarter-precision
15484 floating point immediate operand X. Note, we cannot represent 0.0. */
15485 bool
15486 aarch64_float_const_representable_p (rtx x)
15487 {
15488 /* This represents our current view of how many bits
15489 make up the mantissa. */
15490 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
15491 int exponent;
15492 unsigned HOST_WIDE_INT mantissa, mask;
15493 REAL_VALUE_TYPE r, m;
15494 bool fail;
15495
15496 if (!CONST_DOUBLE_P (x))
15497 return false;
15498
15499 if (GET_MODE (x) == VOIDmode
15500 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
15501 return false;
15502
15503 r = *CONST_DOUBLE_REAL_VALUE (x);
15504
15505 /* We cannot represent infinities, NaNs or +/-zero. We won't
15506 know if we have +zero until we analyse the mantissa, but we
15507 can reject the other invalid values. */
15508 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
15509 || REAL_VALUE_MINUS_ZERO (r))
15510 return false;
15511
15512 /* Extract exponent. */
15513 r = real_value_abs (&r);
15514 exponent = REAL_EXP (&r);
15515
15516 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
15517 highest (sign) bit, with a fixed binary point at bit point_pos.
15518 m1 holds the low part of the mantissa, m2 the high part.
15519 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
15520 bits for the mantissa, this can fail (low bits will be lost). */
15521 real_ldexp (&m, &r, point_pos - exponent);
15522 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
15523
15524 /* If the low part of the mantissa has bits set we cannot represent
15525 the value. */
15526 if (w.ulow () != 0)
15527 return false;
15528 /* We have rejected the lower HOST_WIDE_INT, so update our
15529 understanding of how many bits lie in the mantissa and
15530 look only at the high HOST_WIDE_INT. */
15531 mantissa = w.elt (1);
15532 point_pos -= HOST_BITS_PER_WIDE_INT;
15533
15534 /* We can only represent values with a mantissa of the form 1.xxxx. */
15535 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
15536 if ((mantissa & mask) != 0)
15537 return false;
15538
15539 /* Having filtered unrepresentable values, we may now remove all
15540 but the highest 5 bits. */
15541 mantissa >>= point_pos - 5;
15542
15543 /* We cannot represent the value 0.0, so reject it. This is handled
15544 elsewhere. */
15545 if (mantissa == 0)
15546 return false;
15547
15548 /* Then, as bit 4 is always set, we can mask it off, leaving
15549 the mantissa in the range [0, 15]. */
15550 mantissa &= ~(1 << 4);
15551 gcc_assert (mantissa <= 15);
15552
15553 /* GCC internally does not use IEEE754-like encoding (where normalized
15554 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
15555 Our mantissa values are shifted 4 places to the left relative to
15556 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
15557 by 5 places to correct for GCC's representation. */
15558 exponent = 5 - exponent;
15559
15560 return (exponent >= 0 && exponent <= 7);
15561 }
15562
15563 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
15564 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
15565 output MOVI/MVNI, ORR or BIC immediate. */
15566 char*
15567 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
15568 enum simd_immediate_check which)
15569 {
15570 bool is_valid;
15571 static char templ[40];
15572 const char *mnemonic;
15573 const char *shift_op;
15574 unsigned int lane_count = 0;
15575 char element_char;
15576
15577 struct simd_immediate_info info;
15578
15579 /* This will return true to show const_vector is legal for use as either
15580 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
15581 It will also update INFO to show how the immediate should be generated.
15582 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
15583 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
15584 gcc_assert (is_valid);
15585
15586 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15587 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
15588
15589 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15590 {
15591 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15592 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15593 move immediate path. */
15594 if (aarch64_float_const_zero_rtx_p (info.value))
15595 info.value = GEN_INT (0);
15596 else
15597 {
15598 const unsigned int buf_size = 20;
15599 char float_buf[buf_size] = {'\0'};
15600 real_to_decimal_for_mode (float_buf,
15601 CONST_DOUBLE_REAL_VALUE (info.value),
15602 buf_size, buf_size, 1, info.elt_mode);
15603
15604 if (lane_count == 1)
15605 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15606 else
15607 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15608 lane_count, element_char, float_buf);
15609 return templ;
15610 }
15611 }
15612
15613 gcc_assert (CONST_INT_P (info.value));
15614
15615 if (which == AARCH64_CHECK_MOV)
15616 {
15617 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15618 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15619 if (lane_count == 1)
15620 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15621 mnemonic, UINTVAL (info.value));
15622 else if (info.shift)
15623 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15624 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15625 element_char, UINTVAL (info.value), shift_op, info.shift);
15626 else
15627 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15628 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15629 element_char, UINTVAL (info.value));
15630 }
15631 else
15632 {
15633 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15634 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15635 if (info.shift)
15636 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15637 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15638 element_char, UINTVAL (info.value), "lsl", info.shift);
15639 else
15640 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15641 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15642 element_char, UINTVAL (info.value));
15643 }
15644 return templ;
15645 }
15646
15647 char*
15648 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15649 {
15650
15651 /* If a floating point number was passed and we desire to use it in an
15652 integer mode do the conversion to integer. */
15653 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15654 {
15655 unsigned HOST_WIDE_INT ival;
15656 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15657 gcc_unreachable ();
15658 immediate = gen_int_mode (ival, mode);
15659 }
15660
15661 machine_mode vmode;
15662 /* use a 64 bit mode for everything except for DI/DF mode, where we use
15663 a 128 bit vector mode. */
15664 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15665
15666 vmode = aarch64_simd_container_mode (mode, width);
15667 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15668 return aarch64_output_simd_mov_immediate (v_op, width);
15669 }
15670
15671 /* Return the output string to use for moving immediate CONST_VECTOR
15672 into an SVE register. */
15673
15674 char *
15675 aarch64_output_sve_mov_immediate (rtx const_vector)
15676 {
15677 static char templ[40];
15678 struct simd_immediate_info info;
15679 char element_char;
15680
15681 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15682 gcc_assert (is_valid);
15683
15684 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15685
15686 if (info.step)
15687 {
15688 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15689 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15690 element_char, INTVAL (info.value), INTVAL (info.step));
15691 return templ;
15692 }
15693
15694 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15695 {
15696 if (aarch64_float_const_zero_rtx_p (info.value))
15697 info.value = GEN_INT (0);
15698 else
15699 {
15700 const int buf_size = 20;
15701 char float_buf[buf_size] = {};
15702 real_to_decimal_for_mode (float_buf,
15703 CONST_DOUBLE_REAL_VALUE (info.value),
15704 buf_size, buf_size, 1, info.elt_mode);
15705
15706 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15707 element_char, float_buf);
15708 return templ;
15709 }
15710 }
15711
15712 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15713 element_char, INTVAL (info.value));
15714 return templ;
15715 }
15716
15717 /* Return the asm format for a PTRUE instruction whose destination has
15718 mode MODE. SUFFIX is the element size suffix. */
15719
15720 char *
15721 aarch64_output_ptrue (machine_mode mode, char suffix)
15722 {
15723 unsigned int nunits;
15724 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15725 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15726 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15727 else
15728 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15729 return buf;
15730 }
15731
15732 /* Split operands into moves from op[1] + op[2] into op[0]. */
15733
15734 void
15735 aarch64_split_combinev16qi (rtx operands[3])
15736 {
15737 unsigned int dest = REGNO (operands[0]);
15738 unsigned int src1 = REGNO (operands[1]);
15739 unsigned int src2 = REGNO (operands[2]);
15740 machine_mode halfmode = GET_MODE (operands[1]);
15741 unsigned int halfregs = REG_NREGS (operands[1]);
15742 rtx destlo, desthi;
15743
15744 gcc_assert (halfmode == V16QImode);
15745
15746 if (src1 == dest && src2 == dest + halfregs)
15747 {
15748 /* No-op move. Can't split to nothing; emit something. */
15749 emit_note (NOTE_INSN_DELETED);
15750 return;
15751 }
15752
15753 /* Preserve register attributes for variable tracking. */
15754 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15755 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15756 GET_MODE_SIZE (halfmode));
15757
15758 /* Special case of reversed high/low parts. */
15759 if (reg_overlap_mentioned_p (operands[2], destlo)
15760 && reg_overlap_mentioned_p (operands[1], desthi))
15761 {
15762 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15763 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15764 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15765 }
15766 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15767 {
15768 /* Try to avoid unnecessary moves if part of the result
15769 is in the right place already. */
15770 if (src1 != dest)
15771 emit_move_insn (destlo, operands[1]);
15772 if (src2 != dest + halfregs)
15773 emit_move_insn (desthi, operands[2]);
15774 }
15775 else
15776 {
15777 if (src2 != dest + halfregs)
15778 emit_move_insn (desthi, operands[2]);
15779 if (src1 != dest)
15780 emit_move_insn (destlo, operands[1]);
15781 }
15782 }
15783
15784 /* vec_perm support. */
15785
15786 struct expand_vec_perm_d
15787 {
15788 rtx target, op0, op1;
15789 vec_perm_indices perm;
15790 machine_mode vmode;
15791 unsigned int vec_flags;
15792 bool one_vector_p;
15793 bool testing_p;
15794 };
15795
15796 /* Generate a variable permutation. */
15797
15798 static void
15799 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15800 {
15801 machine_mode vmode = GET_MODE (target);
15802 bool one_vector_p = rtx_equal_p (op0, op1);
15803
15804 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15805 gcc_checking_assert (GET_MODE (op0) == vmode);
15806 gcc_checking_assert (GET_MODE (op1) == vmode);
15807 gcc_checking_assert (GET_MODE (sel) == vmode);
15808 gcc_checking_assert (TARGET_SIMD);
15809
15810 if (one_vector_p)
15811 {
15812 if (vmode == V8QImode)
15813 {
15814 /* Expand the argument to a V16QI mode by duplicating it. */
15815 rtx pair = gen_reg_rtx (V16QImode);
15816 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15817 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15818 }
15819 else
15820 {
15821 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15822 }
15823 }
15824 else
15825 {
15826 rtx pair;
15827
15828 if (vmode == V8QImode)
15829 {
15830 pair = gen_reg_rtx (V16QImode);
15831 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15832 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15833 }
15834 else
15835 {
15836 pair = gen_reg_rtx (OImode);
15837 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15838 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15839 }
15840 }
15841 }
15842
15843 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15844 NELT is the number of elements in the vector. */
15845
15846 void
15847 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15848 unsigned int nelt)
15849 {
15850 machine_mode vmode = GET_MODE (target);
15851 bool one_vector_p = rtx_equal_p (op0, op1);
15852 rtx mask;
15853
15854 /* The TBL instruction does not use a modulo index, so we must take care
15855 of that ourselves. */
15856 mask = aarch64_simd_gen_const_vector_dup (vmode,
15857 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15858 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15859
15860 /* For big-endian, we also need to reverse the index within the vector
15861 (but not which vector). */
15862 if (BYTES_BIG_ENDIAN)
15863 {
15864 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15865 if (!one_vector_p)
15866 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15867 sel = expand_simple_binop (vmode, XOR, sel, mask,
15868 NULL, 0, OPTAB_LIB_WIDEN);
15869 }
15870 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15871 }
15872
15873 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15874
15875 static void
15876 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15877 {
15878 emit_insn (gen_rtx_SET (target,
15879 gen_rtx_UNSPEC (GET_MODE (target),
15880 gen_rtvec (2, op0, op1), code)));
15881 }
15882
15883 /* Expand an SVE vec_perm with the given operands. */
15884
15885 void
15886 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15887 {
15888 machine_mode data_mode = GET_MODE (target);
15889 machine_mode sel_mode = GET_MODE (sel);
15890 /* Enforced by the pattern condition. */
15891 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15892
15893 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15894 size of the two value vectors, i.e. the upper bits of the indices
15895 are effectively ignored. SVE TBL instead produces 0 for any
15896 out-of-range indices, so we need to modulo all the vec_perm indices
15897 to ensure they are all in range. */
15898 rtx sel_reg = force_reg (sel_mode, sel);
15899
15900 /* Check if the sel only references the first values vector. */
15901 if (GET_CODE (sel) == CONST_VECTOR
15902 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15903 {
15904 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15905 return;
15906 }
15907
15908 /* Check if the two values vectors are the same. */
15909 if (rtx_equal_p (op0, op1))
15910 {
15911 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15912 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15913 NULL, 0, OPTAB_DIRECT);
15914 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15915 return;
15916 }
15917
15918 /* Run TBL on for each value vector and combine the results. */
15919
15920 rtx res0 = gen_reg_rtx (data_mode);
15921 rtx res1 = gen_reg_rtx (data_mode);
15922 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15923 if (GET_CODE (sel) != CONST_VECTOR
15924 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15925 {
15926 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15927 2 * nunits - 1);
15928 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15929 NULL, 0, OPTAB_DIRECT);
15930 }
15931 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15932 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15933 NULL, 0, OPTAB_DIRECT);
15934 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15935 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15936 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15937 else
15938 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15939 }
15940
15941 /* Recognize patterns suitable for the TRN instructions. */
15942 static bool
15943 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15944 {
15945 HOST_WIDE_INT odd;
15946 poly_uint64 nelt = d->perm.length ();
15947 rtx out, in0, in1, x;
15948 machine_mode vmode = d->vmode;
15949
15950 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15951 return false;
15952
15953 /* Note that these are little-endian tests.
15954 We correct for big-endian later. */
15955 if (!d->perm[0].is_constant (&odd)
15956 || (odd != 0 && odd != 1)
15957 || !d->perm.series_p (0, 2, odd, 2)
15958 || !d->perm.series_p (1, 2, nelt + odd, 2))
15959 return false;
15960
15961 /* Success! */
15962 if (d->testing_p)
15963 return true;
15964
15965 in0 = d->op0;
15966 in1 = d->op1;
15967 /* We don't need a big-endian lane correction for SVE; see the comment
15968 at the head of aarch64-sve.md for details. */
15969 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15970 {
15971 x = in0, in0 = in1, in1 = x;
15972 odd = !odd;
15973 }
15974 out = d->target;
15975
15976 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15977 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15978 return true;
15979 }
15980
15981 /* Recognize patterns suitable for the UZP instructions. */
15982 static bool
15983 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15984 {
15985 HOST_WIDE_INT odd;
15986 rtx out, in0, in1, x;
15987 machine_mode vmode = d->vmode;
15988
15989 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15990 return false;
15991
15992 /* Note that these are little-endian tests.
15993 We correct for big-endian later. */
15994 if (!d->perm[0].is_constant (&odd)
15995 || (odd != 0 && odd != 1)
15996 || !d->perm.series_p (0, 1, odd, 2))
15997 return false;
15998
15999 /* Success! */
16000 if (d->testing_p)
16001 return true;
16002
16003 in0 = d->op0;
16004 in1 = d->op1;
16005 /* We don't need a big-endian lane correction for SVE; see the comment
16006 at the head of aarch64-sve.md for details. */
16007 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16008 {
16009 x = in0, in0 = in1, in1 = x;
16010 odd = !odd;
16011 }
16012 out = d->target;
16013
16014 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16015 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16016 return true;
16017 }
16018
16019 /* Recognize patterns suitable for the ZIP instructions. */
16020 static bool
16021 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16022 {
16023 unsigned int high;
16024 poly_uint64 nelt = d->perm.length ();
16025 rtx out, in0, in1, x;
16026 machine_mode vmode = d->vmode;
16027
16028 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16029 return false;
16030
16031 /* Note that these are little-endian tests.
16032 We correct for big-endian later. */
16033 poly_uint64 first = d->perm[0];
16034 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16035 || !d->perm.series_p (0, 2, first, 1)
16036 || !d->perm.series_p (1, 2, first + nelt, 1))
16037 return false;
16038 high = maybe_ne (first, 0U);
16039
16040 /* Success! */
16041 if (d->testing_p)
16042 return true;
16043
16044 in0 = d->op0;
16045 in1 = d->op1;
16046 /* We don't need a big-endian lane correction for SVE; see the comment
16047 at the head of aarch64-sve.md for details. */
16048 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16049 {
16050 x = in0, in0 = in1, in1 = x;
16051 high = !high;
16052 }
16053 out = d->target;
16054
16055 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16056 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16057 return true;
16058 }
16059
16060 /* Recognize patterns for the EXT insn. */
16061
16062 static bool
16063 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16064 {
16065 HOST_WIDE_INT location;
16066 rtx offset;
16067
16068 /* The first element always refers to the first vector.
16069 Check if the extracted indices are increasing by one. */
16070 if (d->vec_flags == VEC_SVE_PRED
16071 || !d->perm[0].is_constant (&location)
16072 || !d->perm.series_p (0, 1, location, 1))
16073 return false;
16074
16075 /* Success! */
16076 if (d->testing_p)
16077 return true;
16078
16079 /* The case where (location == 0) is a no-op for both big- and little-endian,
16080 and is removed by the mid-end at optimization levels -O1 and higher.
16081
16082 We don't need a big-endian lane correction for SVE; see the comment
16083 at the head of aarch64-sve.md for details. */
16084 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16085 {
16086 /* After setup, we want the high elements of the first vector (stored
16087 at the LSB end of the register), and the low elements of the second
16088 vector (stored at the MSB end of the register). So swap. */
16089 std::swap (d->op0, d->op1);
16090 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16091 to_constant () is safe since this is restricted to Advanced SIMD
16092 vectors. */
16093 location = d->perm.length ().to_constant () - location;
16094 }
16095
16096 offset = GEN_INT (location);
16097 emit_set_insn (d->target,
16098 gen_rtx_UNSPEC (d->vmode,
16099 gen_rtvec (3, d->op0, d->op1, offset),
16100 UNSPEC_EXT));
16101 return true;
16102 }
16103
16104 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16105 within each 64-bit, 32-bit or 16-bit granule. */
16106
16107 static bool
16108 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16109 {
16110 HOST_WIDE_INT diff;
16111 unsigned int i, size, unspec;
16112 machine_mode pred_mode;
16113
16114 if (d->vec_flags == VEC_SVE_PRED
16115 || !d->one_vector_p
16116 || !d->perm[0].is_constant (&diff))
16117 return false;
16118
16119 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16120 if (size == 8)
16121 {
16122 unspec = UNSPEC_REV64;
16123 pred_mode = VNx2BImode;
16124 }
16125 else if (size == 4)
16126 {
16127 unspec = UNSPEC_REV32;
16128 pred_mode = VNx4BImode;
16129 }
16130 else if (size == 2)
16131 {
16132 unspec = UNSPEC_REV16;
16133 pred_mode = VNx8BImode;
16134 }
16135 else
16136 return false;
16137
16138 unsigned int step = diff + 1;
16139 for (i = 0; i < step; ++i)
16140 if (!d->perm.series_p (i, step, diff - i, step))
16141 return false;
16142
16143 /* Success! */
16144 if (d->testing_p)
16145 return true;
16146
16147 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16148 if (d->vec_flags == VEC_SVE_DATA)
16149 {
16150 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16151 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16152 UNSPEC_MERGE_PTRUE);
16153 }
16154 emit_set_insn (d->target, src);
16155 return true;
16156 }
16157
16158 /* Recognize patterns for the REV insn, which reverses elements within
16159 a full vector. */
16160
16161 static bool
16162 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16163 {
16164 poly_uint64 nelt = d->perm.length ();
16165
16166 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16167 return false;
16168
16169 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16170 return false;
16171
16172 /* Success! */
16173 if (d->testing_p)
16174 return true;
16175
16176 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16177 emit_set_insn (d->target, src);
16178 return true;
16179 }
16180
16181 static bool
16182 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16183 {
16184 rtx out = d->target;
16185 rtx in0;
16186 HOST_WIDE_INT elt;
16187 machine_mode vmode = d->vmode;
16188 rtx lane;
16189
16190 if (d->vec_flags == VEC_SVE_PRED
16191 || d->perm.encoding ().encoded_nelts () != 1
16192 || !d->perm[0].is_constant (&elt))
16193 return false;
16194
16195 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16196 return false;
16197
16198 /* Success! */
16199 if (d->testing_p)
16200 return true;
16201
16202 /* The generic preparation in aarch64_expand_vec_perm_const_1
16203 swaps the operand order and the permute indices if it finds
16204 d->perm[0] to be in the second operand. Thus, we can always
16205 use d->op0 and need not do any extra arithmetic to get the
16206 correct lane number. */
16207 in0 = d->op0;
16208 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16209
16210 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16211 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16212 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16213 return true;
16214 }
16215
16216 static bool
16217 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16218 {
16219 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16220 machine_mode vmode = d->vmode;
16221
16222 /* Make sure that the indices are constant. */
16223 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16224 for (unsigned int i = 0; i < encoded_nelts; ++i)
16225 if (!d->perm[i].is_constant ())
16226 return false;
16227
16228 if (d->testing_p)
16229 return true;
16230
16231 /* Generic code will try constant permutation twice. Once with the
16232 original mode and again with the elements lowered to QImode.
16233 So wait and don't do the selector expansion ourselves. */
16234 if (vmode != V8QImode && vmode != V16QImode)
16235 return false;
16236
16237 /* to_constant is safe since this routine is specific to Advanced SIMD
16238 vectors. */
16239 unsigned int nelt = d->perm.length ().to_constant ();
16240 for (unsigned int i = 0; i < nelt; ++i)
16241 /* If big-endian and two vectors we end up with a weird mixed-endian
16242 mode on NEON. Reverse the index within each word but not the word
16243 itself. to_constant is safe because we checked is_constant above. */
16244 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16245 ? d->perm[i].to_constant () ^ (nelt - 1)
16246 : d->perm[i].to_constant ());
16247
16248 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16249 sel = force_reg (vmode, sel);
16250
16251 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16252 return true;
16253 }
16254
16255 /* Try to implement D using an SVE TBL instruction. */
16256
16257 static bool
16258 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16259 {
16260 unsigned HOST_WIDE_INT nelt;
16261
16262 /* Permuting two variable-length vectors could overflow the
16263 index range. */
16264 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16265 return false;
16266
16267 if (d->testing_p)
16268 return true;
16269
16270 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16271 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16272 if (d->one_vector_p)
16273 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16274 else
16275 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16276 return true;
16277 }
16278
16279 static bool
16280 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16281 {
16282 /* The pattern matching functions above are written to look for a small
16283 number to begin the sequence (0, 1, N/2). If we begin with an index
16284 from the second operand, we can swap the operands. */
16285 poly_int64 nelt = d->perm.length ();
16286 if (known_ge (d->perm[0], nelt))
16287 {
16288 d->perm.rotate_inputs (1);
16289 std::swap (d->op0, d->op1);
16290 }
16291
16292 if ((d->vec_flags == VEC_ADVSIMD
16293 || d->vec_flags == VEC_SVE_DATA
16294 || d->vec_flags == VEC_SVE_PRED)
16295 && known_gt (nelt, 1))
16296 {
16297 if (aarch64_evpc_rev_local (d))
16298 return true;
16299 else if (aarch64_evpc_rev_global (d))
16300 return true;
16301 else if (aarch64_evpc_ext (d))
16302 return true;
16303 else if (aarch64_evpc_dup (d))
16304 return true;
16305 else if (aarch64_evpc_zip (d))
16306 return true;
16307 else if (aarch64_evpc_uzp (d))
16308 return true;
16309 else if (aarch64_evpc_trn (d))
16310 return true;
16311 if (d->vec_flags == VEC_SVE_DATA)
16312 return aarch64_evpc_sve_tbl (d);
16313 else if (d->vec_flags == VEC_ADVSIMD)
16314 return aarch64_evpc_tbl (d);
16315 }
16316 return false;
16317 }
16318
16319 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16320
16321 static bool
16322 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16323 rtx op1, const vec_perm_indices &sel)
16324 {
16325 struct expand_vec_perm_d d;
16326
16327 /* Check whether the mask can be applied to a single vector. */
16328 if (sel.ninputs () == 1
16329 || (op0 && rtx_equal_p (op0, op1)))
16330 d.one_vector_p = true;
16331 else if (sel.all_from_input_p (0))
16332 {
16333 d.one_vector_p = true;
16334 op1 = op0;
16335 }
16336 else if (sel.all_from_input_p (1))
16337 {
16338 d.one_vector_p = true;
16339 op0 = op1;
16340 }
16341 else
16342 d.one_vector_p = false;
16343
16344 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16345 sel.nelts_per_input ());
16346 d.vmode = vmode;
16347 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16348 d.target = target;
16349 d.op0 = op0;
16350 d.op1 = op1;
16351 d.testing_p = !target;
16352
16353 if (!d.testing_p)
16354 return aarch64_expand_vec_perm_const_1 (&d);
16355
16356 rtx_insn *last = get_last_insn ();
16357 bool ret = aarch64_expand_vec_perm_const_1 (&d);
16358 gcc_assert (last == get_last_insn ());
16359
16360 return ret;
16361 }
16362
16363 /* Generate a byte permute mask for a register of mode MODE,
16364 which has NUNITS units. */
16365
16366 rtx
16367 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
16368 {
16369 /* We have to reverse each vector because we dont have
16370 a permuted load that can reverse-load according to ABI rules. */
16371 rtx mask;
16372 rtvec v = rtvec_alloc (16);
16373 unsigned int i, j;
16374 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
16375
16376 gcc_assert (BYTES_BIG_ENDIAN);
16377 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
16378
16379 for (i = 0; i < nunits; i++)
16380 for (j = 0; j < usize; j++)
16381 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
16382 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
16383 return force_reg (V16QImode, mask);
16384 }
16385
16386 /* Return true if X is a valid second operand for the SVE instruction
16387 that implements integer comparison OP_CODE. */
16388
16389 static bool
16390 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
16391 {
16392 if (register_operand (x, VOIDmode))
16393 return true;
16394
16395 switch (op_code)
16396 {
16397 case LTU:
16398 case LEU:
16399 case GEU:
16400 case GTU:
16401 return aarch64_sve_cmp_immediate_p (x, false);
16402 case LT:
16403 case LE:
16404 case GE:
16405 case GT:
16406 case NE:
16407 case EQ:
16408 return aarch64_sve_cmp_immediate_p (x, true);
16409 default:
16410 gcc_unreachable ();
16411 }
16412 }
16413
16414 /* Use predicated SVE instructions to implement the equivalent of:
16415
16416 (set TARGET OP)
16417
16418 given that PTRUE is an all-true predicate of the appropriate mode. */
16419
16420 static void
16421 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
16422 {
16423 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16424 gen_rtvec (2, ptrue, op),
16425 UNSPEC_MERGE_PTRUE);
16426 rtx_insn *insn = emit_set_insn (target, unspec);
16427 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16428 }
16429
16430 /* Likewise, but also clobber the condition codes. */
16431
16432 static void
16433 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
16434 {
16435 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16436 gen_rtvec (2, ptrue, op),
16437 UNSPEC_MERGE_PTRUE);
16438 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
16439 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16440 }
16441
16442 /* Return the UNSPEC_COND_* code for comparison CODE. */
16443
16444 static unsigned int
16445 aarch64_unspec_cond_code (rtx_code code)
16446 {
16447 switch (code)
16448 {
16449 case NE:
16450 return UNSPEC_COND_NE;
16451 case EQ:
16452 return UNSPEC_COND_EQ;
16453 case LT:
16454 return UNSPEC_COND_LT;
16455 case GT:
16456 return UNSPEC_COND_GT;
16457 case LE:
16458 return UNSPEC_COND_LE;
16459 case GE:
16460 return UNSPEC_COND_GE;
16461 default:
16462 gcc_unreachable ();
16463 }
16464 }
16465
16466 /* Emit:
16467
16468 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
16469
16470 where <X> is the operation associated with comparison CODE. This form
16471 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
16472 semantics, such as when PRED might not be all-true and when comparing
16473 inactive lanes could have side effects. */
16474
16475 static void
16476 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
16477 rtx pred, rtx op0, rtx op1)
16478 {
16479 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
16480 gen_rtvec (3, pred, op0, op1),
16481 aarch64_unspec_cond_code (code));
16482 emit_set_insn (target, unspec);
16483 }
16484
16485 /* Expand an SVE integer comparison using the SVE equivalent of:
16486
16487 (set TARGET (CODE OP0 OP1)). */
16488
16489 void
16490 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
16491 {
16492 machine_mode pred_mode = GET_MODE (target);
16493 machine_mode data_mode = GET_MODE (op0);
16494
16495 if (!aarch64_sve_cmp_operand_p (code, op1))
16496 op1 = force_reg (data_mode, op1);
16497
16498 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16499 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16500 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
16501 }
16502
16503 /* Emit the SVE equivalent of:
16504
16505 (set TMP1 (CODE1 OP0 OP1))
16506 (set TMP2 (CODE2 OP0 OP1))
16507 (set TARGET (ior:PRED_MODE TMP1 TMP2))
16508
16509 PTRUE is an all-true predicate with the same mode as TARGET. */
16510
16511 static void
16512 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
16513 rtx ptrue, rtx op0, rtx op1)
16514 {
16515 machine_mode pred_mode = GET_MODE (ptrue);
16516 rtx tmp1 = gen_reg_rtx (pred_mode);
16517 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
16518 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
16519 rtx tmp2 = gen_reg_rtx (pred_mode);
16520 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
16521 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
16522 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
16523 }
16524
16525 /* Emit the SVE equivalent of:
16526
16527 (set TMP (CODE OP0 OP1))
16528 (set TARGET (not TMP))
16529
16530 PTRUE is an all-true predicate with the same mode as TARGET. */
16531
16532 static void
16533 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
16534 rtx op0, rtx op1)
16535 {
16536 machine_mode pred_mode = GET_MODE (ptrue);
16537 rtx tmp = gen_reg_rtx (pred_mode);
16538 aarch64_emit_sve_ptrue_op (tmp, ptrue,
16539 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
16540 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16541 }
16542
16543 /* Expand an SVE floating-point comparison using the SVE equivalent of:
16544
16545 (set TARGET (CODE OP0 OP1))
16546
16547 If CAN_INVERT_P is true, the caller can also handle inverted results;
16548 return true if the result is in fact inverted. */
16549
16550 bool
16551 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
16552 rtx op0, rtx op1, bool can_invert_p)
16553 {
16554 machine_mode pred_mode = GET_MODE (target);
16555 machine_mode data_mode = GET_MODE (op0);
16556
16557 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16558 switch (code)
16559 {
16560 case UNORDERED:
16561 /* UNORDERED has no immediate form. */
16562 op1 = force_reg (data_mode, op1);
16563 /* fall through */
16564 case LT:
16565 case LE:
16566 case GT:
16567 case GE:
16568 case EQ:
16569 case NE:
16570 {
16571 /* There is native support for the comparison. */
16572 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16573 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16574 return false;
16575 }
16576
16577 case LTGT:
16578 /* This is a trapping operation (LT or GT). */
16579 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
16580 return false;
16581
16582 case UNEQ:
16583 if (!flag_trapping_math)
16584 {
16585 /* This would trap for signaling NaNs. */
16586 op1 = force_reg (data_mode, op1);
16587 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
16588 return false;
16589 }
16590 /* fall through */
16591 case UNLT:
16592 case UNLE:
16593 case UNGT:
16594 case UNGE:
16595 if (flag_trapping_math)
16596 {
16597 /* Work out which elements are ordered. */
16598 rtx ordered = gen_reg_rtx (pred_mode);
16599 op1 = force_reg (data_mode, op1);
16600 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16601
16602 /* Test the opposite condition for the ordered elements,
16603 then invert the result. */
16604 if (code == UNEQ)
16605 code = NE;
16606 else
16607 code = reverse_condition_maybe_unordered (code);
16608 if (can_invert_p)
16609 {
16610 aarch64_emit_sve_predicated_cond (target, code,
16611 ordered, op0, op1);
16612 return true;
16613 }
16614 rtx tmp = gen_reg_rtx (pred_mode);
16615 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16616 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16617 return false;
16618 }
16619 break;
16620
16621 case ORDERED:
16622 /* ORDERED has no immediate form. */
16623 op1 = force_reg (data_mode, op1);
16624 break;
16625
16626 default:
16627 gcc_unreachable ();
16628 }
16629
16630 /* There is native support for the inverse comparison. */
16631 code = reverse_condition_maybe_unordered (code);
16632 if (can_invert_p)
16633 {
16634 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16635 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16636 return true;
16637 }
16638 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16639 return false;
16640 }
16641
16642 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16643 of the data being selected and CMP_MODE is the mode of the values being
16644 compared. */
16645
16646 void
16647 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16648 rtx *ops)
16649 {
16650 machine_mode pred_mode
16651 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16652 GET_MODE_SIZE (cmp_mode)).require ();
16653 rtx pred = gen_reg_rtx (pred_mode);
16654 if (FLOAT_MODE_P (cmp_mode))
16655 {
16656 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16657 ops[4], ops[5], true))
16658 std::swap (ops[1], ops[2]);
16659 }
16660 else
16661 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16662
16663 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16664 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16665 }
16666
16667 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16668 true. However due to issues with register allocation it is preferable
16669 to avoid tieing integer scalar and FP scalar modes. Executing integer
16670 operations in general registers is better than treating them as scalar
16671 vector operations. This reduces latency and avoids redundant int<->FP
16672 moves. So tie modes if they are either the same class, or vector modes
16673 with other vector modes, vector structs or any scalar mode. */
16674
16675 static bool
16676 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16677 {
16678 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16679 return true;
16680
16681 /* We specifically want to allow elements of "structure" modes to
16682 be tieable to the structure. This more general condition allows
16683 other rarer situations too. The reason we don't extend this to
16684 predicate modes is that there are no predicate structure modes
16685 nor any specific instructions for extracting part of a predicate
16686 register. */
16687 if (aarch64_vector_data_mode_p (mode1)
16688 && aarch64_vector_data_mode_p (mode2))
16689 return true;
16690
16691 /* Also allow any scalar modes with vectors. */
16692 if (aarch64_vector_mode_supported_p (mode1)
16693 || aarch64_vector_mode_supported_p (mode2))
16694 return true;
16695
16696 return false;
16697 }
16698
16699 /* Return a new RTX holding the result of moving POINTER forward by
16700 AMOUNT bytes. */
16701
16702 static rtx
16703 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16704 {
16705 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16706
16707 return adjust_automodify_address (pointer, GET_MODE (pointer),
16708 next, amount);
16709 }
16710
16711 /* Return a new RTX holding the result of moving POINTER forward by the
16712 size of the mode it points to. */
16713
16714 static rtx
16715 aarch64_progress_pointer (rtx pointer)
16716 {
16717 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16718 }
16719
16720 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16721 MODE bytes. */
16722
16723 static void
16724 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16725 machine_mode mode)
16726 {
16727 rtx reg = gen_reg_rtx (mode);
16728
16729 /* "Cast" the pointers to the correct mode. */
16730 *src = adjust_address (*src, mode, 0);
16731 *dst = adjust_address (*dst, mode, 0);
16732 /* Emit the memcpy. */
16733 emit_move_insn (reg, *src);
16734 emit_move_insn (*dst, reg);
16735 /* Move the pointers forward. */
16736 *src = aarch64_progress_pointer (*src);
16737 *dst = aarch64_progress_pointer (*dst);
16738 }
16739
16740 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16741 we succeed, otherwise return false. */
16742
16743 bool
16744 aarch64_expand_movmem (rtx *operands)
16745 {
16746 int n, mode_bits;
16747 rtx dst = operands[0];
16748 rtx src = operands[1];
16749 rtx base;
16750 machine_mode cur_mode = BLKmode, next_mode;
16751 bool speed_p = !optimize_function_for_size_p (cfun);
16752
16753 /* When optimizing for size, give a better estimate of the length of a
16754 memcpy call, but use the default otherwise. Moves larger than 8 bytes
16755 will always require an even number of instructions to do now. And each
16756 operation requires both a load+store, so devide the max number by 2. */
16757 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16758
16759 /* We can't do anything smart if the amount to copy is not constant. */
16760 if (!CONST_INT_P (operands[2]))
16761 return false;
16762
16763 n = INTVAL (operands[2]);
16764
16765 /* Try to keep the number of instructions low. For all cases we will do at
16766 most two moves for the residual amount, since we'll always overlap the
16767 remainder. */
16768 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16769 return false;
16770
16771 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16772 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16773
16774 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16775 src = adjust_automodify_address (src, VOIDmode, base, 0);
16776
16777 /* Convert n to bits to make the rest of the code simpler. */
16778 n = n * BITS_PER_UNIT;
16779
16780 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
16781 larger than TImode, but we should not use them for loads/stores here. */
16782 const int copy_limit = GET_MODE_BITSIZE (TImode);
16783
16784 while (n > 0)
16785 {
16786 /* Find the largest mode in which to do the copy in without over reading
16787 or writing. */
16788 opt_scalar_int_mode mode_iter;
16789 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16790 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
16791 cur_mode = mode_iter.require ();
16792
16793 gcc_assert (cur_mode != BLKmode);
16794
16795 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16796 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16797
16798 n -= mode_bits;
16799
16800 /* Do certain trailing copies as overlapping if it's going to be
16801 cheaper. i.e. less instructions to do so. For instance doing a 15
16802 byte copy it's more efficient to do two overlapping 8 byte copies than
16803 8 + 6 + 1. */
16804 if (n > 0 && n <= 8 * BITS_PER_UNIT)
16805 {
16806 next_mode = smallest_mode_for_size (n, MODE_INT);
16807 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16808 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16809 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16810 n = n_bits;
16811 }
16812 }
16813
16814 return true;
16815 }
16816
16817 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16818 SImode stores. Handle the case when the constant has identical
16819 bottom and top halves. This is beneficial when the two stores can be
16820 merged into an STP and we avoid synthesising potentially expensive
16821 immediates twice. Return true if such a split is possible. */
16822
16823 bool
16824 aarch64_split_dimode_const_store (rtx dst, rtx src)
16825 {
16826 rtx lo = gen_lowpart (SImode, src);
16827 rtx hi = gen_highpart_mode (SImode, DImode, src);
16828
16829 bool size_p = optimize_function_for_size_p (cfun);
16830
16831 if (!rtx_equal_p (lo, hi))
16832 return false;
16833
16834 unsigned int orig_cost
16835 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16836 unsigned int lo_cost
16837 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16838
16839 /* We want to transform:
16840 MOV x1, 49370
16841 MOVK x1, 0x140, lsl 16
16842 MOVK x1, 0xc0da, lsl 32
16843 MOVK x1, 0x140, lsl 48
16844 STR x1, [x0]
16845 into:
16846 MOV w1, 49370
16847 MOVK w1, 0x140, lsl 16
16848 STP w1, w1, [x0]
16849 So we want to perform this only when we save two instructions
16850 or more. When optimizing for size, however, accept any code size
16851 savings we can. */
16852 if (size_p && orig_cost <= lo_cost)
16853 return false;
16854
16855 if (!size_p
16856 && (orig_cost <= lo_cost + 1))
16857 return false;
16858
16859 rtx mem_lo = adjust_address (dst, SImode, 0);
16860 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16861 return false;
16862
16863 rtx tmp_reg = gen_reg_rtx (SImode);
16864 aarch64_expand_mov_immediate (tmp_reg, lo);
16865 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16866 /* Don't emit an explicit store pair as this may not be always profitable.
16867 Let the sched-fusion logic decide whether to merge them. */
16868 emit_move_insn (mem_lo, tmp_reg);
16869 emit_move_insn (mem_hi, tmp_reg);
16870
16871 return true;
16872 }
16873
16874 /* Generate RTL for a conditional branch with rtx comparison CODE in
16875 mode CC_MODE. The destination of the unlikely conditional branch
16876 is LABEL_REF. */
16877
16878 void
16879 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
16880 rtx label_ref)
16881 {
16882 rtx x;
16883 x = gen_rtx_fmt_ee (code, VOIDmode,
16884 gen_rtx_REG (cc_mode, CC_REGNUM),
16885 const0_rtx);
16886
16887 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16888 gen_rtx_LABEL_REF (VOIDmode, label_ref),
16889 pc_rtx);
16890 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16891 }
16892
16893 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16894
16895 OP1 represents the TImode destination operand 1
16896 OP2 represents the TImode destination operand 2
16897 LOW_DEST represents the low half (DImode) of TImode operand 0
16898 LOW_IN1 represents the low half (DImode) of TImode operand 1
16899 LOW_IN2 represents the low half (DImode) of TImode operand 2
16900 HIGH_DEST represents the high half (DImode) of TImode operand 0
16901 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16902 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16903
16904 void
16905 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16906 rtx *low_in1, rtx *low_in2,
16907 rtx *high_dest, rtx *high_in1,
16908 rtx *high_in2)
16909 {
16910 *low_dest = gen_reg_rtx (DImode);
16911 *low_in1 = gen_lowpart (DImode, op1);
16912 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16913 subreg_lowpart_offset (DImode, TImode));
16914 *high_dest = gen_reg_rtx (DImode);
16915 *high_in1 = gen_highpart (DImode, op1);
16916 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16917 subreg_highpart_offset (DImode, TImode));
16918 }
16919
16920 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
16921
16922 This function differs from 'arch64_addti_scratch_regs' in that
16923 OP1 can be an immediate constant (zero). We must call
16924 subreg_highpart_offset with DImode and TImode arguments, otherwise
16925 VOIDmode will be used for the const_int which generates an internal
16926 error from subreg_size_highpart_offset which does not expect a size of zero.
16927
16928 OP1 represents the TImode destination operand 1
16929 OP2 represents the TImode destination operand 2
16930 LOW_DEST represents the low half (DImode) of TImode operand 0
16931 LOW_IN1 represents the low half (DImode) of TImode operand 1
16932 LOW_IN2 represents the low half (DImode) of TImode operand 2
16933 HIGH_DEST represents the high half (DImode) of TImode operand 0
16934 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16935 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16936
16937
16938 void
16939 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16940 rtx *low_in1, rtx *low_in2,
16941 rtx *high_dest, rtx *high_in1,
16942 rtx *high_in2)
16943 {
16944 *low_dest = gen_reg_rtx (DImode);
16945 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
16946 subreg_lowpart_offset (DImode, TImode));
16947
16948 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16949 subreg_lowpart_offset (DImode, TImode));
16950 *high_dest = gen_reg_rtx (DImode);
16951
16952 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
16953 subreg_highpart_offset (DImode, TImode));
16954 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16955 subreg_highpart_offset (DImode, TImode));
16956 }
16957
16958 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16959
16960 OP0 represents the TImode destination operand 0
16961 LOW_DEST represents the low half (DImode) of TImode operand 0
16962 LOW_IN1 represents the low half (DImode) of TImode operand 1
16963 LOW_IN2 represents the low half (DImode) of TImode operand 2
16964 HIGH_DEST represents the high half (DImode) of TImode operand 0
16965 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16966 HIGH_IN2 represents the high half (DImode) of TImode operand 2
16967 UNSIGNED_P is true if the operation is being performed on unsigned
16968 values. */
16969 void
16970 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
16971 rtx low_in2, rtx high_dest, rtx high_in1,
16972 rtx high_in2, bool unsigned_p)
16973 {
16974 if (low_in2 == const0_rtx)
16975 {
16976 low_dest = low_in1;
16977 high_in2 = force_reg (DImode, high_in2);
16978 if (unsigned_p)
16979 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
16980 else
16981 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
16982 }
16983 else
16984 {
16985 if (CONST_INT_P (low_in2))
16986 {
16987 high_in2 = force_reg (DImode, high_in2);
16988 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
16989 GEN_INT (-INTVAL (low_in2))));
16990 }
16991 else
16992 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
16993
16994 if (unsigned_p)
16995 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
16996 else
16997 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
16998 }
16999
17000 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17001 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17002
17003 }
17004
17005 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17006
17007 static unsigned HOST_WIDE_INT
17008 aarch64_asan_shadow_offset (void)
17009 {
17010 return (HOST_WIDE_INT_1 << 36);
17011 }
17012
17013 static rtx
17014 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17015 int code, tree treeop0, tree treeop1)
17016 {
17017 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17018 rtx op0, op1;
17019 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17020 insn_code icode;
17021 struct expand_operand ops[4];
17022
17023 start_sequence ();
17024 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17025
17026 op_mode = GET_MODE (op0);
17027 if (op_mode == VOIDmode)
17028 op_mode = GET_MODE (op1);
17029
17030 switch (op_mode)
17031 {
17032 case E_QImode:
17033 case E_HImode:
17034 case E_SImode:
17035 cmp_mode = SImode;
17036 icode = CODE_FOR_cmpsi;
17037 break;
17038
17039 case E_DImode:
17040 cmp_mode = DImode;
17041 icode = CODE_FOR_cmpdi;
17042 break;
17043
17044 case E_SFmode:
17045 cmp_mode = SFmode;
17046 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17047 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17048 break;
17049
17050 case E_DFmode:
17051 cmp_mode = DFmode;
17052 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17053 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17054 break;
17055
17056 default:
17057 end_sequence ();
17058 return NULL_RTX;
17059 }
17060
17061 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17062 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17063 if (!op0 || !op1)
17064 {
17065 end_sequence ();
17066 return NULL_RTX;
17067 }
17068 *prep_seq = get_insns ();
17069 end_sequence ();
17070
17071 create_fixed_operand (&ops[0], op0);
17072 create_fixed_operand (&ops[1], op1);
17073
17074 start_sequence ();
17075 if (!maybe_expand_insn (icode, 2, ops))
17076 {
17077 end_sequence ();
17078 return NULL_RTX;
17079 }
17080 *gen_seq = get_insns ();
17081 end_sequence ();
17082
17083 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17084 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17085 }
17086
17087 static rtx
17088 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17089 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17090 {
17091 rtx op0, op1, target;
17092 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17093 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17094 insn_code icode;
17095 struct expand_operand ops[6];
17096 int aarch64_cond;
17097
17098 push_to_sequence (*prep_seq);
17099 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17100
17101 op_mode = GET_MODE (op0);
17102 if (op_mode == VOIDmode)
17103 op_mode = GET_MODE (op1);
17104
17105 switch (op_mode)
17106 {
17107 case E_QImode:
17108 case E_HImode:
17109 case E_SImode:
17110 cmp_mode = SImode;
17111 icode = CODE_FOR_ccmpsi;
17112 break;
17113
17114 case E_DImode:
17115 cmp_mode = DImode;
17116 icode = CODE_FOR_ccmpdi;
17117 break;
17118
17119 case E_SFmode:
17120 cmp_mode = SFmode;
17121 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17122 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17123 break;
17124
17125 case E_DFmode:
17126 cmp_mode = DFmode;
17127 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17128 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17129 break;
17130
17131 default:
17132 end_sequence ();
17133 return NULL_RTX;
17134 }
17135
17136 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17137 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17138 if (!op0 || !op1)
17139 {
17140 end_sequence ();
17141 return NULL_RTX;
17142 }
17143 *prep_seq = get_insns ();
17144 end_sequence ();
17145
17146 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17147 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17148
17149 if (bit_code != AND)
17150 {
17151 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17152 GET_MODE (XEXP (prev, 0))),
17153 VOIDmode, XEXP (prev, 0), const0_rtx);
17154 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17155 }
17156
17157 create_fixed_operand (&ops[0], XEXP (prev, 0));
17158 create_fixed_operand (&ops[1], target);
17159 create_fixed_operand (&ops[2], op0);
17160 create_fixed_operand (&ops[3], op1);
17161 create_fixed_operand (&ops[4], prev);
17162 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17163
17164 push_to_sequence (*gen_seq);
17165 if (!maybe_expand_insn (icode, 6, ops))
17166 {
17167 end_sequence ();
17168 return NULL_RTX;
17169 }
17170
17171 *gen_seq = get_insns ();
17172 end_sequence ();
17173
17174 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17175 }
17176
17177 #undef TARGET_GEN_CCMP_FIRST
17178 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17179
17180 #undef TARGET_GEN_CCMP_NEXT
17181 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17182
17183 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17184 instruction fusion of some sort. */
17185
17186 static bool
17187 aarch64_macro_fusion_p (void)
17188 {
17189 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17190 }
17191
17192
17193 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17194 should be kept together during scheduling. */
17195
17196 static bool
17197 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17198 {
17199 rtx set_dest;
17200 rtx prev_set = single_set (prev);
17201 rtx curr_set = single_set (curr);
17202 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
17203 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17204
17205 if (!aarch64_macro_fusion_p ())
17206 return false;
17207
17208 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17209 {
17210 /* We are trying to match:
17211 prev (mov) == (set (reg r0) (const_int imm16))
17212 curr (movk) == (set (zero_extract (reg r0)
17213 (const_int 16)
17214 (const_int 16))
17215 (const_int imm16_1)) */
17216
17217 set_dest = SET_DEST (curr_set);
17218
17219 if (GET_CODE (set_dest) == ZERO_EXTRACT
17220 && CONST_INT_P (SET_SRC (curr_set))
17221 && CONST_INT_P (SET_SRC (prev_set))
17222 && CONST_INT_P (XEXP (set_dest, 2))
17223 && INTVAL (XEXP (set_dest, 2)) == 16
17224 && REG_P (XEXP (set_dest, 0))
17225 && REG_P (SET_DEST (prev_set))
17226 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17227 {
17228 return true;
17229 }
17230 }
17231
17232 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17233 {
17234
17235 /* We're trying to match:
17236 prev (adrp) == (set (reg r1)
17237 (high (symbol_ref ("SYM"))))
17238 curr (add) == (set (reg r0)
17239 (lo_sum (reg r1)
17240 (symbol_ref ("SYM"))))
17241 Note that r0 need not necessarily be the same as r1, especially
17242 during pre-regalloc scheduling. */
17243
17244 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17245 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17246 {
17247 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17248 && REG_P (XEXP (SET_SRC (curr_set), 0))
17249 && REGNO (XEXP (SET_SRC (curr_set), 0))
17250 == REGNO (SET_DEST (prev_set))
17251 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17252 XEXP (SET_SRC (curr_set), 1)))
17253 return true;
17254 }
17255 }
17256
17257 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17258 {
17259
17260 /* We're trying to match:
17261 prev (movk) == (set (zero_extract (reg r0)
17262 (const_int 16)
17263 (const_int 32))
17264 (const_int imm16_1))
17265 curr (movk) == (set (zero_extract (reg r0)
17266 (const_int 16)
17267 (const_int 48))
17268 (const_int imm16_2)) */
17269
17270 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17271 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17272 && REG_P (XEXP (SET_DEST (prev_set), 0))
17273 && REG_P (XEXP (SET_DEST (curr_set), 0))
17274 && REGNO (XEXP (SET_DEST (prev_set), 0))
17275 == REGNO (XEXP (SET_DEST (curr_set), 0))
17276 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17277 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17278 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17279 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17280 && CONST_INT_P (SET_SRC (prev_set))
17281 && CONST_INT_P (SET_SRC (curr_set)))
17282 return true;
17283
17284 }
17285 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17286 {
17287 /* We're trying to match:
17288 prev (adrp) == (set (reg r0)
17289 (high (symbol_ref ("SYM"))))
17290 curr (ldr) == (set (reg r1)
17291 (mem (lo_sum (reg r0)
17292 (symbol_ref ("SYM")))))
17293 or
17294 curr (ldr) == (set (reg r1)
17295 (zero_extend (mem
17296 (lo_sum (reg r0)
17297 (symbol_ref ("SYM")))))) */
17298 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17299 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17300 {
17301 rtx curr_src = SET_SRC (curr_set);
17302
17303 if (GET_CODE (curr_src) == ZERO_EXTEND)
17304 curr_src = XEXP (curr_src, 0);
17305
17306 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17307 && REG_P (XEXP (XEXP (curr_src, 0), 0))
17308 && REGNO (XEXP (XEXP (curr_src, 0), 0))
17309 == REGNO (SET_DEST (prev_set))
17310 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17311 XEXP (SET_SRC (prev_set), 0)))
17312 return true;
17313 }
17314 }
17315
17316 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
17317 && aarch_crypto_can_dual_issue (prev, curr))
17318 return true;
17319
17320 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17321 && any_condjump_p (curr))
17322 {
17323 unsigned int condreg1, condreg2;
17324 rtx cc_reg_1;
17325 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17326 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17327
17328 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17329 && prev
17330 && modified_in_p (cc_reg_1, prev))
17331 {
17332 enum attr_type prev_type = get_attr_type (prev);
17333
17334 /* FIXME: this misses some which is considered simple arthematic
17335 instructions for ThunderX. Simple shifts are missed here. */
17336 if (prev_type == TYPE_ALUS_SREG
17337 || prev_type == TYPE_ALUS_IMM
17338 || prev_type == TYPE_LOGICS_REG
17339 || prev_type == TYPE_LOGICS_IMM)
17340 return true;
17341 }
17342 }
17343
17344 if (prev_set
17345 && curr_set
17346 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17347 && any_condjump_p (curr))
17348 {
17349 /* We're trying to match:
17350 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17351 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
17352 (const_int 0))
17353 (label_ref ("SYM"))
17354 (pc)) */
17355 if (SET_DEST (curr_set) == (pc_rtx)
17356 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
17357 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
17358 && REG_P (SET_DEST (prev_set))
17359 && REGNO (SET_DEST (prev_set))
17360 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
17361 {
17362 /* Fuse ALU operations followed by conditional branch instruction. */
17363 switch (get_attr_type (prev))
17364 {
17365 case TYPE_ALU_IMM:
17366 case TYPE_ALU_SREG:
17367 case TYPE_ADC_REG:
17368 case TYPE_ADC_IMM:
17369 case TYPE_ADCS_REG:
17370 case TYPE_ADCS_IMM:
17371 case TYPE_LOGIC_REG:
17372 case TYPE_LOGIC_IMM:
17373 case TYPE_CSEL:
17374 case TYPE_ADR:
17375 case TYPE_MOV_IMM:
17376 case TYPE_SHIFT_REG:
17377 case TYPE_SHIFT_IMM:
17378 case TYPE_BFM:
17379 case TYPE_RBIT:
17380 case TYPE_REV:
17381 case TYPE_EXTEND:
17382 return true;
17383
17384 default:;
17385 }
17386 }
17387 }
17388
17389 return false;
17390 }
17391
17392 /* Return true iff the instruction fusion described by OP is enabled. */
17393
17394 bool
17395 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
17396 {
17397 return (aarch64_tune_params.fusible_ops & op) != 0;
17398 }
17399
17400 /* If MEM is in the form of [base+offset], extract the two parts
17401 of address and set to BASE and OFFSET, otherwise return false
17402 after clearing BASE and OFFSET. */
17403
17404 bool
17405 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
17406 {
17407 rtx addr;
17408
17409 gcc_assert (MEM_P (mem));
17410
17411 addr = XEXP (mem, 0);
17412
17413 if (REG_P (addr))
17414 {
17415 *base = addr;
17416 *offset = const0_rtx;
17417 return true;
17418 }
17419
17420 if (GET_CODE (addr) == PLUS
17421 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
17422 {
17423 *base = XEXP (addr, 0);
17424 *offset = XEXP (addr, 1);
17425 return true;
17426 }
17427
17428 *base = NULL_RTX;
17429 *offset = NULL_RTX;
17430
17431 return false;
17432 }
17433
17434 /* Types for scheduling fusion. */
17435 enum sched_fusion_type
17436 {
17437 SCHED_FUSION_NONE = 0,
17438 SCHED_FUSION_LD_SIGN_EXTEND,
17439 SCHED_FUSION_LD_ZERO_EXTEND,
17440 SCHED_FUSION_LD,
17441 SCHED_FUSION_ST,
17442 SCHED_FUSION_NUM
17443 };
17444
17445 /* If INSN is a load or store of address in the form of [base+offset],
17446 extract the two parts and set to BASE and OFFSET. Return scheduling
17447 fusion type this INSN is. */
17448
17449 static enum sched_fusion_type
17450 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
17451 {
17452 rtx x, dest, src;
17453 enum sched_fusion_type fusion = SCHED_FUSION_LD;
17454
17455 gcc_assert (INSN_P (insn));
17456 x = PATTERN (insn);
17457 if (GET_CODE (x) != SET)
17458 return SCHED_FUSION_NONE;
17459
17460 src = SET_SRC (x);
17461 dest = SET_DEST (x);
17462
17463 machine_mode dest_mode = GET_MODE (dest);
17464
17465 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
17466 return SCHED_FUSION_NONE;
17467
17468 if (GET_CODE (src) == SIGN_EXTEND)
17469 {
17470 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
17471 src = XEXP (src, 0);
17472 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17473 return SCHED_FUSION_NONE;
17474 }
17475 else if (GET_CODE (src) == ZERO_EXTEND)
17476 {
17477 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
17478 src = XEXP (src, 0);
17479 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17480 return SCHED_FUSION_NONE;
17481 }
17482
17483 if (GET_CODE (src) == MEM && REG_P (dest))
17484 extract_base_offset_in_addr (src, base, offset);
17485 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
17486 {
17487 fusion = SCHED_FUSION_ST;
17488 extract_base_offset_in_addr (dest, base, offset);
17489 }
17490 else
17491 return SCHED_FUSION_NONE;
17492
17493 if (*base == NULL_RTX || *offset == NULL_RTX)
17494 fusion = SCHED_FUSION_NONE;
17495
17496 return fusion;
17497 }
17498
17499 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
17500
17501 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
17502 and PRI are only calculated for these instructions. For other instruction,
17503 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
17504 type instruction fusion can be added by returning different priorities.
17505
17506 It's important that irrelevant instructions get the largest FUSION_PRI. */
17507
17508 static void
17509 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
17510 int *fusion_pri, int *pri)
17511 {
17512 int tmp, off_val;
17513 rtx base, offset;
17514 enum sched_fusion_type fusion;
17515
17516 gcc_assert (INSN_P (insn));
17517
17518 tmp = max_pri - 1;
17519 fusion = fusion_load_store (insn, &base, &offset);
17520 if (fusion == SCHED_FUSION_NONE)
17521 {
17522 *pri = tmp;
17523 *fusion_pri = tmp;
17524 return;
17525 }
17526
17527 /* Set FUSION_PRI according to fusion type and base register. */
17528 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
17529
17530 /* Calculate PRI. */
17531 tmp /= 2;
17532
17533 /* INSN with smaller offset goes first. */
17534 off_val = (int)(INTVAL (offset));
17535 if (off_val >= 0)
17536 tmp -= (off_val & 0xfffff);
17537 else
17538 tmp += ((- off_val) & 0xfffff);
17539
17540 *pri = tmp;
17541 return;
17542 }
17543
17544 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
17545 Adjust priority of sha1h instructions so they are scheduled before
17546 other SHA1 instructions. */
17547
17548 static int
17549 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
17550 {
17551 rtx x = PATTERN (insn);
17552
17553 if (GET_CODE (x) == SET)
17554 {
17555 x = SET_SRC (x);
17556
17557 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
17558 return priority + 10;
17559 }
17560
17561 return priority;
17562 }
17563
17564 /* Given OPERANDS of consecutive load/store, check if we can merge
17565 them into ldp/stp. LOAD is true if they are load instructions.
17566 MODE is the mode of memory operands. */
17567
17568 bool
17569 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
17570 machine_mode mode)
17571 {
17572 HOST_WIDE_INT offval_1, offval_2, msize;
17573 enum reg_class rclass_1, rclass_2;
17574 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
17575
17576 if (load)
17577 {
17578 mem_1 = operands[1];
17579 mem_2 = operands[3];
17580 reg_1 = operands[0];
17581 reg_2 = operands[2];
17582 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
17583 if (REGNO (reg_1) == REGNO (reg_2))
17584 return false;
17585 }
17586 else
17587 {
17588 mem_1 = operands[0];
17589 mem_2 = operands[2];
17590 reg_1 = operands[1];
17591 reg_2 = operands[3];
17592 }
17593
17594 /* The mems cannot be volatile. */
17595 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
17596 return false;
17597
17598 /* If we have SImode and slow unaligned ldp,
17599 check the alignment to be at least 8 byte. */
17600 if (mode == SImode
17601 && (aarch64_tune_params.extra_tuning_flags
17602 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17603 && !optimize_size
17604 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17605 return false;
17606
17607 /* Check if the addresses are in the form of [base+offset]. */
17608 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17609 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17610 return false;
17611 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17612 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17613 return false;
17614
17615 /* Check if the bases are same. */
17616 if (!rtx_equal_p (base_1, base_2))
17617 return false;
17618
17619 /* The operands must be of the same size. */
17620 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
17621 GET_MODE_SIZE (GET_MODE (mem_2))));
17622
17623 offval_1 = INTVAL (offset_1);
17624 offval_2 = INTVAL (offset_2);
17625 /* We should only be trying this for fixed-sized modes. There is no
17626 SVE LDP/STP instruction. */
17627 msize = GET_MODE_SIZE (mode).to_constant ();
17628 /* Check if the offsets are consecutive. */
17629 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
17630 return false;
17631
17632 /* Check if the addresses are clobbered by load. */
17633 if (load)
17634 {
17635 if (reg_mentioned_p (reg_1, mem_1))
17636 return false;
17637
17638 /* In increasing order, the last load can clobber the address. */
17639 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
17640 return false;
17641 }
17642
17643 /* One of the memory accesses must be a mempair operand.
17644 If it is not the first one, they need to be swapped by the
17645 peephole. */
17646 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
17647 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
17648 return false;
17649
17650 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17651 rclass_1 = FP_REGS;
17652 else
17653 rclass_1 = GENERAL_REGS;
17654
17655 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17656 rclass_2 = FP_REGS;
17657 else
17658 rclass_2 = GENERAL_REGS;
17659
17660 /* Check if the registers are of same class. */
17661 if (rclass_1 != rclass_2)
17662 return false;
17663
17664 return true;
17665 }
17666
17667 /* Given OPERANDS of consecutive load/store that can be merged,
17668 swap them if they are not in ascending order. */
17669 void
17670 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
17671 {
17672 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
17673 HOST_WIDE_INT offval_1, offval_2;
17674
17675 if (load)
17676 {
17677 mem_1 = operands[1];
17678 mem_2 = operands[3];
17679 }
17680 else
17681 {
17682 mem_1 = operands[0];
17683 mem_2 = operands[2];
17684 }
17685
17686 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17687 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17688
17689 offval_1 = INTVAL (offset_1);
17690 offval_2 = INTVAL (offset_2);
17691
17692 if (offval_1 > offval_2)
17693 {
17694 /* Irrespective of whether this is a load or a store,
17695 we do the same swap. */
17696 std::swap (operands[0], operands[2]);
17697 std::swap (operands[1], operands[3]);
17698 }
17699 }
17700
17701 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17702 comparison between the two. */
17703 int
17704 aarch64_host_wide_int_compare (const void *x, const void *y)
17705 {
17706 return wi::cmps (* ((const HOST_WIDE_INT *) x),
17707 * ((const HOST_WIDE_INT *) y));
17708 }
17709
17710 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
17711 other pointing to a REG rtx containing an offset, compare the offsets
17712 of the two pairs.
17713
17714 Return:
17715
17716 1 iff offset (X) > offset (Y)
17717 0 iff offset (X) == offset (Y)
17718 -1 iff offset (X) < offset (Y) */
17719 int
17720 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17721 {
17722 const rtx * operands_1 = (const rtx *) x;
17723 const rtx * operands_2 = (const rtx *) y;
17724 rtx mem_1, mem_2, base, offset_1, offset_2;
17725
17726 if (MEM_P (operands_1[0]))
17727 mem_1 = operands_1[0];
17728 else
17729 mem_1 = operands_1[1];
17730
17731 if (MEM_P (operands_2[0]))
17732 mem_2 = operands_2[0];
17733 else
17734 mem_2 = operands_2[1];
17735
17736 /* Extract the offsets. */
17737 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17738 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17739
17740 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17741
17742 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17743 }
17744
17745 /* Given OPERANDS of consecutive load/store, check if we can merge
17746 them into ldp/stp by adjusting the offset. LOAD is true if they
17747 are load instructions. MODE is the mode of memory operands.
17748
17749 Given below consecutive stores:
17750
17751 str w1, [xb, 0x100]
17752 str w1, [xb, 0x104]
17753 str w1, [xb, 0x108]
17754 str w1, [xb, 0x10c]
17755
17756 Though the offsets are out of the range supported by stp, we can
17757 still pair them after adjusting the offset, like:
17758
17759 add scratch, xb, 0x100
17760 stp w1, w1, [scratch]
17761 stp w1, w1, [scratch, 0x8]
17762
17763 The peephole patterns detecting this opportunity should guarantee
17764 the scratch register is avaliable. */
17765
17766 bool
17767 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17768 scalar_mode mode)
17769 {
17770 const int num_insns = 4;
17771 enum reg_class rclass;
17772 HOST_WIDE_INT offvals[num_insns], msize;
17773 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
17774
17775 if (load)
17776 {
17777 for (int i = 0; i < num_insns; i++)
17778 {
17779 reg[i] = operands[2 * i];
17780 mem[i] = operands[2 * i + 1];
17781
17782 gcc_assert (REG_P (reg[i]));
17783 }
17784
17785 /* Do not attempt to merge the loads if the loads clobber each other. */
17786 for (int i = 0; i < 8; i += 2)
17787 for (int j = i + 2; j < 8; j += 2)
17788 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17789 return false;
17790 }
17791 else
17792 for (int i = 0; i < num_insns; i++)
17793 {
17794 mem[i] = operands[2 * i];
17795 reg[i] = operands[2 * i + 1];
17796 }
17797
17798 /* Skip if memory operand is by itself valid for ldp/stp. */
17799 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
17800 return false;
17801
17802 for (int i = 0; i < num_insns; i++)
17803 {
17804 /* The mems cannot be volatile. */
17805 if (MEM_VOLATILE_P (mem[i]))
17806 return false;
17807
17808 /* Check if the addresses are in the form of [base+offset]. */
17809 extract_base_offset_in_addr (mem[i], base + i, offset + i);
17810 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
17811 return false;
17812 }
17813
17814 /* Check if the registers are of same class. */
17815 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
17816 ? FP_REGS : GENERAL_REGS;
17817
17818 for (int i = 1; i < num_insns; i++)
17819 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
17820 {
17821 if (rclass != FP_REGS)
17822 return false;
17823 }
17824 else
17825 {
17826 if (rclass != GENERAL_REGS)
17827 return false;
17828 }
17829
17830 /* Only the last register in the order in which they occur
17831 may be clobbered by the load. */
17832 if (rclass == GENERAL_REGS && load)
17833 for (int i = 0; i < num_insns - 1; i++)
17834 if (reg_mentioned_p (reg[i], mem[i]))
17835 return false;
17836
17837 /* Check if the bases are same. */
17838 for (int i = 0; i < num_insns - 1; i++)
17839 if (!rtx_equal_p (base[i], base[i + 1]))
17840 return false;
17841
17842 for (int i = 0; i < num_insns; i++)
17843 offvals[i] = INTVAL (offset[i]);
17844
17845 msize = GET_MODE_SIZE (mode);
17846
17847 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17848 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
17849 aarch64_host_wide_int_compare);
17850
17851 if (!(offvals[1] == offvals[0] + msize
17852 && offvals[3] == offvals[2] + msize))
17853 return false;
17854
17855 /* Check that offsets are within range of each other. The ldp/stp
17856 instructions have 7 bit immediate offsets, so use 0x80. */
17857 if (offvals[2] - offvals[0] >= msize * 0x80)
17858 return false;
17859
17860 /* The offsets must be aligned with respect to each other. */
17861 if (offvals[0] % msize != offvals[2] % msize)
17862 return false;
17863
17864 /* If we have SImode and slow unaligned ldp,
17865 check the alignment to be at least 8 byte. */
17866 if (mode == SImode
17867 && (aarch64_tune_params.extra_tuning_flags
17868 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17869 && !optimize_size
17870 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
17871 return false;
17872
17873 return true;
17874 }
17875
17876 /* Given OPERANDS of consecutive load/store, this function pairs them
17877 into LDP/STP after adjusting the offset. It depends on the fact
17878 that the operands can be sorted so the offsets are correct for STP.
17879 MODE is the mode of memory operands. CODE is the rtl operator
17880 which should be applied to all memory operands, it's SIGN_EXTEND,
17881 ZERO_EXTEND or UNKNOWN. */
17882
17883 bool
17884 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17885 scalar_mode mode, RTX_CODE code)
17886 {
17887 rtx base, offset_1, offset_3, t1, t2;
17888 rtx mem_1, mem_2, mem_3, mem_4;
17889 rtx temp_operands[8];
17890 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17891 stp_off_upper_limit, stp_off_lower_limit, msize;
17892
17893 /* We make changes on a copy as we may still bail out. */
17894 for (int i = 0; i < 8; i ++)
17895 temp_operands[i] = operands[i];
17896
17897 /* Sort the operands. */
17898 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17899
17900 if (load)
17901 {
17902 mem_1 = temp_operands[1];
17903 mem_2 = temp_operands[3];
17904 mem_3 = temp_operands[5];
17905 mem_4 = temp_operands[7];
17906 }
17907 else
17908 {
17909 mem_1 = temp_operands[0];
17910 mem_2 = temp_operands[2];
17911 mem_3 = temp_operands[4];
17912 mem_4 = temp_operands[6];
17913 gcc_assert (code == UNKNOWN);
17914 }
17915
17916 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17917 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17918 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17919 && offset_3 != NULL_RTX);
17920
17921 /* Adjust offset so it can fit in LDP/STP instruction. */
17922 msize = GET_MODE_SIZE (mode);
17923 stp_off_upper_limit = msize * (0x40 - 1);
17924 stp_off_lower_limit = - msize * 0x40;
17925
17926 off_val_1 = INTVAL (offset_1);
17927 off_val_3 = INTVAL (offset_3);
17928
17929 /* The base offset is optimally half way between the two STP/LDP offsets. */
17930 if (msize <= 4)
17931 base_off = (off_val_1 + off_val_3) / 2;
17932 else
17933 /* However, due to issues with negative LDP/STP offset generation for
17934 larger modes, for DF, DI and vector modes. we must not use negative
17935 addresses smaller than 9 signed unadjusted bits can store. This
17936 provides the most range in this case. */
17937 base_off = off_val_1;
17938
17939 /* Adjust the base so that it is aligned with the addresses but still
17940 optimal. */
17941 if (base_off % msize != off_val_1 % msize)
17942 /* Fix the offset, bearing in mind we want to make it bigger not
17943 smaller. */
17944 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17945 else if (msize <= 4)
17946 /* The negative range of LDP/STP is one larger than the positive range. */
17947 base_off += msize;
17948
17949 /* Check if base offset is too big or too small. We can attempt to resolve
17950 this issue by setting it to the maximum value and seeing if the offsets
17951 still fit. */
17952 if (base_off >= 0x1000)
17953 {
17954 base_off = 0x1000 - 1;
17955 /* We must still make sure that the base offset is aligned with respect
17956 to the address. But it may may not be made any bigger. */
17957 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17958 }
17959
17960 /* Likewise for the case where the base is too small. */
17961 if (base_off <= -0x1000)
17962 {
17963 base_off = -0x1000 + 1;
17964 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17965 }
17966
17967 /* Offset of the first STP/LDP. */
17968 new_off_1 = off_val_1 - base_off;
17969
17970 /* Offset of the second STP/LDP. */
17971 new_off_3 = off_val_3 - base_off;
17972
17973 /* The offsets must be within the range of the LDP/STP instructions. */
17974 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17975 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17976 return false;
17977
17978 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17979 new_off_1), true);
17980 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17981 new_off_1 + msize), true);
17982 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17983 new_off_3), true);
17984 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17985 new_off_3 + msize), true);
17986
17987 if (!aarch64_mem_pair_operand (mem_1, mode)
17988 || !aarch64_mem_pair_operand (mem_3, mode))
17989 return false;
17990
17991 if (code == ZERO_EXTEND)
17992 {
17993 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17994 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17995 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17996 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17997 }
17998 else if (code == SIGN_EXTEND)
17999 {
18000 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18001 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18002 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18003 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18004 }
18005
18006 if (load)
18007 {
18008 operands[0] = temp_operands[0];
18009 operands[1] = mem_1;
18010 operands[2] = temp_operands[2];
18011 operands[3] = mem_2;
18012 operands[4] = temp_operands[4];
18013 operands[5] = mem_3;
18014 operands[6] = temp_operands[6];
18015 operands[7] = mem_4;
18016 }
18017 else
18018 {
18019 operands[0] = mem_1;
18020 operands[1] = temp_operands[1];
18021 operands[2] = mem_2;
18022 operands[3] = temp_operands[3];
18023 operands[4] = mem_3;
18024 operands[5] = temp_operands[5];
18025 operands[6] = mem_4;
18026 operands[7] = temp_operands[7];
18027 }
18028
18029 /* Emit adjusting instruction. */
18030 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18031 /* Emit ldp/stp instructions. */
18032 t1 = gen_rtx_SET (operands[0], operands[1]);
18033 t2 = gen_rtx_SET (operands[2], operands[3]);
18034 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18035 t1 = gen_rtx_SET (operands[4], operands[5]);
18036 t2 = gen_rtx_SET (operands[6], operands[7]);
18037 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18038 return true;
18039 }
18040
18041 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18042 it isn't worth branching around empty masked ops (including masked
18043 stores). */
18044
18045 static bool
18046 aarch64_empty_mask_is_expensive (unsigned)
18047 {
18048 return false;
18049 }
18050
18051 /* Return 1 if pseudo register should be created and used to hold
18052 GOT address for PIC code. */
18053
18054 bool
18055 aarch64_use_pseudo_pic_reg (void)
18056 {
18057 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18058 }
18059
18060 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18061
18062 static int
18063 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18064 {
18065 switch (XINT (x, 1))
18066 {
18067 case UNSPEC_GOTSMALLPIC:
18068 case UNSPEC_GOTSMALLPIC28K:
18069 case UNSPEC_GOTTINYPIC:
18070 return 0;
18071 default:
18072 break;
18073 }
18074
18075 return default_unspec_may_trap_p (x, flags);
18076 }
18077
18078
18079 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18080 return the log2 of that value. Otherwise return -1. */
18081
18082 int
18083 aarch64_fpconst_pow_of_2 (rtx x)
18084 {
18085 const REAL_VALUE_TYPE *r;
18086
18087 if (!CONST_DOUBLE_P (x))
18088 return -1;
18089
18090 r = CONST_DOUBLE_REAL_VALUE (x);
18091
18092 if (REAL_VALUE_NEGATIVE (*r)
18093 || REAL_VALUE_ISNAN (*r)
18094 || REAL_VALUE_ISINF (*r)
18095 || !real_isinteger (r, DFmode))
18096 return -1;
18097
18098 return exact_log2 (real_to_integer (r));
18099 }
18100
18101 /* If X is a vector of equal CONST_DOUBLE values and that value is
18102 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18103
18104 int
18105 aarch64_vec_fpconst_pow_of_2 (rtx x)
18106 {
18107 int nelts;
18108 if (GET_CODE (x) != CONST_VECTOR
18109 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18110 return -1;
18111
18112 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18113 return -1;
18114
18115 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18116 if (firstval <= 0)
18117 return -1;
18118
18119 for (int i = 1; i < nelts; i++)
18120 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18121 return -1;
18122
18123 return firstval;
18124 }
18125
18126 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18127 to float.
18128
18129 __fp16 always promotes through this hook.
18130 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18131 through the generic excess precision logic rather than here. */
18132
18133 static tree
18134 aarch64_promoted_type (const_tree t)
18135 {
18136 if (SCALAR_FLOAT_TYPE_P (t)
18137 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18138 return float_type_node;
18139
18140 return NULL_TREE;
18141 }
18142
18143 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18144
18145 static bool
18146 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18147 optimization_type opt_type)
18148 {
18149 switch (op)
18150 {
18151 case rsqrt_optab:
18152 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18153
18154 default:
18155 return true;
18156 }
18157 }
18158
18159 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18160
18161 static unsigned int
18162 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18163 int *offset)
18164 {
18165 /* Polynomial invariant 1 == (VG / 2) - 1. */
18166 gcc_assert (i == 1);
18167 *factor = 2;
18168 *offset = 1;
18169 return AARCH64_DWARF_VG;
18170 }
18171
18172 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
18173 if MODE is HFmode, and punt to the generic implementation otherwise. */
18174
18175 static bool
18176 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18177 {
18178 return (mode == HFmode
18179 ? true
18180 : default_libgcc_floating_mode_supported_p (mode));
18181 }
18182
18183 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18184 if MODE is HFmode, and punt to the generic implementation otherwise. */
18185
18186 static bool
18187 aarch64_scalar_mode_supported_p (scalar_mode mode)
18188 {
18189 return (mode == HFmode
18190 ? true
18191 : default_scalar_mode_supported_p (mode));
18192 }
18193
18194 /* Set the value of FLT_EVAL_METHOD.
18195 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18196
18197 0: evaluate all operations and constants, whose semantic type has at
18198 most the range and precision of type float, to the range and
18199 precision of float; evaluate all other operations and constants to
18200 the range and precision of the semantic type;
18201
18202 N, where _FloatN is a supported interchange floating type
18203 evaluate all operations and constants, whose semantic type has at
18204 most the range and precision of _FloatN type, to the range and
18205 precision of the _FloatN type; evaluate all other operations and
18206 constants to the range and precision of the semantic type;
18207
18208 If we have the ARMv8.2-A extensions then we support _Float16 in native
18209 precision, so we should set this to 16. Otherwise, we support the type,
18210 but want to evaluate expressions in float precision, so set this to
18211 0. */
18212
18213 static enum flt_eval_method
18214 aarch64_excess_precision (enum excess_precision_type type)
18215 {
18216 switch (type)
18217 {
18218 case EXCESS_PRECISION_TYPE_FAST:
18219 case EXCESS_PRECISION_TYPE_STANDARD:
18220 /* We can calculate either in 16-bit range and precision or
18221 32-bit range and precision. Make that decision based on whether
18222 we have native support for the ARMv8.2-A 16-bit floating-point
18223 instructions or not. */
18224 return (TARGET_FP_F16INST
18225 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18226 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18227 case EXCESS_PRECISION_TYPE_IMPLICIT:
18228 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18229 default:
18230 gcc_unreachable ();
18231 }
18232 return FLT_EVAL_METHOD_UNPREDICTABLE;
18233 }
18234
18235 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18236 scheduled for speculative execution. Reject the long-running division
18237 and square-root instructions. */
18238
18239 static bool
18240 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18241 {
18242 switch (get_attr_type (insn))
18243 {
18244 case TYPE_SDIV:
18245 case TYPE_UDIV:
18246 case TYPE_FDIVS:
18247 case TYPE_FDIVD:
18248 case TYPE_FSQRTS:
18249 case TYPE_FSQRTD:
18250 case TYPE_NEON_FP_SQRT_S:
18251 case TYPE_NEON_FP_SQRT_D:
18252 case TYPE_NEON_FP_SQRT_S_Q:
18253 case TYPE_NEON_FP_SQRT_D_Q:
18254 case TYPE_NEON_FP_DIV_S:
18255 case TYPE_NEON_FP_DIV_D:
18256 case TYPE_NEON_FP_DIV_S_Q:
18257 case TYPE_NEON_FP_DIV_D_Q:
18258 return false;
18259 default:
18260 return true;
18261 }
18262 }
18263
18264 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18265
18266 static int
18267 aarch64_compute_pressure_classes (reg_class *classes)
18268 {
18269 int i = 0;
18270 classes[i++] = GENERAL_REGS;
18271 classes[i++] = FP_REGS;
18272 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18273 registers need to go in PR_LO_REGS at some point during their
18274 lifetime. Splitting it into two halves has the effect of making
18275 all predicates count against PR_LO_REGS, so that we try whenever
18276 possible to restrict the number of live predicates to 8. This
18277 greatly reduces the amount of spilling in certain loops. */
18278 classes[i++] = PR_LO_REGS;
18279 classes[i++] = PR_HI_REGS;
18280 return i;
18281 }
18282
18283 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18284
18285 static bool
18286 aarch64_can_change_mode_class (machine_mode from,
18287 machine_mode to, reg_class_t)
18288 {
18289 if (BYTES_BIG_ENDIAN)
18290 {
18291 bool from_sve_p = aarch64_sve_data_mode_p (from);
18292 bool to_sve_p = aarch64_sve_data_mode_p (to);
18293
18294 /* Don't allow changes between SVE data modes and non-SVE modes.
18295 See the comment at the head of aarch64-sve.md for details. */
18296 if (from_sve_p != to_sve_p)
18297 return false;
18298
18299 /* Don't allow changes in element size: lane 0 of the new vector
18300 would not then be lane 0 of the old vector. See the comment
18301 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18302 description.
18303
18304 In the worst case, this forces a register to be spilled in
18305 one mode and reloaded in the other, which handles the
18306 endianness correctly. */
18307 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18308 return false;
18309 }
18310 return true;
18311 }
18312
18313 /* Implement TARGET_EARLY_REMAT_MODES. */
18314
18315 static void
18316 aarch64_select_early_remat_modes (sbitmap modes)
18317 {
18318 /* SVE values are not normally live across a call, so it should be
18319 worth doing early rematerialization even in VL-specific mode. */
18320 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18321 {
18322 machine_mode mode = (machine_mode) i;
18323 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18324 if (vec_flags & VEC_ANY_SVE)
18325 bitmap_set_bit (modes, i);
18326 }
18327 }
18328
18329 /* Override the default target speculation_safe_value. */
18330 static rtx
18331 aarch64_speculation_safe_value (machine_mode mode,
18332 rtx result, rtx val, rtx failval)
18333 {
18334 /* Maybe we should warn if falling back to hard barriers. They are
18335 likely to be noticably more expensive than the alternative below. */
18336 if (!aarch64_track_speculation)
18337 return default_speculation_safe_value (mode, result, val, failval);
18338
18339 if (!REG_P (val))
18340 val = copy_to_mode_reg (mode, val);
18341
18342 if (!aarch64_reg_or_zero (failval, mode))
18343 failval = copy_to_mode_reg (mode, failval);
18344
18345 emit_insn (gen_despeculate_copy (mode, result, val, failval));
18346 return result;
18347 }
18348
18349 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18350 Look into the tuning structure for an estimate.
18351 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18352 Advanced SIMD 128 bits. */
18353
18354 static HOST_WIDE_INT
18355 aarch64_estimated_poly_value (poly_int64 val)
18356 {
18357 enum aarch64_sve_vector_bits_enum width_source
18358 = aarch64_tune_params.sve_width;
18359
18360 /* If we still don't have an estimate, use the default. */
18361 if (width_source == SVE_SCALABLE)
18362 return default_estimated_poly_value (val);
18363
18364 HOST_WIDE_INT over_128 = width_source - 128;
18365 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18366 }
18367
18368 /* Target-specific selftests. */
18369
18370 #if CHECKING_P
18371
18372 namespace selftest {
18373
18374 /* Selftest for the RTL loader.
18375 Verify that the RTL loader copes with a dump from
18376 print_rtx_function. This is essentially just a test that class
18377 function_reader can handle a real dump, but it also verifies
18378 that lookup_reg_by_dump_name correctly handles hard regs.
18379 The presence of hard reg names in the dump means that the test is
18380 target-specific, hence it is in this file. */
18381
18382 static void
18383 aarch64_test_loading_full_dump ()
18384 {
18385 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
18386
18387 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
18388
18389 rtx_insn *insn_1 = get_insn_by_uid (1);
18390 ASSERT_EQ (NOTE, GET_CODE (insn_1));
18391
18392 rtx_insn *insn_15 = get_insn_by_uid (15);
18393 ASSERT_EQ (INSN, GET_CODE (insn_15));
18394 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
18395
18396 /* Verify crtl->return_rtx. */
18397 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
18398 ASSERT_EQ (0, REGNO (crtl->return_rtx));
18399 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
18400 }
18401
18402 /* Run all target-specific selftests. */
18403
18404 static void
18405 aarch64_run_selftests (void)
18406 {
18407 aarch64_test_loading_full_dump ();
18408 }
18409
18410 } // namespace selftest
18411
18412 #endif /* #if CHECKING_P */
18413
18414 #undef TARGET_ADDRESS_COST
18415 #define TARGET_ADDRESS_COST aarch64_address_cost
18416
18417 /* This hook will determines whether unnamed bitfields affect the alignment
18418 of the containing structure. The hook returns true if the structure
18419 should inherit the alignment requirements of an unnamed bitfield's
18420 type. */
18421 #undef TARGET_ALIGN_ANON_BITFIELD
18422 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
18423
18424 #undef TARGET_ASM_ALIGNED_DI_OP
18425 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
18426
18427 #undef TARGET_ASM_ALIGNED_HI_OP
18428 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
18429
18430 #undef TARGET_ASM_ALIGNED_SI_OP
18431 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
18432
18433 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
18434 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
18435 hook_bool_const_tree_hwi_hwi_const_tree_true
18436
18437 #undef TARGET_ASM_FILE_START
18438 #define TARGET_ASM_FILE_START aarch64_start_file
18439
18440 #undef TARGET_ASM_OUTPUT_MI_THUNK
18441 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
18442
18443 #undef TARGET_ASM_SELECT_RTX_SECTION
18444 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
18445
18446 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
18447 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
18448
18449 #undef TARGET_BUILD_BUILTIN_VA_LIST
18450 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
18451
18452 #undef TARGET_CALLEE_COPIES
18453 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
18454
18455 #undef TARGET_CAN_ELIMINATE
18456 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
18457
18458 #undef TARGET_CAN_INLINE_P
18459 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
18460
18461 #undef TARGET_CANNOT_FORCE_CONST_MEM
18462 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
18463
18464 #undef TARGET_CASE_VALUES_THRESHOLD
18465 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
18466
18467 #undef TARGET_CONDITIONAL_REGISTER_USAGE
18468 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
18469
18470 /* Only the least significant bit is used for initialization guard
18471 variables. */
18472 #undef TARGET_CXX_GUARD_MASK_BIT
18473 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
18474
18475 #undef TARGET_C_MODE_FOR_SUFFIX
18476 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
18477
18478 #ifdef TARGET_BIG_ENDIAN_DEFAULT
18479 #undef TARGET_DEFAULT_TARGET_FLAGS
18480 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
18481 #endif
18482
18483 #undef TARGET_CLASS_MAX_NREGS
18484 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
18485
18486 #undef TARGET_BUILTIN_DECL
18487 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
18488
18489 #undef TARGET_BUILTIN_RECIPROCAL
18490 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
18491
18492 #undef TARGET_C_EXCESS_PRECISION
18493 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
18494
18495 #undef TARGET_EXPAND_BUILTIN
18496 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
18497
18498 #undef TARGET_EXPAND_BUILTIN_VA_START
18499 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
18500
18501 #undef TARGET_FOLD_BUILTIN
18502 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
18503
18504 #undef TARGET_FUNCTION_ARG
18505 #define TARGET_FUNCTION_ARG aarch64_function_arg
18506
18507 #undef TARGET_FUNCTION_ARG_ADVANCE
18508 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
18509
18510 #undef TARGET_FUNCTION_ARG_BOUNDARY
18511 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
18512
18513 #undef TARGET_FUNCTION_ARG_PADDING
18514 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
18515
18516 #undef TARGET_GET_RAW_RESULT_MODE
18517 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
18518 #undef TARGET_GET_RAW_ARG_MODE
18519 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
18520
18521 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
18522 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
18523
18524 #undef TARGET_FUNCTION_VALUE
18525 #define TARGET_FUNCTION_VALUE aarch64_function_value
18526
18527 #undef TARGET_FUNCTION_VALUE_REGNO_P
18528 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
18529
18530 #undef TARGET_GIMPLE_FOLD_BUILTIN
18531 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
18532
18533 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
18534 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
18535
18536 #undef TARGET_INIT_BUILTINS
18537 #define TARGET_INIT_BUILTINS aarch64_init_builtins
18538
18539 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
18540 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
18541 aarch64_ira_change_pseudo_allocno_class
18542
18543 #undef TARGET_LEGITIMATE_ADDRESS_P
18544 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
18545
18546 #undef TARGET_LEGITIMATE_CONSTANT_P
18547 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
18548
18549 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
18550 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
18551 aarch64_legitimize_address_displacement
18552
18553 #undef TARGET_LIBGCC_CMP_RETURN_MODE
18554 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
18555
18556 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
18557 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
18558 aarch64_libgcc_floating_mode_supported_p
18559
18560 #undef TARGET_MANGLE_TYPE
18561 #define TARGET_MANGLE_TYPE aarch64_mangle_type
18562
18563 #undef TARGET_MEMORY_MOVE_COST
18564 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
18565
18566 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
18567 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
18568
18569 #undef TARGET_MUST_PASS_IN_STACK
18570 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
18571
18572 /* This target hook should return true if accesses to volatile bitfields
18573 should use the narrowest mode possible. It should return false if these
18574 accesses should use the bitfield container type. */
18575 #undef TARGET_NARROW_VOLATILE_BITFIELD
18576 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
18577
18578 #undef TARGET_OPTION_OVERRIDE
18579 #define TARGET_OPTION_OVERRIDE aarch64_override_options
18580
18581 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
18582 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
18583 aarch64_override_options_after_change
18584
18585 #undef TARGET_OPTION_SAVE
18586 #define TARGET_OPTION_SAVE aarch64_option_save
18587
18588 #undef TARGET_OPTION_RESTORE
18589 #define TARGET_OPTION_RESTORE aarch64_option_restore
18590
18591 #undef TARGET_OPTION_PRINT
18592 #define TARGET_OPTION_PRINT aarch64_option_print
18593
18594 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
18595 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
18596
18597 #undef TARGET_SET_CURRENT_FUNCTION
18598 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
18599
18600 #undef TARGET_PASS_BY_REFERENCE
18601 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
18602
18603 #undef TARGET_PREFERRED_RELOAD_CLASS
18604 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
18605
18606 #undef TARGET_SCHED_REASSOCIATION_WIDTH
18607 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
18608
18609 #undef TARGET_PROMOTED_TYPE
18610 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
18611
18612 #undef TARGET_SECONDARY_RELOAD
18613 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
18614
18615 #undef TARGET_SHIFT_TRUNCATION_MASK
18616 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
18617
18618 #undef TARGET_SETUP_INCOMING_VARARGS
18619 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
18620
18621 #undef TARGET_STRUCT_VALUE_RTX
18622 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
18623
18624 #undef TARGET_REGISTER_MOVE_COST
18625 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
18626
18627 #undef TARGET_RETURN_IN_MEMORY
18628 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
18629
18630 #undef TARGET_RETURN_IN_MSB
18631 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
18632
18633 #undef TARGET_RTX_COSTS
18634 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
18635
18636 #undef TARGET_SCALAR_MODE_SUPPORTED_P
18637 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
18638
18639 #undef TARGET_SCHED_ISSUE_RATE
18640 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
18641
18642 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
18643 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
18644 aarch64_sched_first_cycle_multipass_dfa_lookahead
18645
18646 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
18647 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
18648 aarch64_first_cycle_multipass_dfa_lookahead_guard
18649
18650 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
18651 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
18652 aarch64_get_separate_components
18653
18654 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
18655 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
18656 aarch64_components_for_bb
18657
18658 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
18659 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
18660 aarch64_disqualify_components
18661
18662 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
18663 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
18664 aarch64_emit_prologue_components
18665
18666 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
18667 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
18668 aarch64_emit_epilogue_components
18669
18670 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
18671 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
18672 aarch64_set_handled_components
18673
18674 #undef TARGET_TRAMPOLINE_INIT
18675 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
18676
18677 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
18678 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
18679
18680 #undef TARGET_VECTOR_MODE_SUPPORTED_P
18681 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
18682
18683 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
18684 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
18685 aarch64_builtin_support_vector_misalignment
18686
18687 #undef TARGET_ARRAY_MODE
18688 #define TARGET_ARRAY_MODE aarch64_array_mode
18689
18690 #undef TARGET_ARRAY_MODE_SUPPORTED_P
18691 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
18692
18693 #undef TARGET_VECTORIZE_ADD_STMT_COST
18694 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
18695
18696 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
18697 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
18698 aarch64_builtin_vectorization_cost
18699
18700 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
18701 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
18702
18703 #undef TARGET_VECTORIZE_BUILTINS
18704 #define TARGET_VECTORIZE_BUILTINS
18705
18706 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
18707 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
18708 aarch64_builtin_vectorized_function
18709
18710 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
18711 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
18712 aarch64_autovectorize_vector_sizes
18713
18714 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
18715 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
18716 aarch64_atomic_assign_expand_fenv
18717
18718 /* Section anchor support. */
18719
18720 #undef TARGET_MIN_ANCHOR_OFFSET
18721 #define TARGET_MIN_ANCHOR_OFFSET -256
18722
18723 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
18724 byte offset; we can do much more for larger data types, but have no way
18725 to determine the size of the access. We assume accesses are aligned. */
18726 #undef TARGET_MAX_ANCHOR_OFFSET
18727 #define TARGET_MAX_ANCHOR_OFFSET 4095
18728
18729 #undef TARGET_VECTOR_ALIGNMENT
18730 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
18731
18732 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
18733 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
18734 aarch64_vectorize_preferred_vector_alignment
18735 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
18736 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
18737 aarch64_simd_vector_alignment_reachable
18738
18739 /* vec_perm support. */
18740
18741 #undef TARGET_VECTORIZE_VEC_PERM_CONST
18742 #define TARGET_VECTORIZE_VEC_PERM_CONST \
18743 aarch64_vectorize_vec_perm_const
18744
18745 #undef TARGET_VECTORIZE_GET_MASK_MODE
18746 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
18747 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
18748 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
18749 aarch64_empty_mask_is_expensive
18750 #undef TARGET_PREFERRED_ELSE_VALUE
18751 #define TARGET_PREFERRED_ELSE_VALUE \
18752 aarch64_preferred_else_value
18753
18754 #undef TARGET_INIT_LIBFUNCS
18755 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
18756
18757 #undef TARGET_FIXED_CONDITION_CODE_REGS
18758 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
18759
18760 #undef TARGET_FLAGS_REGNUM
18761 #define TARGET_FLAGS_REGNUM CC_REGNUM
18762
18763 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
18764 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
18765
18766 #undef TARGET_ASAN_SHADOW_OFFSET
18767 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18768
18769 #undef TARGET_LEGITIMIZE_ADDRESS
18770 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18771
18772 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18773 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18774
18775 #undef TARGET_CAN_USE_DOLOOP_P
18776 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18777
18778 #undef TARGET_SCHED_ADJUST_PRIORITY
18779 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18780
18781 #undef TARGET_SCHED_MACRO_FUSION_P
18782 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18783
18784 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18785 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18786
18787 #undef TARGET_SCHED_FUSION_PRIORITY
18788 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18789
18790 #undef TARGET_UNSPEC_MAY_TRAP_P
18791 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18792
18793 #undef TARGET_USE_PSEUDO_PIC_REG
18794 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18795
18796 #undef TARGET_PRINT_OPERAND
18797 #define TARGET_PRINT_OPERAND aarch64_print_operand
18798
18799 #undef TARGET_PRINT_OPERAND_ADDRESS
18800 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18801
18802 #undef TARGET_OPTAB_SUPPORTED_P
18803 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18804
18805 #undef TARGET_OMIT_STRUCT_RETURN_REG
18806 #define TARGET_OMIT_STRUCT_RETURN_REG true
18807
18808 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18809 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18810 aarch64_dwarf_poly_indeterminate_value
18811
18812 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
18813 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18814 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18815
18816 #undef TARGET_HARD_REGNO_NREGS
18817 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18818 #undef TARGET_HARD_REGNO_MODE_OK
18819 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18820
18821 #undef TARGET_MODES_TIEABLE_P
18822 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18823
18824 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18825 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18826 aarch64_hard_regno_call_part_clobbered
18827
18828 #undef TARGET_CONSTANT_ALIGNMENT
18829 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18830
18831 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
18832 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
18833 aarch64_stack_clash_protection_alloca_probe_range
18834
18835 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18836 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18837
18838 #undef TARGET_CAN_CHANGE_MODE_CLASS
18839 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18840
18841 #undef TARGET_SELECT_EARLY_REMAT_MODES
18842 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18843
18844 #undef TARGET_SPECULATION_SAFE_VALUE
18845 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
18846
18847 #undef TARGET_ESTIMATED_POLY_VALUE
18848 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
18849
18850 #undef TARGET_ATTRIBUTE_TABLE
18851 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
18852
18853 #if CHECKING_P
18854 #undef TARGET_RUN_TARGET_SELFTESTS
18855 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18856 #endif /* #if CHECKING_P */
18857
18858 struct gcc_target targetm = TARGET_INITIALIZER;
18859
18860 #include "gt-aarch64.h"