]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/config/aarch64/aarch64.c
aarch64.c (aarch64_simd_call_p): New function.
[thirdparty/gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80
81 /* Information about a legitimate vector immediate operand. */
82 struct simd_immediate_info
83 {
84 enum insn_type { MOV, MVN };
85 enum modifier_type { LSL, MSL };
86
87 simd_immediate_info () {}
88 simd_immediate_info (scalar_float_mode, rtx);
89 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
90 insn_type = MOV, modifier_type = LSL,
91 unsigned int = 0);
92 simd_immediate_info (scalar_mode, rtx, rtx);
93
94 /* The mode of the elements. */
95 scalar_mode elt_mode;
96
97 /* The value of each element if all elements are the same, or the
98 first value if the constant is a series. */
99 rtx value;
100
101 /* The value of the step if the constant is a series, null otherwise. */
102 rtx step;
103
104 /* The instruction to use to move the immediate into a vector. */
105 insn_type insn;
106
107 /* The kind of shift modifier to use, and the number of bits to shift.
108 This is (LSL, 0) if no shift is needed. */
109 modifier_type modifier;
110 unsigned int shift;
111 };
112
113 /* Construct a floating-point immediate in which each element has mode
114 ELT_MODE_IN and value VALUE_IN. */
115 inline simd_immediate_info
116 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
117 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
118 modifier (LSL), shift (0)
119 {}
120
121 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
122 and value VALUE_IN. The other parameters are as for the structure
123 fields. */
124 inline simd_immediate_info
125 ::simd_immediate_info (scalar_int_mode elt_mode_in,
126 unsigned HOST_WIDE_INT value_in,
127 insn_type insn_in, modifier_type modifier_in,
128 unsigned int shift_in)
129 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
130 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
131 {}
132
133 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
134 and where element I is equal to VALUE_IN + I * STEP_IN. */
135 inline simd_immediate_info
136 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
137 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
138 modifier (LSL), shift (0)
139 {}
140
141 /* The current code model. */
142 enum aarch64_code_model aarch64_cmodel;
143
144 /* The number of 64-bit elements in an SVE vector. */
145 poly_uint16 aarch64_sve_vg;
146
147 #ifdef HAVE_AS_TLS
148 #undef TARGET_HAVE_TLS
149 #define TARGET_HAVE_TLS 1
150 #endif
151
152 static bool aarch64_composite_type_p (const_tree, machine_mode);
153 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
154 const_tree,
155 machine_mode *, int *,
156 bool *);
157 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
158 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
159 static void aarch64_override_options_after_change (void);
160 static bool aarch64_vector_mode_supported_p (machine_mode);
161 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
162 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
163 const_tree type,
164 int misalignment,
165 bool is_packed);
166 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
167 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
168 aarch64_addr_query_type);
169 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
170
171 /* Major revision number of the ARM Architecture implemented by the target. */
172 unsigned aarch64_architecture_version;
173
174 /* The processor for which instructions should be scheduled. */
175 enum aarch64_processor aarch64_tune = cortexa53;
176
177 /* Mask to specify which instruction scheduling options should be used. */
178 unsigned long aarch64_tune_flags = 0;
179
180 /* Global flag for PC relative loads. */
181 bool aarch64_pcrelative_literal_loads;
182
183 /* Global flag for whether frame pointer is enabled. */
184 bool aarch64_use_frame_pointer;
185
186 #define BRANCH_PROTECT_STR_MAX 255
187 char *accepted_branch_protection_string = NULL;
188
189 static enum aarch64_parse_opt_result
190 aarch64_parse_branch_protection (const char*, char**);
191
192 /* Support for command line parsing of boolean flags in the tuning
193 structures. */
194 struct aarch64_flag_desc
195 {
196 const char* name;
197 unsigned int flag;
198 };
199
200 #define AARCH64_FUSION_PAIR(name, internal_name) \
201 { name, AARCH64_FUSE_##internal_name },
202 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
203 {
204 { "none", AARCH64_FUSE_NOTHING },
205 #include "aarch64-fusion-pairs.def"
206 { "all", AARCH64_FUSE_ALL },
207 { NULL, AARCH64_FUSE_NOTHING }
208 };
209
210 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
211 { name, AARCH64_EXTRA_TUNE_##internal_name },
212 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
213 {
214 { "none", AARCH64_EXTRA_TUNE_NONE },
215 #include "aarch64-tuning-flags.def"
216 { "all", AARCH64_EXTRA_TUNE_ALL },
217 { NULL, AARCH64_EXTRA_TUNE_NONE }
218 };
219
220 /* Tuning parameters. */
221
222 static const struct cpu_addrcost_table generic_addrcost_table =
223 {
224 {
225 1, /* hi */
226 0, /* si */
227 0, /* di */
228 1, /* ti */
229 },
230 0, /* pre_modify */
231 0, /* post_modify */
232 0, /* register_offset */
233 0, /* register_sextend */
234 0, /* register_zextend */
235 0 /* imm_offset */
236 };
237
238 static const struct cpu_addrcost_table exynosm1_addrcost_table =
239 {
240 {
241 0, /* hi */
242 0, /* si */
243 0, /* di */
244 2, /* ti */
245 },
246 0, /* pre_modify */
247 0, /* post_modify */
248 1, /* register_offset */
249 1, /* register_sextend */
250 2, /* register_zextend */
251 0, /* imm_offset */
252 };
253
254 static const struct cpu_addrcost_table xgene1_addrcost_table =
255 {
256 {
257 1, /* hi */
258 0, /* si */
259 0, /* di */
260 1, /* ti */
261 },
262 1, /* pre_modify */
263 1, /* post_modify */
264 0, /* register_offset */
265 1, /* register_sextend */
266 1, /* register_zextend */
267 0, /* imm_offset */
268 };
269
270 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
271 {
272 {
273 1, /* hi */
274 1, /* si */
275 1, /* di */
276 2, /* ti */
277 },
278 0, /* pre_modify */
279 0, /* post_modify */
280 2, /* register_offset */
281 3, /* register_sextend */
282 3, /* register_zextend */
283 0, /* imm_offset */
284 };
285
286 static const struct cpu_addrcost_table tsv110_addrcost_table =
287 {
288 {
289 1, /* hi */
290 0, /* si */
291 0, /* di */
292 1, /* ti */
293 },
294 0, /* pre_modify */
295 0, /* post_modify */
296 0, /* register_offset */
297 1, /* register_sextend */
298 1, /* register_zextend */
299 0, /* imm_offset */
300 };
301
302 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
303 {
304 {
305 1, /* hi */
306 1, /* si */
307 1, /* di */
308 2, /* ti */
309 },
310 1, /* pre_modify */
311 1, /* post_modify */
312 3, /* register_offset */
313 3, /* register_sextend */
314 3, /* register_zextend */
315 2, /* imm_offset */
316 };
317
318 static const struct cpu_regmove_cost generic_regmove_cost =
319 {
320 1, /* GP2GP */
321 /* Avoid the use of slow int<->fp moves for spilling by setting
322 their cost higher than memmov_cost. */
323 5, /* GP2FP */
324 5, /* FP2GP */
325 2 /* FP2FP */
326 };
327
328 static const struct cpu_regmove_cost cortexa57_regmove_cost =
329 {
330 1, /* GP2GP */
331 /* Avoid the use of slow int<->fp moves for spilling by setting
332 their cost higher than memmov_cost. */
333 5, /* GP2FP */
334 5, /* FP2GP */
335 2 /* FP2FP */
336 };
337
338 static const struct cpu_regmove_cost cortexa53_regmove_cost =
339 {
340 1, /* GP2GP */
341 /* Avoid the use of slow int<->fp moves for spilling by setting
342 their cost higher than memmov_cost. */
343 5, /* GP2FP */
344 5, /* FP2GP */
345 2 /* FP2FP */
346 };
347
348 static const struct cpu_regmove_cost exynosm1_regmove_cost =
349 {
350 1, /* GP2GP */
351 /* Avoid the use of slow int<->fp moves for spilling by setting
352 their cost higher than memmov_cost (actual, 4 and 9). */
353 9, /* GP2FP */
354 9, /* FP2GP */
355 1 /* FP2FP */
356 };
357
358 static const struct cpu_regmove_cost thunderx_regmove_cost =
359 {
360 2, /* GP2GP */
361 2, /* GP2FP */
362 6, /* FP2GP */
363 4 /* FP2FP */
364 };
365
366 static const struct cpu_regmove_cost xgene1_regmove_cost =
367 {
368 1, /* GP2GP */
369 /* Avoid the use of slow int<->fp moves for spilling by setting
370 their cost higher than memmov_cost. */
371 8, /* GP2FP */
372 8, /* FP2GP */
373 2 /* FP2FP */
374 };
375
376 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
377 {
378 2, /* GP2GP */
379 /* Avoid the use of int<->fp moves for spilling. */
380 6, /* GP2FP */
381 6, /* FP2GP */
382 4 /* FP2FP */
383 };
384
385 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
386 {
387 1, /* GP2GP */
388 /* Avoid the use of int<->fp moves for spilling. */
389 8, /* GP2FP */
390 8, /* FP2GP */
391 4 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost tsv110_regmove_cost =
395 {
396 1, /* GP2GP */
397 /* Avoid the use of slow int<->fp moves for spilling by setting
398 their cost higher than memmov_cost. */
399 2, /* GP2FP */
400 3, /* FP2GP */
401 2 /* FP2FP */
402 };
403
404 /* Generic costs for vector insn classes. */
405 static const struct cpu_vector_cost generic_vector_cost =
406 {
407 1, /* scalar_int_stmt_cost */
408 1, /* scalar_fp_stmt_cost */
409 1, /* scalar_load_cost */
410 1, /* scalar_store_cost */
411 1, /* vec_int_stmt_cost */
412 1, /* vec_fp_stmt_cost */
413 2, /* vec_permute_cost */
414 1, /* vec_to_scalar_cost */
415 1, /* scalar_to_vec_cost */
416 1, /* vec_align_load_cost */
417 1, /* vec_unalign_load_cost */
418 1, /* vec_unalign_store_cost */
419 1, /* vec_store_cost */
420 3, /* cond_taken_branch_cost */
421 1 /* cond_not_taken_branch_cost */
422 };
423
424 /* QDF24XX costs for vector insn classes. */
425 static const struct cpu_vector_cost qdf24xx_vector_cost =
426 {
427 1, /* scalar_int_stmt_cost */
428 1, /* scalar_fp_stmt_cost */
429 1, /* scalar_load_cost */
430 1, /* scalar_store_cost */
431 1, /* vec_int_stmt_cost */
432 3, /* vec_fp_stmt_cost */
433 2, /* vec_permute_cost */
434 1, /* vec_to_scalar_cost */
435 1, /* scalar_to_vec_cost */
436 1, /* vec_align_load_cost */
437 1, /* vec_unalign_load_cost */
438 1, /* vec_unalign_store_cost */
439 1, /* vec_store_cost */
440 3, /* cond_taken_branch_cost */
441 1 /* cond_not_taken_branch_cost */
442 };
443
444 /* ThunderX costs for vector insn classes. */
445 static const struct cpu_vector_cost thunderx_vector_cost =
446 {
447 1, /* scalar_int_stmt_cost */
448 1, /* scalar_fp_stmt_cost */
449 3, /* scalar_load_cost */
450 1, /* scalar_store_cost */
451 4, /* vec_int_stmt_cost */
452 1, /* vec_fp_stmt_cost */
453 4, /* vec_permute_cost */
454 2, /* vec_to_scalar_cost */
455 2, /* scalar_to_vec_cost */
456 3, /* vec_align_load_cost */
457 5, /* vec_unalign_load_cost */
458 5, /* vec_unalign_store_cost */
459 1, /* vec_store_cost */
460 3, /* cond_taken_branch_cost */
461 3 /* cond_not_taken_branch_cost */
462 };
463
464 static const struct cpu_vector_cost tsv110_vector_cost =
465 {
466 1, /* scalar_int_stmt_cost */
467 1, /* scalar_fp_stmt_cost */
468 5, /* scalar_load_cost */
469 1, /* scalar_store_cost */
470 2, /* vec_int_stmt_cost */
471 2, /* vec_fp_stmt_cost */
472 2, /* vec_permute_cost */
473 3, /* vec_to_scalar_cost */
474 2, /* scalar_to_vec_cost */
475 5, /* vec_align_load_cost */
476 5, /* vec_unalign_load_cost */
477 1, /* vec_unalign_store_cost */
478 1, /* vec_store_cost */
479 1, /* cond_taken_branch_cost */
480 1 /* cond_not_taken_branch_cost */
481 };
482
483 /* Generic costs for vector insn classes. */
484 static const struct cpu_vector_cost cortexa57_vector_cost =
485 {
486 1, /* scalar_int_stmt_cost */
487 1, /* scalar_fp_stmt_cost */
488 4, /* scalar_load_cost */
489 1, /* scalar_store_cost */
490 2, /* vec_int_stmt_cost */
491 2, /* vec_fp_stmt_cost */
492 3, /* vec_permute_cost */
493 8, /* vec_to_scalar_cost */
494 8, /* scalar_to_vec_cost */
495 4, /* vec_align_load_cost */
496 4, /* vec_unalign_load_cost */
497 1, /* vec_unalign_store_cost */
498 1, /* vec_store_cost */
499 1, /* cond_taken_branch_cost */
500 1 /* cond_not_taken_branch_cost */
501 };
502
503 static const struct cpu_vector_cost exynosm1_vector_cost =
504 {
505 1, /* scalar_int_stmt_cost */
506 1, /* scalar_fp_stmt_cost */
507 5, /* scalar_load_cost */
508 1, /* scalar_store_cost */
509 3, /* vec_int_stmt_cost */
510 3, /* vec_fp_stmt_cost */
511 3, /* vec_permute_cost */
512 3, /* vec_to_scalar_cost */
513 3, /* scalar_to_vec_cost */
514 5, /* vec_align_load_cost */
515 5, /* vec_unalign_load_cost */
516 1, /* vec_unalign_store_cost */
517 1, /* vec_store_cost */
518 1, /* cond_taken_branch_cost */
519 1 /* cond_not_taken_branch_cost */
520 };
521
522 /* Generic costs for vector insn classes. */
523 static const struct cpu_vector_cost xgene1_vector_cost =
524 {
525 1, /* scalar_int_stmt_cost */
526 1, /* scalar_fp_stmt_cost */
527 5, /* scalar_load_cost */
528 1, /* scalar_store_cost */
529 2, /* vec_int_stmt_cost */
530 2, /* vec_fp_stmt_cost */
531 2, /* vec_permute_cost */
532 4, /* vec_to_scalar_cost */
533 4, /* scalar_to_vec_cost */
534 10, /* vec_align_load_cost */
535 10, /* vec_unalign_load_cost */
536 2, /* vec_unalign_store_cost */
537 2, /* vec_store_cost */
538 2, /* cond_taken_branch_cost */
539 1 /* cond_not_taken_branch_cost */
540 };
541
542 /* Costs for vector insn classes for Vulcan. */
543 static const struct cpu_vector_cost thunderx2t99_vector_cost =
544 {
545 1, /* scalar_int_stmt_cost */
546 6, /* scalar_fp_stmt_cost */
547 4, /* scalar_load_cost */
548 1, /* scalar_store_cost */
549 5, /* vec_int_stmt_cost */
550 6, /* vec_fp_stmt_cost */
551 3, /* vec_permute_cost */
552 6, /* vec_to_scalar_cost */
553 5, /* scalar_to_vec_cost */
554 8, /* vec_align_load_cost */
555 8, /* vec_unalign_load_cost */
556 4, /* vec_unalign_store_cost */
557 4, /* vec_store_cost */
558 2, /* cond_taken_branch_cost */
559 1 /* cond_not_taken_branch_cost */
560 };
561
562 /* Generic costs for branch instructions. */
563 static const struct cpu_branch_cost generic_branch_cost =
564 {
565 1, /* Predictable. */
566 3 /* Unpredictable. */
567 };
568
569 /* Generic approximation modes. */
570 static const cpu_approx_modes generic_approx_modes =
571 {
572 AARCH64_APPROX_NONE, /* division */
573 AARCH64_APPROX_NONE, /* sqrt */
574 AARCH64_APPROX_NONE /* recip_sqrt */
575 };
576
577 /* Approximation modes for Exynos M1. */
578 static const cpu_approx_modes exynosm1_approx_modes =
579 {
580 AARCH64_APPROX_NONE, /* division */
581 AARCH64_APPROX_ALL, /* sqrt */
582 AARCH64_APPROX_ALL /* recip_sqrt */
583 };
584
585 /* Approximation modes for X-Gene 1. */
586 static const cpu_approx_modes xgene1_approx_modes =
587 {
588 AARCH64_APPROX_NONE, /* division */
589 AARCH64_APPROX_NONE, /* sqrt */
590 AARCH64_APPROX_ALL /* recip_sqrt */
591 };
592
593 /* Generic prefetch settings (which disable prefetch). */
594 static const cpu_prefetch_tune generic_prefetch_tune =
595 {
596 0, /* num_slots */
597 -1, /* l1_cache_size */
598 -1, /* l1_cache_line_size */
599 -1, /* l2_cache_size */
600 true, /* prefetch_dynamic_strides */
601 -1, /* minimum_stride */
602 -1 /* default_opt_level */
603 };
604
605 static const cpu_prefetch_tune exynosm1_prefetch_tune =
606 {
607 0, /* num_slots */
608 -1, /* l1_cache_size */
609 64, /* l1_cache_line_size */
610 -1, /* l2_cache_size */
611 true, /* prefetch_dynamic_strides */
612 -1, /* minimum_stride */
613 -1 /* default_opt_level */
614 };
615
616 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
617 {
618 4, /* num_slots */
619 32, /* l1_cache_size */
620 64, /* l1_cache_line_size */
621 512, /* l2_cache_size */
622 false, /* prefetch_dynamic_strides */
623 2048, /* minimum_stride */
624 3 /* default_opt_level */
625 };
626
627 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
628 {
629 8, /* num_slots */
630 32, /* l1_cache_size */
631 128, /* l1_cache_line_size */
632 16*1024, /* l2_cache_size */
633 true, /* prefetch_dynamic_strides */
634 -1, /* minimum_stride */
635 3 /* default_opt_level */
636 };
637
638 static const cpu_prefetch_tune thunderx_prefetch_tune =
639 {
640 8, /* num_slots */
641 32, /* l1_cache_size */
642 128, /* l1_cache_line_size */
643 -1, /* l2_cache_size */
644 true, /* prefetch_dynamic_strides */
645 -1, /* minimum_stride */
646 -1 /* default_opt_level */
647 };
648
649 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
650 {
651 8, /* num_slots */
652 32, /* l1_cache_size */
653 64, /* l1_cache_line_size */
654 256, /* l2_cache_size */
655 true, /* prefetch_dynamic_strides */
656 -1, /* minimum_stride */
657 -1 /* default_opt_level */
658 };
659
660 static const cpu_prefetch_tune tsv110_prefetch_tune =
661 {
662 0, /* num_slots */
663 64, /* l1_cache_size */
664 64, /* l1_cache_line_size */
665 512, /* l2_cache_size */
666 true, /* prefetch_dynamic_strides */
667 -1, /* minimum_stride */
668 -1 /* default_opt_level */
669 };
670
671 static const cpu_prefetch_tune xgene1_prefetch_tune =
672 {
673 8, /* num_slots */
674 32, /* l1_cache_size */
675 64, /* l1_cache_line_size */
676 256, /* l2_cache_size */
677 true, /* prefetch_dynamic_strides */
678 -1, /* minimum_stride */
679 -1 /* default_opt_level */
680 };
681
682 static const struct tune_params generic_tunings =
683 {
684 &cortexa57_extra_costs,
685 &generic_addrcost_table,
686 &generic_regmove_cost,
687 &generic_vector_cost,
688 &generic_branch_cost,
689 &generic_approx_modes,
690 SVE_NOT_IMPLEMENTED, /* sve_width */
691 4, /* memmov_cost */
692 2, /* issue_rate */
693 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
694 "8", /* function_align. */
695 "4", /* jump_align. */
696 "8", /* loop_align. */
697 2, /* int_reassoc_width. */
698 4, /* fp_reassoc_width. */
699 1, /* vec_reassoc_width. */
700 2, /* min_div_recip_mul_sf. */
701 2, /* min_div_recip_mul_df. */
702 0, /* max_case_values. */
703 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
704 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
705 &generic_prefetch_tune
706 };
707
708 static const struct tune_params cortexa35_tunings =
709 {
710 &cortexa53_extra_costs,
711 &generic_addrcost_table,
712 &cortexa53_regmove_cost,
713 &generic_vector_cost,
714 &generic_branch_cost,
715 &generic_approx_modes,
716 SVE_NOT_IMPLEMENTED, /* sve_width */
717 4, /* memmov_cost */
718 1, /* issue_rate */
719 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
720 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
721 "16", /* function_align. */
722 "4", /* jump_align. */
723 "8", /* loop_align. */
724 2, /* int_reassoc_width. */
725 4, /* fp_reassoc_width. */
726 1, /* vec_reassoc_width. */
727 2, /* min_div_recip_mul_sf. */
728 2, /* min_div_recip_mul_df. */
729 0, /* max_case_values. */
730 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
731 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
732 &generic_prefetch_tune
733 };
734
735 static const struct tune_params cortexa53_tunings =
736 {
737 &cortexa53_extra_costs,
738 &generic_addrcost_table,
739 &cortexa53_regmove_cost,
740 &generic_vector_cost,
741 &generic_branch_cost,
742 &generic_approx_modes,
743 SVE_NOT_IMPLEMENTED, /* sve_width */
744 4, /* memmov_cost */
745 2, /* issue_rate */
746 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
747 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
748 "16", /* function_align. */
749 "4", /* jump_align. */
750 "8", /* loop_align. */
751 2, /* int_reassoc_width. */
752 4, /* fp_reassoc_width. */
753 1, /* vec_reassoc_width. */
754 2, /* min_div_recip_mul_sf. */
755 2, /* min_div_recip_mul_df. */
756 0, /* max_case_values. */
757 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
758 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
759 &generic_prefetch_tune
760 };
761
762 static const struct tune_params cortexa57_tunings =
763 {
764 &cortexa57_extra_costs,
765 &generic_addrcost_table,
766 &cortexa57_regmove_cost,
767 &cortexa57_vector_cost,
768 &generic_branch_cost,
769 &generic_approx_modes,
770 SVE_NOT_IMPLEMENTED, /* sve_width */
771 4, /* memmov_cost */
772 3, /* issue_rate */
773 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
774 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
775 "16", /* function_align. */
776 "4", /* jump_align. */
777 "8", /* loop_align. */
778 2, /* int_reassoc_width. */
779 4, /* fp_reassoc_width. */
780 1, /* vec_reassoc_width. */
781 2, /* min_div_recip_mul_sf. */
782 2, /* min_div_recip_mul_df. */
783 0, /* max_case_values. */
784 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
785 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
786 &generic_prefetch_tune
787 };
788
789 static const struct tune_params cortexa72_tunings =
790 {
791 &cortexa57_extra_costs,
792 &generic_addrcost_table,
793 &cortexa57_regmove_cost,
794 &cortexa57_vector_cost,
795 &generic_branch_cost,
796 &generic_approx_modes,
797 SVE_NOT_IMPLEMENTED, /* sve_width */
798 4, /* memmov_cost */
799 3, /* issue_rate */
800 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
801 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
802 "16", /* function_align. */
803 "4", /* jump_align. */
804 "8", /* loop_align. */
805 2, /* int_reassoc_width. */
806 4, /* fp_reassoc_width. */
807 1, /* vec_reassoc_width. */
808 2, /* min_div_recip_mul_sf. */
809 2, /* min_div_recip_mul_df. */
810 0, /* max_case_values. */
811 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
812 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
813 &generic_prefetch_tune
814 };
815
816 static const struct tune_params cortexa73_tunings =
817 {
818 &cortexa57_extra_costs,
819 &generic_addrcost_table,
820 &cortexa57_regmove_cost,
821 &cortexa57_vector_cost,
822 &generic_branch_cost,
823 &generic_approx_modes,
824 SVE_NOT_IMPLEMENTED, /* sve_width */
825 4, /* memmov_cost. */
826 2, /* issue_rate. */
827 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
828 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
829 "16", /* function_align. */
830 "4", /* jump_align. */
831 "8", /* loop_align. */
832 2, /* int_reassoc_width. */
833 4, /* fp_reassoc_width. */
834 1, /* vec_reassoc_width. */
835 2, /* min_div_recip_mul_sf. */
836 2, /* min_div_recip_mul_df. */
837 0, /* max_case_values. */
838 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
839 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
840 &generic_prefetch_tune
841 };
842
843
844
845 static const struct tune_params exynosm1_tunings =
846 {
847 &exynosm1_extra_costs,
848 &exynosm1_addrcost_table,
849 &exynosm1_regmove_cost,
850 &exynosm1_vector_cost,
851 &generic_branch_cost,
852 &exynosm1_approx_modes,
853 SVE_NOT_IMPLEMENTED, /* sve_width */
854 4, /* memmov_cost */
855 3, /* issue_rate */
856 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
857 "4", /* function_align. */
858 "4", /* jump_align. */
859 "4", /* loop_align. */
860 2, /* int_reassoc_width. */
861 4, /* fp_reassoc_width. */
862 1, /* vec_reassoc_width. */
863 2, /* min_div_recip_mul_sf. */
864 2, /* min_div_recip_mul_df. */
865 48, /* max_case_values. */
866 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
867 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
868 &exynosm1_prefetch_tune
869 };
870
871 static const struct tune_params thunderxt88_tunings =
872 {
873 &thunderx_extra_costs,
874 &generic_addrcost_table,
875 &thunderx_regmove_cost,
876 &thunderx_vector_cost,
877 &generic_branch_cost,
878 &generic_approx_modes,
879 SVE_NOT_IMPLEMENTED, /* sve_width */
880 6, /* memmov_cost */
881 2, /* issue_rate */
882 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
883 "8", /* function_align. */
884 "8", /* jump_align. */
885 "8", /* loop_align. */
886 2, /* int_reassoc_width. */
887 4, /* fp_reassoc_width. */
888 1, /* vec_reassoc_width. */
889 2, /* min_div_recip_mul_sf. */
890 2, /* min_div_recip_mul_df. */
891 0, /* max_case_values. */
892 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
893 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
894 &thunderxt88_prefetch_tune
895 };
896
897 static const struct tune_params thunderx_tunings =
898 {
899 &thunderx_extra_costs,
900 &generic_addrcost_table,
901 &thunderx_regmove_cost,
902 &thunderx_vector_cost,
903 &generic_branch_cost,
904 &generic_approx_modes,
905 SVE_NOT_IMPLEMENTED, /* sve_width */
906 6, /* memmov_cost */
907 2, /* issue_rate */
908 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
909 "8", /* function_align. */
910 "8", /* jump_align. */
911 "8", /* loop_align. */
912 2, /* int_reassoc_width. */
913 4, /* fp_reassoc_width. */
914 1, /* vec_reassoc_width. */
915 2, /* min_div_recip_mul_sf. */
916 2, /* min_div_recip_mul_df. */
917 0, /* max_case_values. */
918 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
919 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
920 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
921 &thunderx_prefetch_tune
922 };
923
924 static const struct tune_params tsv110_tunings =
925 {
926 &tsv110_extra_costs,
927 &tsv110_addrcost_table,
928 &tsv110_regmove_cost,
929 &tsv110_vector_cost,
930 &generic_branch_cost,
931 &generic_approx_modes,
932 SVE_NOT_IMPLEMENTED, /* sve_width */
933 4, /* memmov_cost */
934 4, /* issue_rate */
935 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
936 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
937 "16", /* function_align. */
938 "4", /* jump_align. */
939 "8", /* loop_align. */
940 2, /* int_reassoc_width. */
941 4, /* fp_reassoc_width. */
942 1, /* vec_reassoc_width. */
943 2, /* min_div_recip_mul_sf. */
944 2, /* min_div_recip_mul_df. */
945 0, /* max_case_values. */
946 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
947 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
948 &tsv110_prefetch_tune
949 };
950
951 static const struct tune_params xgene1_tunings =
952 {
953 &xgene1_extra_costs,
954 &xgene1_addrcost_table,
955 &xgene1_regmove_cost,
956 &xgene1_vector_cost,
957 &generic_branch_cost,
958 &xgene1_approx_modes,
959 SVE_NOT_IMPLEMENTED, /* sve_width */
960 6, /* memmov_cost */
961 4, /* issue_rate */
962 AARCH64_FUSE_NOTHING, /* fusible_ops */
963 "16", /* function_align. */
964 "16", /* jump_align. */
965 "16", /* loop_align. */
966 2, /* int_reassoc_width. */
967 4, /* fp_reassoc_width. */
968 1, /* vec_reassoc_width. */
969 2, /* min_div_recip_mul_sf. */
970 2, /* min_div_recip_mul_df. */
971 17, /* max_case_values. */
972 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
973 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
974 &xgene1_prefetch_tune
975 };
976
977 static const struct tune_params emag_tunings =
978 {
979 &xgene1_extra_costs,
980 &xgene1_addrcost_table,
981 &xgene1_regmove_cost,
982 &xgene1_vector_cost,
983 &generic_branch_cost,
984 &xgene1_approx_modes,
985 SVE_NOT_IMPLEMENTED,
986 6, /* memmov_cost */
987 4, /* issue_rate */
988 AARCH64_FUSE_NOTHING, /* fusible_ops */
989 "16", /* function_align. */
990 "16", /* jump_align. */
991 "16", /* loop_align. */
992 2, /* int_reassoc_width. */
993 4, /* fp_reassoc_width. */
994 1, /* vec_reassoc_width. */
995 2, /* min_div_recip_mul_sf. */
996 2, /* min_div_recip_mul_df. */
997 17, /* max_case_values. */
998 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
999 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1000 &xgene1_prefetch_tune
1001 };
1002
1003 static const struct tune_params qdf24xx_tunings =
1004 {
1005 &qdf24xx_extra_costs,
1006 &qdf24xx_addrcost_table,
1007 &qdf24xx_regmove_cost,
1008 &qdf24xx_vector_cost,
1009 &generic_branch_cost,
1010 &generic_approx_modes,
1011 SVE_NOT_IMPLEMENTED, /* sve_width */
1012 4, /* memmov_cost */
1013 4, /* issue_rate */
1014 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1015 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
1016 "16", /* function_align. */
1017 "8", /* jump_align. */
1018 "16", /* loop_align. */
1019 2, /* int_reassoc_width. */
1020 4, /* fp_reassoc_width. */
1021 1, /* vec_reassoc_width. */
1022 2, /* min_div_recip_mul_sf. */
1023 2, /* min_div_recip_mul_df. */
1024 0, /* max_case_values. */
1025 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1026 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1027 &qdf24xx_prefetch_tune
1028 };
1029
1030 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1031 for now. */
1032 static const struct tune_params saphira_tunings =
1033 {
1034 &generic_extra_costs,
1035 &generic_addrcost_table,
1036 &generic_regmove_cost,
1037 &generic_vector_cost,
1038 &generic_branch_cost,
1039 &generic_approx_modes,
1040 SVE_NOT_IMPLEMENTED, /* sve_width */
1041 4, /* memmov_cost */
1042 4, /* issue_rate */
1043 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1044 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
1045 "16", /* function_align. */
1046 "8", /* jump_align. */
1047 "16", /* loop_align. */
1048 2, /* int_reassoc_width. */
1049 4, /* fp_reassoc_width. */
1050 1, /* vec_reassoc_width. */
1051 2, /* min_div_recip_mul_sf. */
1052 2, /* min_div_recip_mul_df. */
1053 0, /* max_case_values. */
1054 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1055 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1056 &generic_prefetch_tune
1057 };
1058
1059 static const struct tune_params thunderx2t99_tunings =
1060 {
1061 &thunderx2t99_extra_costs,
1062 &thunderx2t99_addrcost_table,
1063 &thunderx2t99_regmove_cost,
1064 &thunderx2t99_vector_cost,
1065 &generic_branch_cost,
1066 &generic_approx_modes,
1067 SVE_NOT_IMPLEMENTED, /* sve_width */
1068 4, /* memmov_cost. */
1069 4, /* issue_rate. */
1070 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1071 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1072 "16", /* function_align. */
1073 "8", /* jump_align. */
1074 "16", /* loop_align. */
1075 3, /* int_reassoc_width. */
1076 2, /* fp_reassoc_width. */
1077 2, /* vec_reassoc_width. */
1078 2, /* min_div_recip_mul_sf. */
1079 2, /* min_div_recip_mul_df. */
1080 0, /* max_case_values. */
1081 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1082 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1083 &thunderx2t99_prefetch_tune
1084 };
1085
1086 /* Support for fine-grained override of the tuning structures. */
1087 struct aarch64_tuning_override_function
1088 {
1089 const char* name;
1090 void (*parse_override)(const char*, struct tune_params*);
1091 };
1092
1093 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1094 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1095 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1096
1097 static const struct aarch64_tuning_override_function
1098 aarch64_tuning_override_functions[] =
1099 {
1100 { "fuse", aarch64_parse_fuse_string },
1101 { "tune", aarch64_parse_tune_string },
1102 { "sve_width", aarch64_parse_sve_width_string },
1103 { NULL, NULL }
1104 };
1105
1106 /* A processor implementing AArch64. */
1107 struct processor
1108 {
1109 const char *const name;
1110 enum aarch64_processor ident;
1111 enum aarch64_processor sched_core;
1112 enum aarch64_arch arch;
1113 unsigned architecture_version;
1114 const unsigned long flags;
1115 const struct tune_params *const tune;
1116 };
1117
1118 /* Architectures implementing AArch64. */
1119 static const struct processor all_architectures[] =
1120 {
1121 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1122 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1123 #include "aarch64-arches.def"
1124 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1125 };
1126
1127 /* Processor cores implementing AArch64. */
1128 static const struct processor all_cores[] =
1129 {
1130 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1131 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1132 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1133 FLAGS, &COSTS##_tunings},
1134 #include "aarch64-cores.def"
1135 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1136 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1137 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1138 };
1139
1140
1141 /* Target specification. These are populated by the -march, -mtune, -mcpu
1142 handling code or by target attributes. */
1143 static const struct processor *selected_arch;
1144 static const struct processor *selected_cpu;
1145 static const struct processor *selected_tune;
1146
1147 /* The current tuning set. */
1148 struct tune_params aarch64_tune_params = generic_tunings;
1149
1150 /* Table of machine attributes. */
1151 static const struct attribute_spec aarch64_attribute_table[] =
1152 {
1153 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1154 affects_type_identity, handler, exclude } */
1155 { "aarch64_vector_pcs", 0, 0, false, true, true, false, NULL, NULL },
1156 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1157 };
1158
1159 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1160
1161 /* An ISA extension in the co-processor and main instruction set space. */
1162 struct aarch64_option_extension
1163 {
1164 const char *const name;
1165 const unsigned long flags_on;
1166 const unsigned long flags_off;
1167 };
1168
1169 typedef enum aarch64_cond_code
1170 {
1171 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1172 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1173 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1174 }
1175 aarch64_cc;
1176
1177 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1178
1179 struct aarch64_branch_protect_type
1180 {
1181 /* The type's name that the user passes to the branch-protection option
1182 string. */
1183 const char* name;
1184 /* Function to handle the protection type and set global variables.
1185 First argument is the string token corresponding with this type and the
1186 second argument is the next token in the option string.
1187 Return values:
1188 * AARCH64_PARSE_OK: Handling was sucessful.
1189 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1190 should print an error.
1191 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1192 own error. */
1193 enum aarch64_parse_opt_result (*handler)(char*, char*);
1194 /* A list of types that can follow this type in the option string. */
1195 const aarch64_branch_protect_type* subtypes;
1196 unsigned int num_subtypes;
1197 };
1198
1199 static enum aarch64_parse_opt_result
1200 aarch64_handle_no_branch_protection (char* str, char* rest)
1201 {
1202 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1203 aarch64_enable_bti = 0;
1204 if (rest)
1205 {
1206 error ("unexpected %<%s%> after %<%s%>", rest, str);
1207 return AARCH64_PARSE_INVALID_FEATURE;
1208 }
1209 return AARCH64_PARSE_OK;
1210 }
1211
1212 static enum aarch64_parse_opt_result
1213 aarch64_handle_standard_branch_protection (char* str, char* rest)
1214 {
1215 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1216 aarch64_enable_bti = 1;
1217 if (rest)
1218 {
1219 error ("unexpected %<%s%> after %<%s%>", rest, str);
1220 return AARCH64_PARSE_INVALID_FEATURE;
1221 }
1222 return AARCH64_PARSE_OK;
1223 }
1224
1225 static enum aarch64_parse_opt_result
1226 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1227 char* rest ATTRIBUTE_UNUSED)
1228 {
1229 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1230 return AARCH64_PARSE_OK;
1231 }
1232
1233 static enum aarch64_parse_opt_result
1234 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1235 char* rest ATTRIBUTE_UNUSED)
1236 {
1237 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1238 return AARCH64_PARSE_OK;
1239 }
1240
1241 static enum aarch64_parse_opt_result
1242 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1243 char* rest ATTRIBUTE_UNUSED)
1244 {
1245 aarch64_enable_bti = 1;
1246 return AARCH64_PARSE_OK;
1247 }
1248
1249 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1250 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1251 { NULL, NULL, NULL, 0 }
1252 };
1253
1254 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1255 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1256 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1257 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1258 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1259 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1260 { NULL, NULL, NULL, 0 }
1261 };
1262
1263 /* The condition codes of the processor, and the inverse function. */
1264 static const char * const aarch64_condition_codes[] =
1265 {
1266 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1267 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1268 };
1269
1270 /* Generate code to enable conditional branches in functions over 1 MiB. */
1271 const char *
1272 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1273 const char * branch_format)
1274 {
1275 rtx_code_label * tmp_label = gen_label_rtx ();
1276 char label_buf[256];
1277 char buffer[128];
1278 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1279 CODE_LABEL_NUMBER (tmp_label));
1280 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1281 rtx dest_label = operands[pos_label];
1282 operands[pos_label] = tmp_label;
1283
1284 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1285 output_asm_insn (buffer, operands);
1286
1287 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1288 operands[pos_label] = dest_label;
1289 output_asm_insn (buffer, operands);
1290 return "";
1291 }
1292
1293 void
1294 aarch64_err_no_fpadvsimd (machine_mode mode)
1295 {
1296 if (TARGET_GENERAL_REGS_ONLY)
1297 if (FLOAT_MODE_P (mode))
1298 error ("%qs is incompatible with the use of floating-point types",
1299 "-mgeneral-regs-only");
1300 else
1301 error ("%qs is incompatible with the use of vector types",
1302 "-mgeneral-regs-only");
1303 else
1304 if (FLOAT_MODE_P (mode))
1305 error ("%qs feature modifier is incompatible with the use of"
1306 " floating-point types", "+nofp");
1307 else
1308 error ("%qs feature modifier is incompatible with the use of"
1309 " vector types", "+nofp");
1310 }
1311
1312 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1313 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1314 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1315 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1316 and GENERAL_REGS is lower than the memory cost (in this case the best class
1317 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1318 cost results in bad allocations with many redundant int<->FP moves which
1319 are expensive on various cores.
1320 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1321 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1322 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1323 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1324 The result of this is that it is no longer inefficient to have a higher
1325 memory move cost than the register move cost.
1326 */
1327
1328 static reg_class_t
1329 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1330 reg_class_t best_class)
1331 {
1332 machine_mode mode;
1333
1334 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1335 || !reg_class_subset_p (FP_REGS, allocno_class))
1336 return allocno_class;
1337
1338 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1339 || !reg_class_subset_p (FP_REGS, best_class))
1340 return best_class;
1341
1342 mode = PSEUDO_REGNO_MODE (regno);
1343 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1344 }
1345
1346 static unsigned int
1347 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1348 {
1349 if (GET_MODE_UNIT_SIZE (mode) == 4)
1350 return aarch64_tune_params.min_div_recip_mul_sf;
1351 return aarch64_tune_params.min_div_recip_mul_df;
1352 }
1353
1354 /* Return the reassociation width of treeop OPC with mode MODE. */
1355 static int
1356 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1357 {
1358 if (VECTOR_MODE_P (mode))
1359 return aarch64_tune_params.vec_reassoc_width;
1360 if (INTEGRAL_MODE_P (mode))
1361 return aarch64_tune_params.int_reassoc_width;
1362 /* Avoid reassociating floating point addition so we emit more FMAs. */
1363 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1364 return aarch64_tune_params.fp_reassoc_width;
1365 return 1;
1366 }
1367
1368 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1369 unsigned
1370 aarch64_dbx_register_number (unsigned regno)
1371 {
1372 if (GP_REGNUM_P (regno))
1373 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1374 else if (regno == SP_REGNUM)
1375 return AARCH64_DWARF_SP;
1376 else if (FP_REGNUM_P (regno))
1377 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1378 else if (PR_REGNUM_P (regno))
1379 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1380 else if (regno == VG_REGNUM)
1381 return AARCH64_DWARF_VG;
1382
1383 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1384 equivalent DWARF register. */
1385 return DWARF_FRAME_REGISTERS;
1386 }
1387
1388 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1389 static bool
1390 aarch64_advsimd_struct_mode_p (machine_mode mode)
1391 {
1392 return (TARGET_SIMD
1393 && (mode == OImode || mode == CImode || mode == XImode));
1394 }
1395
1396 /* Return true if MODE is an SVE predicate mode. */
1397 static bool
1398 aarch64_sve_pred_mode_p (machine_mode mode)
1399 {
1400 return (TARGET_SVE
1401 && (mode == VNx16BImode
1402 || mode == VNx8BImode
1403 || mode == VNx4BImode
1404 || mode == VNx2BImode));
1405 }
1406
1407 /* Three mutually-exclusive flags describing a vector or predicate type. */
1408 const unsigned int VEC_ADVSIMD = 1;
1409 const unsigned int VEC_SVE_DATA = 2;
1410 const unsigned int VEC_SVE_PRED = 4;
1411 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1412 a structure of 2, 3 or 4 vectors. */
1413 const unsigned int VEC_STRUCT = 8;
1414 /* Useful combinations of the above. */
1415 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1416 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1417
1418 /* Return a set of flags describing the vector properties of mode MODE.
1419 Ignore modes that are not supported by the current target. */
1420 static unsigned int
1421 aarch64_classify_vector_mode (machine_mode mode)
1422 {
1423 if (aarch64_advsimd_struct_mode_p (mode))
1424 return VEC_ADVSIMD | VEC_STRUCT;
1425
1426 if (aarch64_sve_pred_mode_p (mode))
1427 return VEC_SVE_PRED;
1428
1429 scalar_mode inner = GET_MODE_INNER (mode);
1430 if (VECTOR_MODE_P (mode)
1431 && (inner == QImode
1432 || inner == HImode
1433 || inner == HFmode
1434 || inner == SImode
1435 || inner == SFmode
1436 || inner == DImode
1437 || inner == DFmode))
1438 {
1439 if (TARGET_SVE)
1440 {
1441 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1442 return VEC_SVE_DATA;
1443 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1444 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1445 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1446 return VEC_SVE_DATA | VEC_STRUCT;
1447 }
1448
1449 /* This includes V1DF but not V1DI (which doesn't exist). */
1450 if (TARGET_SIMD
1451 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1452 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1453 return VEC_ADVSIMD;
1454 }
1455
1456 return 0;
1457 }
1458
1459 /* Return true if MODE is any of the data vector modes, including
1460 structure modes. */
1461 static bool
1462 aarch64_vector_data_mode_p (machine_mode mode)
1463 {
1464 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1465 }
1466
1467 /* Return true if MODE is an SVE data vector mode; either a single vector
1468 or a structure of vectors. */
1469 static bool
1470 aarch64_sve_data_mode_p (machine_mode mode)
1471 {
1472 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1473 }
1474
1475 /* Implement target hook TARGET_ARRAY_MODE. */
1476 static opt_machine_mode
1477 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1478 {
1479 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1480 && IN_RANGE (nelems, 2, 4))
1481 return mode_for_vector (GET_MODE_INNER (mode),
1482 GET_MODE_NUNITS (mode) * nelems);
1483
1484 return opt_machine_mode ();
1485 }
1486
1487 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1488 static bool
1489 aarch64_array_mode_supported_p (machine_mode mode,
1490 unsigned HOST_WIDE_INT nelems)
1491 {
1492 if (TARGET_SIMD
1493 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1494 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1495 && (nelems >= 2 && nelems <= 4))
1496 return true;
1497
1498 return false;
1499 }
1500
1501 /* Return the SVE predicate mode to use for elements that have
1502 ELEM_NBYTES bytes, if such a mode exists. */
1503
1504 opt_machine_mode
1505 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1506 {
1507 if (TARGET_SVE)
1508 {
1509 if (elem_nbytes == 1)
1510 return VNx16BImode;
1511 if (elem_nbytes == 2)
1512 return VNx8BImode;
1513 if (elem_nbytes == 4)
1514 return VNx4BImode;
1515 if (elem_nbytes == 8)
1516 return VNx2BImode;
1517 }
1518 return opt_machine_mode ();
1519 }
1520
1521 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1522
1523 static opt_machine_mode
1524 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1525 {
1526 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1527 {
1528 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1529 machine_mode pred_mode;
1530 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1531 return pred_mode;
1532 }
1533
1534 return default_get_mask_mode (nunits, nbytes);
1535 }
1536
1537 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1538 prefer to use the first arithmetic operand as the else value if
1539 the else value doesn't matter, since that exactly matches the SVE
1540 destructive merging form. For ternary operations we could either
1541 pick the first operand and use FMAD-like instructions or the last
1542 operand and use FMLA-like instructions; the latter seems more
1543 natural. */
1544
1545 static tree
1546 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1547 {
1548 return nops == 3 ? ops[2] : ops[0];
1549 }
1550
1551 /* Implement TARGET_HARD_REGNO_NREGS. */
1552
1553 static unsigned int
1554 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1555 {
1556 /* ??? Logically we should only need to provide a value when
1557 HARD_REGNO_MODE_OK says that the combination is valid,
1558 but at the moment we need to handle all modes. Just ignore
1559 any runtime parts for registers that can't store them. */
1560 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1561 switch (aarch64_regno_regclass (regno))
1562 {
1563 case FP_REGS:
1564 case FP_LO_REGS:
1565 if (aarch64_sve_data_mode_p (mode))
1566 return exact_div (GET_MODE_SIZE (mode),
1567 BYTES_PER_SVE_VECTOR).to_constant ();
1568 return CEIL (lowest_size, UNITS_PER_VREG);
1569 case PR_REGS:
1570 case PR_LO_REGS:
1571 case PR_HI_REGS:
1572 return 1;
1573 default:
1574 return CEIL (lowest_size, UNITS_PER_WORD);
1575 }
1576 gcc_unreachable ();
1577 }
1578
1579 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1580
1581 static bool
1582 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1583 {
1584 if (GET_MODE_CLASS (mode) == MODE_CC)
1585 return regno == CC_REGNUM;
1586
1587 if (regno == VG_REGNUM)
1588 /* This must have the same size as _Unwind_Word. */
1589 return mode == DImode;
1590
1591 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1592 if (vec_flags & VEC_SVE_PRED)
1593 return PR_REGNUM_P (regno);
1594
1595 if (PR_REGNUM_P (regno))
1596 return 0;
1597
1598 if (regno == SP_REGNUM)
1599 /* The purpose of comparing with ptr_mode is to support the
1600 global register variable associated with the stack pointer
1601 register via the syntax of asm ("wsp") in ILP32. */
1602 return mode == Pmode || mode == ptr_mode;
1603
1604 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1605 return mode == Pmode;
1606
1607 if (GP_REGNUM_P (regno))
1608 {
1609 if (known_le (GET_MODE_SIZE (mode), 8))
1610 return true;
1611 else if (known_le (GET_MODE_SIZE (mode), 16))
1612 return (regno & 1) == 0;
1613 }
1614 else if (FP_REGNUM_P (regno))
1615 {
1616 if (vec_flags & VEC_STRUCT)
1617 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1618 else
1619 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1620 }
1621
1622 return false;
1623 }
1624
1625 /* Return true if this is a definition of a vectorized simd function. */
1626
1627 static bool
1628 aarch64_simd_decl_p (tree fndecl)
1629 {
1630 tree fntype;
1631
1632 if (fndecl == NULL)
1633 return false;
1634 fntype = TREE_TYPE (fndecl);
1635 if (fntype == NULL)
1636 return false;
1637
1638 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1639 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1640 return true;
1641
1642 return false;
1643 }
1644
1645 /* Return the mode a register save/restore should use. DImode for integer
1646 registers, DFmode for FP registers in non-SIMD functions (they only save
1647 the bottom half of a 128 bit register), or TFmode for FP registers in
1648 SIMD functions. */
1649
1650 static machine_mode
1651 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1652 {
1653 return GP_REGNUM_P (regno)
1654 ? E_DImode
1655 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1656 }
1657
1658 /* Return true if the instruction is a call to a SIMD function, false
1659 if it is not a SIMD function or if we do not know anything about
1660 the function. */
1661
1662 static bool
1663 aarch64_simd_call_p (rtx_insn *insn)
1664 {
1665 rtx symbol;
1666 rtx call;
1667 tree fndecl;
1668
1669 gcc_assert (CALL_P (insn));
1670 call = get_call_rtx_from (insn);
1671 symbol = XEXP (XEXP (call, 0), 0);
1672 if (GET_CODE (symbol) != SYMBOL_REF)
1673 return false;
1674 fndecl = SYMBOL_REF_DECL (symbol);
1675 if (!fndecl)
1676 return false;
1677
1678 return aarch64_simd_decl_p (fndecl);
1679 }
1680
1681 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1682 a function that uses the SIMD ABI, take advantage of the extra
1683 call-preserved registers that the ABI provides. */
1684
1685 void
1686 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1687 HARD_REG_SET *return_set)
1688 {
1689 if (aarch64_simd_call_p (insn))
1690 {
1691 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1692 if (FP_SIMD_SAVED_REGNUM_P (regno))
1693 CLEAR_HARD_REG_BIT (*return_set, regno);
1694 }
1695 }
1696
1697 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1698 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1699 clobbers the top 64 bits when restoring the bottom 64 bits. */
1700
1701 static bool
1702 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1703 {
1704 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1705 }
1706
1707 /* Implement REGMODE_NATURAL_SIZE. */
1708 poly_uint64
1709 aarch64_regmode_natural_size (machine_mode mode)
1710 {
1711 /* The natural size for SVE data modes is one SVE data vector,
1712 and similarly for predicates. We can't independently modify
1713 anything smaller than that. */
1714 /* ??? For now, only do this for variable-width SVE registers.
1715 Doing it for constant-sized registers breaks lower-subreg.c. */
1716 /* ??? And once that's fixed, we should probably have similar
1717 code for Advanced SIMD. */
1718 if (!aarch64_sve_vg.is_constant ())
1719 {
1720 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1721 if (vec_flags & VEC_SVE_PRED)
1722 return BYTES_PER_SVE_PRED;
1723 if (vec_flags & VEC_SVE_DATA)
1724 return BYTES_PER_SVE_VECTOR;
1725 }
1726 return UNITS_PER_WORD;
1727 }
1728
1729 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1730 machine_mode
1731 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1732 machine_mode mode)
1733 {
1734 /* The predicate mode determines which bits are significant and
1735 which are "don't care". Decreasing the number of lanes would
1736 lose data while increasing the number of lanes would make bits
1737 unnecessarily significant. */
1738 if (PR_REGNUM_P (regno))
1739 return mode;
1740 if (known_ge (GET_MODE_SIZE (mode), 4))
1741 return mode;
1742 else
1743 return SImode;
1744 }
1745
1746 /* Return true if I's bits are consecutive ones from the MSB. */
1747 bool
1748 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1749 {
1750 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1751 }
1752
1753 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1754 that strcpy from constants will be faster. */
1755
1756 static HOST_WIDE_INT
1757 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1758 {
1759 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1760 return MAX (align, BITS_PER_WORD);
1761 return align;
1762 }
1763
1764 /* Return true if calls to DECL should be treated as
1765 long-calls (ie called via a register). */
1766 static bool
1767 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1768 {
1769 return false;
1770 }
1771
1772 /* Return true if calls to symbol-ref SYM should be treated as
1773 long-calls (ie called via a register). */
1774 bool
1775 aarch64_is_long_call_p (rtx sym)
1776 {
1777 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1778 }
1779
1780 /* Return true if calls to symbol-ref SYM should not go through
1781 plt stubs. */
1782
1783 bool
1784 aarch64_is_noplt_call_p (rtx sym)
1785 {
1786 const_tree decl = SYMBOL_REF_DECL (sym);
1787
1788 if (flag_pic
1789 && decl
1790 && (!flag_plt
1791 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1792 && !targetm.binds_local_p (decl))
1793 return true;
1794
1795 return false;
1796 }
1797
1798 /* Return true if the offsets to a zero/sign-extract operation
1799 represent an expression that matches an extend operation. The
1800 operands represent the paramters from
1801
1802 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1803 bool
1804 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1805 rtx extract_imm)
1806 {
1807 HOST_WIDE_INT mult_val, extract_val;
1808
1809 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1810 return false;
1811
1812 mult_val = INTVAL (mult_imm);
1813 extract_val = INTVAL (extract_imm);
1814
1815 if (extract_val > 8
1816 && extract_val < GET_MODE_BITSIZE (mode)
1817 && exact_log2 (extract_val & ~7) > 0
1818 && (extract_val & 7) <= 4
1819 && mult_val == (1 << (extract_val & 7)))
1820 return true;
1821
1822 return false;
1823 }
1824
1825 /* Emit an insn that's a simple single-set. Both the operands must be
1826 known to be valid. */
1827 inline static rtx_insn *
1828 emit_set_insn (rtx x, rtx y)
1829 {
1830 return emit_insn (gen_rtx_SET (x, y));
1831 }
1832
1833 /* X and Y are two things to compare using CODE. Emit the compare insn and
1834 return the rtx for register 0 in the proper mode. */
1835 rtx
1836 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1837 {
1838 machine_mode mode = SELECT_CC_MODE (code, x, y);
1839 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1840
1841 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1842 return cc_reg;
1843 }
1844
1845 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1846
1847 static rtx
1848 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1849 machine_mode y_mode)
1850 {
1851 if (y_mode == E_QImode || y_mode == E_HImode)
1852 {
1853 if (CONST_INT_P (y))
1854 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1855 else
1856 {
1857 rtx t, cc_reg;
1858 machine_mode cc_mode;
1859
1860 t = gen_rtx_ZERO_EXTEND (SImode, y);
1861 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1862 cc_mode = CC_SWPmode;
1863 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1864 emit_set_insn (cc_reg, t);
1865 return cc_reg;
1866 }
1867 }
1868
1869 return aarch64_gen_compare_reg (code, x, y);
1870 }
1871
1872 /* Build the SYMBOL_REF for __tls_get_addr. */
1873
1874 static GTY(()) rtx tls_get_addr_libfunc;
1875
1876 rtx
1877 aarch64_tls_get_addr (void)
1878 {
1879 if (!tls_get_addr_libfunc)
1880 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1881 return tls_get_addr_libfunc;
1882 }
1883
1884 /* Return the TLS model to use for ADDR. */
1885
1886 static enum tls_model
1887 tls_symbolic_operand_type (rtx addr)
1888 {
1889 enum tls_model tls_kind = TLS_MODEL_NONE;
1890 if (GET_CODE (addr) == CONST)
1891 {
1892 poly_int64 addend;
1893 rtx sym = strip_offset (addr, &addend);
1894 if (GET_CODE (sym) == SYMBOL_REF)
1895 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1896 }
1897 else if (GET_CODE (addr) == SYMBOL_REF)
1898 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1899
1900 return tls_kind;
1901 }
1902
1903 /* We'll allow lo_sum's in addresses in our legitimate addresses
1904 so that combine would take care of combining addresses where
1905 necessary, but for generation purposes, we'll generate the address
1906 as :
1907 RTL Absolute
1908 tmp = hi (symbol_ref); adrp x1, foo
1909 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1910 nop
1911
1912 PIC TLS
1913 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1914 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1915 bl __tls_get_addr
1916 nop
1917
1918 Load TLS symbol, depending on TLS mechanism and TLS access model.
1919
1920 Global Dynamic - Traditional TLS:
1921 adrp tmp, :tlsgd:imm
1922 add dest, tmp, #:tlsgd_lo12:imm
1923 bl __tls_get_addr
1924
1925 Global Dynamic - TLS Descriptors:
1926 adrp dest, :tlsdesc:imm
1927 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1928 add dest, dest, #:tlsdesc_lo12:imm
1929 blr tmp
1930 mrs tp, tpidr_el0
1931 add dest, dest, tp
1932
1933 Initial Exec:
1934 mrs tp, tpidr_el0
1935 adrp tmp, :gottprel:imm
1936 ldr dest, [tmp, #:gottprel_lo12:imm]
1937 add dest, dest, tp
1938
1939 Local Exec:
1940 mrs tp, tpidr_el0
1941 add t0, tp, #:tprel_hi12:imm, lsl #12
1942 add t0, t0, #:tprel_lo12_nc:imm
1943 */
1944
1945 static void
1946 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1947 enum aarch64_symbol_type type)
1948 {
1949 switch (type)
1950 {
1951 case SYMBOL_SMALL_ABSOLUTE:
1952 {
1953 /* In ILP32, the mode of dest can be either SImode or DImode. */
1954 rtx tmp_reg = dest;
1955 machine_mode mode = GET_MODE (dest);
1956
1957 gcc_assert (mode == Pmode || mode == ptr_mode);
1958
1959 if (can_create_pseudo_p ())
1960 tmp_reg = gen_reg_rtx (mode);
1961
1962 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1963 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1964 return;
1965 }
1966
1967 case SYMBOL_TINY_ABSOLUTE:
1968 emit_insn (gen_rtx_SET (dest, imm));
1969 return;
1970
1971 case SYMBOL_SMALL_GOT_28K:
1972 {
1973 machine_mode mode = GET_MODE (dest);
1974 rtx gp_rtx = pic_offset_table_rtx;
1975 rtx insn;
1976 rtx mem;
1977
1978 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1979 here before rtl expand. Tree IVOPT will generate rtl pattern to
1980 decide rtx costs, in which case pic_offset_table_rtx is not
1981 initialized. For that case no need to generate the first adrp
1982 instruction as the final cost for global variable access is
1983 one instruction. */
1984 if (gp_rtx != NULL)
1985 {
1986 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1987 using the page base as GOT base, the first page may be wasted,
1988 in the worst scenario, there is only 28K space for GOT).
1989
1990 The generate instruction sequence for accessing global variable
1991 is:
1992
1993 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1994
1995 Only one instruction needed. But we must initialize
1996 pic_offset_table_rtx properly. We generate initialize insn for
1997 every global access, and allow CSE to remove all redundant.
1998
1999 The final instruction sequences will look like the following
2000 for multiply global variables access.
2001
2002 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2003
2004 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2005 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2006 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2007 ... */
2008
2009 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2010 crtl->uses_pic_offset_table = 1;
2011 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2012
2013 if (mode != GET_MODE (gp_rtx))
2014 gp_rtx = gen_lowpart (mode, gp_rtx);
2015
2016 }
2017
2018 if (mode == ptr_mode)
2019 {
2020 if (mode == DImode)
2021 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2022 else
2023 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2024
2025 mem = XVECEXP (SET_SRC (insn), 0, 0);
2026 }
2027 else
2028 {
2029 gcc_assert (mode == Pmode);
2030
2031 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2032 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2033 }
2034
2035 /* The operand is expected to be MEM. Whenever the related insn
2036 pattern changed, above code which calculate mem should be
2037 updated. */
2038 gcc_assert (GET_CODE (mem) == MEM);
2039 MEM_READONLY_P (mem) = 1;
2040 MEM_NOTRAP_P (mem) = 1;
2041 emit_insn (insn);
2042 return;
2043 }
2044
2045 case SYMBOL_SMALL_GOT_4G:
2046 {
2047 /* In ILP32, the mode of dest can be either SImode or DImode,
2048 while the got entry is always of SImode size. The mode of
2049 dest depends on how dest is used: if dest is assigned to a
2050 pointer (e.g. in the memory), it has SImode; it may have
2051 DImode if dest is dereferenced to access the memeory.
2052 This is why we have to handle three different ldr_got_small
2053 patterns here (two patterns for ILP32). */
2054
2055 rtx insn;
2056 rtx mem;
2057 rtx tmp_reg = dest;
2058 machine_mode mode = GET_MODE (dest);
2059
2060 if (can_create_pseudo_p ())
2061 tmp_reg = gen_reg_rtx (mode);
2062
2063 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2064 if (mode == ptr_mode)
2065 {
2066 if (mode == DImode)
2067 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2068 else
2069 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2070
2071 mem = XVECEXP (SET_SRC (insn), 0, 0);
2072 }
2073 else
2074 {
2075 gcc_assert (mode == Pmode);
2076
2077 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2078 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2079 }
2080
2081 gcc_assert (GET_CODE (mem) == MEM);
2082 MEM_READONLY_P (mem) = 1;
2083 MEM_NOTRAP_P (mem) = 1;
2084 emit_insn (insn);
2085 return;
2086 }
2087
2088 case SYMBOL_SMALL_TLSGD:
2089 {
2090 rtx_insn *insns;
2091 machine_mode mode = GET_MODE (dest);
2092 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2093
2094 start_sequence ();
2095 if (TARGET_ILP32)
2096 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2097 else
2098 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2099 insns = get_insns ();
2100 end_sequence ();
2101
2102 RTL_CONST_CALL_P (insns) = 1;
2103 emit_libcall_block (insns, dest, result, imm);
2104 return;
2105 }
2106
2107 case SYMBOL_SMALL_TLSDESC:
2108 {
2109 machine_mode mode = GET_MODE (dest);
2110 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2111 rtx tp;
2112
2113 gcc_assert (mode == Pmode || mode == ptr_mode);
2114
2115 /* In ILP32, the got entry is always of SImode size. Unlike
2116 small GOT, the dest is fixed at reg 0. */
2117 if (TARGET_ILP32)
2118 emit_insn (gen_tlsdesc_small_si (imm));
2119 else
2120 emit_insn (gen_tlsdesc_small_di (imm));
2121 tp = aarch64_load_tp (NULL);
2122
2123 if (mode != Pmode)
2124 tp = gen_lowpart (mode, tp);
2125
2126 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2127 if (REG_P (dest))
2128 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2129 return;
2130 }
2131
2132 case SYMBOL_SMALL_TLSIE:
2133 {
2134 /* In ILP32, the mode of dest can be either SImode or DImode,
2135 while the got entry is always of SImode size. The mode of
2136 dest depends on how dest is used: if dest is assigned to a
2137 pointer (e.g. in the memory), it has SImode; it may have
2138 DImode if dest is dereferenced to access the memeory.
2139 This is why we have to handle three different tlsie_small
2140 patterns here (two patterns for ILP32). */
2141 machine_mode mode = GET_MODE (dest);
2142 rtx tmp_reg = gen_reg_rtx (mode);
2143 rtx tp = aarch64_load_tp (NULL);
2144
2145 if (mode == ptr_mode)
2146 {
2147 if (mode == DImode)
2148 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2149 else
2150 {
2151 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2152 tp = gen_lowpart (mode, tp);
2153 }
2154 }
2155 else
2156 {
2157 gcc_assert (mode == Pmode);
2158 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2159 }
2160
2161 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2162 if (REG_P (dest))
2163 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2164 return;
2165 }
2166
2167 case SYMBOL_TLSLE12:
2168 case SYMBOL_TLSLE24:
2169 case SYMBOL_TLSLE32:
2170 case SYMBOL_TLSLE48:
2171 {
2172 machine_mode mode = GET_MODE (dest);
2173 rtx tp = aarch64_load_tp (NULL);
2174
2175 if (mode != Pmode)
2176 tp = gen_lowpart (mode, tp);
2177
2178 switch (type)
2179 {
2180 case SYMBOL_TLSLE12:
2181 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2182 (dest, tp, imm));
2183 break;
2184 case SYMBOL_TLSLE24:
2185 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2186 (dest, tp, imm));
2187 break;
2188 case SYMBOL_TLSLE32:
2189 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2190 (dest, imm));
2191 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2192 (dest, dest, tp));
2193 break;
2194 case SYMBOL_TLSLE48:
2195 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2196 (dest, imm));
2197 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2198 (dest, dest, tp));
2199 break;
2200 default:
2201 gcc_unreachable ();
2202 }
2203
2204 if (REG_P (dest))
2205 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2206 return;
2207 }
2208
2209 case SYMBOL_TINY_GOT:
2210 emit_insn (gen_ldr_got_tiny (dest, imm));
2211 return;
2212
2213 case SYMBOL_TINY_TLSIE:
2214 {
2215 machine_mode mode = GET_MODE (dest);
2216 rtx tp = aarch64_load_tp (NULL);
2217
2218 if (mode == ptr_mode)
2219 {
2220 if (mode == DImode)
2221 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2222 else
2223 {
2224 tp = gen_lowpart (mode, tp);
2225 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2226 }
2227 }
2228 else
2229 {
2230 gcc_assert (mode == Pmode);
2231 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2232 }
2233
2234 if (REG_P (dest))
2235 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2236 return;
2237 }
2238
2239 default:
2240 gcc_unreachable ();
2241 }
2242 }
2243
2244 /* Emit a move from SRC to DEST. Assume that the move expanders can
2245 handle all moves if !can_create_pseudo_p (). The distinction is
2246 important because, unlike emit_move_insn, the move expanders know
2247 how to force Pmode objects into the constant pool even when the
2248 constant pool address is not itself legitimate. */
2249 static rtx
2250 aarch64_emit_move (rtx dest, rtx src)
2251 {
2252 return (can_create_pseudo_p ()
2253 ? emit_move_insn (dest, src)
2254 : emit_move_insn_1 (dest, src));
2255 }
2256
2257 /* Apply UNOPTAB to OP and store the result in DEST. */
2258
2259 static void
2260 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2261 {
2262 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2263 if (dest != tmp)
2264 emit_move_insn (dest, tmp);
2265 }
2266
2267 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2268
2269 static void
2270 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2271 {
2272 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2273 OPTAB_DIRECT);
2274 if (dest != tmp)
2275 emit_move_insn (dest, tmp);
2276 }
2277
2278 /* Split a 128-bit move operation into two 64-bit move operations,
2279 taking care to handle partial overlap of register to register
2280 copies. Special cases are needed when moving between GP regs and
2281 FP regs. SRC can be a register, constant or memory; DST a register
2282 or memory. If either operand is memory it must not have any side
2283 effects. */
2284 void
2285 aarch64_split_128bit_move (rtx dst, rtx src)
2286 {
2287 rtx dst_lo, dst_hi;
2288 rtx src_lo, src_hi;
2289
2290 machine_mode mode = GET_MODE (dst);
2291
2292 gcc_assert (mode == TImode || mode == TFmode);
2293 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2294 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2295
2296 if (REG_P (dst) && REG_P (src))
2297 {
2298 int src_regno = REGNO (src);
2299 int dst_regno = REGNO (dst);
2300
2301 /* Handle FP <-> GP regs. */
2302 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2303 {
2304 src_lo = gen_lowpart (word_mode, src);
2305 src_hi = gen_highpart (word_mode, src);
2306
2307 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2308 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2309 return;
2310 }
2311 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2312 {
2313 dst_lo = gen_lowpart (word_mode, dst);
2314 dst_hi = gen_highpart (word_mode, dst);
2315
2316 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2317 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2318 return;
2319 }
2320 }
2321
2322 dst_lo = gen_lowpart (word_mode, dst);
2323 dst_hi = gen_highpart (word_mode, dst);
2324 src_lo = gen_lowpart (word_mode, src);
2325 src_hi = gen_highpart_mode (word_mode, mode, src);
2326
2327 /* At most one pairing may overlap. */
2328 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2329 {
2330 aarch64_emit_move (dst_hi, src_hi);
2331 aarch64_emit_move (dst_lo, src_lo);
2332 }
2333 else
2334 {
2335 aarch64_emit_move (dst_lo, src_lo);
2336 aarch64_emit_move (dst_hi, src_hi);
2337 }
2338 }
2339
2340 bool
2341 aarch64_split_128bit_move_p (rtx dst, rtx src)
2342 {
2343 return (! REG_P (src)
2344 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2345 }
2346
2347 /* Split a complex SIMD combine. */
2348
2349 void
2350 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2351 {
2352 machine_mode src_mode = GET_MODE (src1);
2353 machine_mode dst_mode = GET_MODE (dst);
2354
2355 gcc_assert (VECTOR_MODE_P (dst_mode));
2356 gcc_assert (register_operand (dst, dst_mode)
2357 && register_operand (src1, src_mode)
2358 && register_operand (src2, src_mode));
2359
2360 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2361 return;
2362 }
2363
2364 /* Split a complex SIMD move. */
2365
2366 void
2367 aarch64_split_simd_move (rtx dst, rtx src)
2368 {
2369 machine_mode src_mode = GET_MODE (src);
2370 machine_mode dst_mode = GET_MODE (dst);
2371
2372 gcc_assert (VECTOR_MODE_P (dst_mode));
2373
2374 if (REG_P (dst) && REG_P (src))
2375 {
2376 gcc_assert (VECTOR_MODE_P (src_mode));
2377 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2378 }
2379 }
2380
2381 bool
2382 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2383 machine_mode ymode, rtx y)
2384 {
2385 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2386 gcc_assert (r != NULL);
2387 return rtx_equal_p (x, r);
2388 }
2389
2390
2391 static rtx
2392 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2393 {
2394 if (can_create_pseudo_p ())
2395 return force_reg (mode, value);
2396 else
2397 {
2398 gcc_assert (x);
2399 aarch64_emit_move (x, value);
2400 return x;
2401 }
2402 }
2403
2404 /* Return true if we can move VALUE into a register using a single
2405 CNT[BHWD] instruction. */
2406
2407 static bool
2408 aarch64_sve_cnt_immediate_p (poly_int64 value)
2409 {
2410 HOST_WIDE_INT factor = value.coeffs[0];
2411 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2412 return (value.coeffs[1] == factor
2413 && IN_RANGE (factor, 2, 16 * 16)
2414 && (factor & 1) == 0
2415 && factor <= 16 * (factor & -factor));
2416 }
2417
2418 /* Likewise for rtx X. */
2419
2420 bool
2421 aarch64_sve_cnt_immediate_p (rtx x)
2422 {
2423 poly_int64 value;
2424 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2425 }
2426
2427 /* Return the asm string for an instruction with a CNT-like vector size
2428 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2429 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2430 first part of the operands template (the part that comes before the
2431 vector size itself). FACTOR is the number of quadwords.
2432 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2433 If it is zero, we can use any element size. */
2434
2435 static char *
2436 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2437 unsigned int factor,
2438 unsigned int nelts_per_vq)
2439 {
2440 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2441
2442 if (nelts_per_vq == 0)
2443 /* There is some overlap in the ranges of the four CNT instructions.
2444 Here we always use the smallest possible element size, so that the
2445 multiplier is 1 whereever possible. */
2446 nelts_per_vq = factor & -factor;
2447 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2448 gcc_assert (IN_RANGE (shift, 1, 4));
2449 char suffix = "dwhb"[shift - 1];
2450
2451 factor >>= shift;
2452 unsigned int written;
2453 if (factor == 1)
2454 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2455 prefix, suffix, operands);
2456 else
2457 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2458 prefix, suffix, operands, factor);
2459 gcc_assert (written < sizeof (buffer));
2460 return buffer;
2461 }
2462
2463 /* Return the asm string for an instruction with a CNT-like vector size
2464 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2465 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2466 first part of the operands template (the part that comes before the
2467 vector size itself). X is the value of the vector size operand,
2468 as a polynomial integer rtx. */
2469
2470 char *
2471 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2472 rtx x)
2473 {
2474 poly_int64 value = rtx_to_poly_int64 (x);
2475 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2476 return aarch64_output_sve_cnt_immediate (prefix, operands,
2477 value.coeffs[1], 0);
2478 }
2479
2480 /* Return true if we can add VALUE to a register using a single ADDVL
2481 or ADDPL instruction. */
2482
2483 static bool
2484 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2485 {
2486 HOST_WIDE_INT factor = value.coeffs[0];
2487 if (factor == 0 || value.coeffs[1] != factor)
2488 return false;
2489 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2490 and a value of 16 is one vector width. */
2491 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2492 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2493 }
2494
2495 /* Likewise for rtx X. */
2496
2497 bool
2498 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2499 {
2500 poly_int64 value;
2501 return (poly_int_rtx_p (x, &value)
2502 && aarch64_sve_addvl_addpl_immediate_p (value));
2503 }
2504
2505 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2506 and storing the result in operand 0. */
2507
2508 char *
2509 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2510 {
2511 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2512 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2513 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2514
2515 /* Use INC or DEC if possible. */
2516 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2517 {
2518 if (aarch64_sve_cnt_immediate_p (offset_value))
2519 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2520 offset_value.coeffs[1], 0);
2521 if (aarch64_sve_cnt_immediate_p (-offset_value))
2522 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2523 -offset_value.coeffs[1], 0);
2524 }
2525
2526 int factor = offset_value.coeffs[1];
2527 if ((factor & 15) == 0)
2528 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2529 else
2530 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2531 return buffer;
2532 }
2533
2534 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2535 instruction. If it is, store the number of elements in each vector
2536 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2537 factor in *FACTOR_OUT (if nonnull). */
2538
2539 bool
2540 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2541 unsigned int *nelts_per_vq_out)
2542 {
2543 rtx elt;
2544 poly_int64 value;
2545
2546 if (!const_vec_duplicate_p (x, &elt)
2547 || !poly_int_rtx_p (elt, &value))
2548 return false;
2549
2550 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2551 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2552 /* There's no vector INCB. */
2553 return false;
2554
2555 HOST_WIDE_INT factor = value.coeffs[0];
2556 if (value.coeffs[1] != factor)
2557 return false;
2558
2559 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2560 if ((factor % nelts_per_vq) != 0
2561 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2562 return false;
2563
2564 if (factor_out)
2565 *factor_out = factor;
2566 if (nelts_per_vq_out)
2567 *nelts_per_vq_out = nelts_per_vq;
2568 return true;
2569 }
2570
2571 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2572 instruction. */
2573
2574 bool
2575 aarch64_sve_inc_dec_immediate_p (rtx x)
2576 {
2577 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2578 }
2579
2580 /* Return the asm template for an SVE vector INC or DEC instruction.
2581 OPERANDS gives the operands before the vector count and X is the
2582 value of the vector count operand itself. */
2583
2584 char *
2585 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2586 {
2587 int factor;
2588 unsigned int nelts_per_vq;
2589 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2590 gcc_unreachable ();
2591 if (factor < 0)
2592 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2593 nelts_per_vq);
2594 else
2595 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2596 nelts_per_vq);
2597 }
2598
2599 static int
2600 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2601 scalar_int_mode mode)
2602 {
2603 int i;
2604 unsigned HOST_WIDE_INT val, val2, mask;
2605 int one_match, zero_match;
2606 int num_insns;
2607
2608 val = INTVAL (imm);
2609
2610 if (aarch64_move_imm (val, mode))
2611 {
2612 if (generate)
2613 emit_insn (gen_rtx_SET (dest, imm));
2614 return 1;
2615 }
2616
2617 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2618 (with XXXX non-zero). In that case check to see if the move can be done in
2619 a smaller mode. */
2620 val2 = val & 0xffffffff;
2621 if (mode == DImode
2622 && aarch64_move_imm (val2, SImode)
2623 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2624 {
2625 if (generate)
2626 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2627
2628 /* Check if we have to emit a second instruction by checking to see
2629 if any of the upper 32 bits of the original DI mode value is set. */
2630 if (val == val2)
2631 return 1;
2632
2633 i = (val >> 48) ? 48 : 32;
2634
2635 if (generate)
2636 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2637 GEN_INT ((val >> i) & 0xffff)));
2638
2639 return 2;
2640 }
2641
2642 if ((val >> 32) == 0 || mode == SImode)
2643 {
2644 if (generate)
2645 {
2646 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2647 if (mode == SImode)
2648 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2649 GEN_INT ((val >> 16) & 0xffff)));
2650 else
2651 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2652 GEN_INT ((val >> 16) & 0xffff)));
2653 }
2654 return 2;
2655 }
2656
2657 /* Remaining cases are all for DImode. */
2658
2659 mask = 0xffff;
2660 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2661 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2662 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2663 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2664
2665 if (zero_match != 2 && one_match != 2)
2666 {
2667 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2668 For a 64-bit bitmask try whether changing 16 bits to all ones or
2669 zeroes creates a valid bitmask. To check any repeated bitmask,
2670 try using 16 bits from the other 32-bit half of val. */
2671
2672 for (i = 0; i < 64; i += 16, mask <<= 16)
2673 {
2674 val2 = val & ~mask;
2675 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2676 break;
2677 val2 = val | mask;
2678 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2679 break;
2680 val2 = val2 & ~mask;
2681 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2682 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2683 break;
2684 }
2685 if (i != 64)
2686 {
2687 if (generate)
2688 {
2689 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2690 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2691 GEN_INT ((val >> i) & 0xffff)));
2692 }
2693 return 2;
2694 }
2695 }
2696
2697 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2698 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2699 otherwise skip zero bits. */
2700
2701 num_insns = 1;
2702 mask = 0xffff;
2703 val2 = one_match > zero_match ? ~val : val;
2704 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2705
2706 if (generate)
2707 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2708 ? (val | ~(mask << i))
2709 : (val & (mask << i)))));
2710 for (i += 16; i < 64; i += 16)
2711 {
2712 if ((val2 & (mask << i)) == 0)
2713 continue;
2714 if (generate)
2715 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2716 GEN_INT ((val >> i) & 0xffff)));
2717 num_insns ++;
2718 }
2719
2720 return num_insns;
2721 }
2722
2723 /* Return whether imm is a 128-bit immediate which is simple enough to
2724 expand inline. */
2725 bool
2726 aarch64_mov128_immediate (rtx imm)
2727 {
2728 if (GET_CODE (imm) == CONST_INT)
2729 return true;
2730
2731 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2732
2733 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2734 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2735
2736 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2737 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2738 }
2739
2740
2741 /* Return the number of temporary registers that aarch64_add_offset_1
2742 would need to add OFFSET to a register. */
2743
2744 static unsigned int
2745 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2746 {
2747 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2748 }
2749
2750 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2751 a non-polynomial OFFSET. MODE is the mode of the addition.
2752 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2753 be set and CFA adjustments added to the generated instructions.
2754
2755 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2756 temporary if register allocation is already complete. This temporary
2757 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2758 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2759 the immediate again.
2760
2761 Since this function may be used to adjust the stack pointer, we must
2762 ensure that it cannot cause transient stack deallocation (for example
2763 by first incrementing SP and then decrementing when adjusting by a
2764 large immediate). */
2765
2766 static void
2767 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2768 rtx src, HOST_WIDE_INT offset, rtx temp1,
2769 bool frame_related_p, bool emit_move_imm)
2770 {
2771 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2772 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2773
2774 HOST_WIDE_INT moffset = abs_hwi (offset);
2775 rtx_insn *insn;
2776
2777 if (!moffset)
2778 {
2779 if (!rtx_equal_p (dest, src))
2780 {
2781 insn = emit_insn (gen_rtx_SET (dest, src));
2782 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2783 }
2784 return;
2785 }
2786
2787 /* Single instruction adjustment. */
2788 if (aarch64_uimm12_shift (moffset))
2789 {
2790 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2791 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2792 return;
2793 }
2794
2795 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2796 and either:
2797
2798 a) the offset cannot be loaded by a 16-bit move or
2799 b) there is no spare register into which we can move it. */
2800 if (moffset < 0x1000000
2801 && ((!temp1 && !can_create_pseudo_p ())
2802 || !aarch64_move_imm (moffset, mode)))
2803 {
2804 HOST_WIDE_INT low_off = moffset & 0xfff;
2805
2806 low_off = offset < 0 ? -low_off : low_off;
2807 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2808 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2809 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2810 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2811 return;
2812 }
2813
2814 /* Emit a move immediate if required and an addition/subtraction. */
2815 if (emit_move_imm)
2816 {
2817 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2818 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2819 }
2820 insn = emit_insn (offset < 0
2821 ? gen_sub3_insn (dest, src, temp1)
2822 : gen_add3_insn (dest, src, temp1));
2823 if (frame_related_p)
2824 {
2825 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2826 rtx adj = plus_constant (mode, src, offset);
2827 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2828 }
2829 }
2830
2831 /* Return the number of temporary registers that aarch64_add_offset
2832 would need to move OFFSET into a register or add OFFSET to a register;
2833 ADD_P is true if we want the latter rather than the former. */
2834
2835 static unsigned int
2836 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2837 {
2838 /* This follows the same structure as aarch64_add_offset. */
2839 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2840 return 0;
2841
2842 unsigned int count = 0;
2843 HOST_WIDE_INT factor = offset.coeffs[1];
2844 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2845 poly_int64 poly_offset (factor, factor);
2846 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2847 /* Need one register for the ADDVL/ADDPL result. */
2848 count += 1;
2849 else if (factor != 0)
2850 {
2851 factor = abs (factor);
2852 if (factor > 16 * (factor & -factor))
2853 /* Need one register for the CNT result and one for the multiplication
2854 factor. If necessary, the second temporary can be reused for the
2855 constant part of the offset. */
2856 return 2;
2857 /* Need one register for the CNT result (which might then
2858 be shifted). */
2859 count += 1;
2860 }
2861 return count + aarch64_add_offset_1_temporaries (constant);
2862 }
2863
2864 /* If X can be represented as a poly_int64, return the number
2865 of temporaries that are required to add it to a register.
2866 Return -1 otherwise. */
2867
2868 int
2869 aarch64_add_offset_temporaries (rtx x)
2870 {
2871 poly_int64 offset;
2872 if (!poly_int_rtx_p (x, &offset))
2873 return -1;
2874 return aarch64_offset_temporaries (true, offset);
2875 }
2876
2877 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2878 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2879 be set and CFA adjustments added to the generated instructions.
2880
2881 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2882 temporary if register allocation is already complete. This temporary
2883 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2884 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2885 false to avoid emitting the immediate again.
2886
2887 TEMP2, if nonnull, is a second temporary register that doesn't
2888 overlap either DEST or REG.
2889
2890 Since this function may be used to adjust the stack pointer, we must
2891 ensure that it cannot cause transient stack deallocation (for example
2892 by first incrementing SP and then decrementing when adjusting by a
2893 large immediate). */
2894
2895 static void
2896 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2897 poly_int64 offset, rtx temp1, rtx temp2,
2898 bool frame_related_p, bool emit_move_imm = true)
2899 {
2900 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2901 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2902 gcc_assert (temp1 == NULL_RTX
2903 || !frame_related_p
2904 || !reg_overlap_mentioned_p (temp1, dest));
2905 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2906
2907 /* Try using ADDVL or ADDPL to add the whole value. */
2908 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2909 {
2910 rtx offset_rtx = gen_int_mode (offset, mode);
2911 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2912 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2913 return;
2914 }
2915
2916 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2917 SVE vector register, over and above the minimum size of 128 bits.
2918 This is equivalent to half the value returned by CNTD with a
2919 vector shape of ALL. */
2920 HOST_WIDE_INT factor = offset.coeffs[1];
2921 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2922
2923 /* Try using ADDVL or ADDPL to add the VG-based part. */
2924 poly_int64 poly_offset (factor, factor);
2925 if (src != const0_rtx
2926 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2927 {
2928 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2929 if (frame_related_p)
2930 {
2931 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2932 RTX_FRAME_RELATED_P (insn) = true;
2933 src = dest;
2934 }
2935 else
2936 {
2937 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2938 src = aarch64_force_temporary (mode, temp1, addr);
2939 temp1 = temp2;
2940 temp2 = NULL_RTX;
2941 }
2942 }
2943 /* Otherwise use a CNT-based sequence. */
2944 else if (factor != 0)
2945 {
2946 /* Use a subtraction if we have a negative factor. */
2947 rtx_code code = PLUS;
2948 if (factor < 0)
2949 {
2950 factor = -factor;
2951 code = MINUS;
2952 }
2953
2954 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2955 into the multiplication. */
2956 rtx val;
2957 int shift = 0;
2958 if (factor & 1)
2959 /* Use a right shift by 1. */
2960 shift = -1;
2961 else
2962 factor /= 2;
2963 HOST_WIDE_INT low_bit = factor & -factor;
2964 if (factor <= 16 * low_bit)
2965 {
2966 if (factor > 16 * 8)
2967 {
2968 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2969 the value with the minimum multiplier and shift it into
2970 position. */
2971 int extra_shift = exact_log2 (low_bit);
2972 shift += extra_shift;
2973 factor >>= extra_shift;
2974 }
2975 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2976 }
2977 else
2978 {
2979 /* Use CNTD, then multiply it by FACTOR. */
2980 val = gen_int_mode (poly_int64 (2, 2), mode);
2981 val = aarch64_force_temporary (mode, temp1, val);
2982
2983 /* Go back to using a negative multiplication factor if we have
2984 no register from which to subtract. */
2985 if (code == MINUS && src == const0_rtx)
2986 {
2987 factor = -factor;
2988 code = PLUS;
2989 }
2990 rtx coeff1 = gen_int_mode (factor, mode);
2991 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2992 val = gen_rtx_MULT (mode, val, coeff1);
2993 }
2994
2995 if (shift > 0)
2996 {
2997 /* Multiply by 1 << SHIFT. */
2998 val = aarch64_force_temporary (mode, temp1, val);
2999 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3000 }
3001 else if (shift == -1)
3002 {
3003 /* Divide by 2. */
3004 val = aarch64_force_temporary (mode, temp1, val);
3005 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3006 }
3007
3008 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3009 if (src != const0_rtx)
3010 {
3011 val = aarch64_force_temporary (mode, temp1, val);
3012 val = gen_rtx_fmt_ee (code, mode, src, val);
3013 }
3014 else if (code == MINUS)
3015 {
3016 val = aarch64_force_temporary (mode, temp1, val);
3017 val = gen_rtx_NEG (mode, val);
3018 }
3019
3020 if (constant == 0 || frame_related_p)
3021 {
3022 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3023 if (frame_related_p)
3024 {
3025 RTX_FRAME_RELATED_P (insn) = true;
3026 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3027 gen_rtx_SET (dest, plus_constant (Pmode, src,
3028 poly_offset)));
3029 }
3030 src = dest;
3031 if (constant == 0)
3032 return;
3033 }
3034 else
3035 {
3036 src = aarch64_force_temporary (mode, temp1, val);
3037 temp1 = temp2;
3038 temp2 = NULL_RTX;
3039 }
3040
3041 emit_move_imm = true;
3042 }
3043
3044 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3045 frame_related_p, emit_move_imm);
3046 }
3047
3048 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3049 than a poly_int64. */
3050
3051 void
3052 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3053 rtx offset_rtx, rtx temp1, rtx temp2)
3054 {
3055 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3056 temp1, temp2, false);
3057 }
3058
3059 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3060 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3061 if TEMP1 already contains abs (DELTA). */
3062
3063 static inline void
3064 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3065 {
3066 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3067 temp1, temp2, true, emit_move_imm);
3068 }
3069
3070 /* Subtract DELTA from the stack pointer, marking the instructions
3071 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3072 if nonnull. */
3073
3074 static inline void
3075 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3076 bool emit_move_imm = true)
3077 {
3078 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3079 temp1, temp2, frame_related_p, emit_move_imm);
3080 }
3081
3082 /* Set DEST to (vec_series BASE STEP). */
3083
3084 static void
3085 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3086 {
3087 machine_mode mode = GET_MODE (dest);
3088 scalar_mode inner = GET_MODE_INNER (mode);
3089
3090 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3091 if (!aarch64_sve_index_immediate_p (base))
3092 base = force_reg (inner, base);
3093 if (!aarch64_sve_index_immediate_p (step))
3094 step = force_reg (inner, step);
3095
3096 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3097 }
3098
3099 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3100 integer of mode INT_MODE. Return true on success. */
3101
3102 static bool
3103 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3104 rtx src)
3105 {
3106 /* If the constant is smaller than 128 bits, we can do the move
3107 using a vector of SRC_MODEs. */
3108 if (src_mode != TImode)
3109 {
3110 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3111 GET_MODE_SIZE (src_mode));
3112 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3113 emit_move_insn (gen_lowpart (dup_mode, dest),
3114 gen_const_vec_duplicate (dup_mode, src));
3115 return true;
3116 }
3117
3118 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3119 src = force_const_mem (src_mode, src);
3120 if (!src)
3121 return false;
3122
3123 /* Make sure that the address is legitimate. */
3124 if (!aarch64_sve_ld1r_operand_p (src))
3125 {
3126 rtx addr = force_reg (Pmode, XEXP (src, 0));
3127 src = replace_equiv_address (src, addr);
3128 }
3129
3130 machine_mode mode = GET_MODE (dest);
3131 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3132 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3133 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3134 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3135 emit_insn (gen_rtx_SET (dest, src));
3136 return true;
3137 }
3138
3139 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3140 isn't a simple duplicate or series. */
3141
3142 static void
3143 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3144 {
3145 machine_mode mode = GET_MODE (src);
3146 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3147 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3148 gcc_assert (npatterns > 1);
3149
3150 if (nelts_per_pattern == 1)
3151 {
3152 /* The constant is a repeating seqeuence of at least two elements,
3153 where the repeating elements occupy no more than 128 bits.
3154 Get an integer representation of the replicated value. */
3155 scalar_int_mode int_mode;
3156 if (BYTES_BIG_ENDIAN)
3157 /* For now, always use LD1RQ to load the value on big-endian
3158 targets, since the handling of smaller integers includes a
3159 subreg that is semantically an element reverse. */
3160 int_mode = TImode;
3161 else
3162 {
3163 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3164 gcc_assert (int_bits <= 128);
3165 int_mode = int_mode_for_size (int_bits, 0).require ();
3166 }
3167 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3168 if (int_value
3169 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3170 return;
3171 }
3172
3173 /* Expand each pattern individually. */
3174 rtx_vector_builder builder;
3175 auto_vec<rtx, 16> vectors (npatterns);
3176 for (unsigned int i = 0; i < npatterns; ++i)
3177 {
3178 builder.new_vector (mode, 1, nelts_per_pattern);
3179 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3180 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3181 vectors.quick_push (force_reg (mode, builder.build ()));
3182 }
3183
3184 /* Use permutes to interleave the separate vectors. */
3185 while (npatterns > 1)
3186 {
3187 npatterns /= 2;
3188 for (unsigned int i = 0; i < npatterns; ++i)
3189 {
3190 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3191 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3192 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3193 vectors[i] = tmp;
3194 }
3195 }
3196 gcc_assert (vectors[0] == dest);
3197 }
3198
3199 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3200 is a pattern that can be used to set DEST to a replicated scalar
3201 element. */
3202
3203 void
3204 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3205 rtx (*gen_vec_duplicate) (rtx, rtx))
3206 {
3207 machine_mode mode = GET_MODE (dest);
3208
3209 /* Check on what type of symbol it is. */
3210 scalar_int_mode int_mode;
3211 if ((GET_CODE (imm) == SYMBOL_REF
3212 || GET_CODE (imm) == LABEL_REF
3213 || GET_CODE (imm) == CONST
3214 || GET_CODE (imm) == CONST_POLY_INT)
3215 && is_a <scalar_int_mode> (mode, &int_mode))
3216 {
3217 rtx mem;
3218 poly_int64 offset;
3219 HOST_WIDE_INT const_offset;
3220 enum aarch64_symbol_type sty;
3221
3222 /* If we have (const (plus symbol offset)), separate out the offset
3223 before we start classifying the symbol. */
3224 rtx base = strip_offset (imm, &offset);
3225
3226 /* We must always add an offset involving VL separately, rather than
3227 folding it into the relocation. */
3228 if (!offset.is_constant (&const_offset))
3229 {
3230 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3231 emit_insn (gen_rtx_SET (dest, imm));
3232 else
3233 {
3234 /* Do arithmetic on 32-bit values if the result is smaller
3235 than that. */
3236 if (partial_subreg_p (int_mode, SImode))
3237 {
3238 /* It is invalid to do symbol calculations in modes
3239 narrower than SImode. */
3240 gcc_assert (base == const0_rtx);
3241 dest = gen_lowpart (SImode, dest);
3242 int_mode = SImode;
3243 }
3244 if (base != const0_rtx)
3245 {
3246 base = aarch64_force_temporary (int_mode, dest, base);
3247 aarch64_add_offset (int_mode, dest, base, offset,
3248 NULL_RTX, NULL_RTX, false);
3249 }
3250 else
3251 aarch64_add_offset (int_mode, dest, base, offset,
3252 dest, NULL_RTX, false);
3253 }
3254 return;
3255 }
3256
3257 sty = aarch64_classify_symbol (base, const_offset);
3258 switch (sty)
3259 {
3260 case SYMBOL_FORCE_TO_MEM:
3261 if (const_offset != 0
3262 && targetm.cannot_force_const_mem (int_mode, imm))
3263 {
3264 gcc_assert (can_create_pseudo_p ());
3265 base = aarch64_force_temporary (int_mode, dest, base);
3266 aarch64_add_offset (int_mode, dest, base, const_offset,
3267 NULL_RTX, NULL_RTX, false);
3268 return;
3269 }
3270
3271 mem = force_const_mem (ptr_mode, imm);
3272 gcc_assert (mem);
3273
3274 /* If we aren't generating PC relative literals, then
3275 we need to expand the literal pool access carefully.
3276 This is something that needs to be done in a number
3277 of places, so could well live as a separate function. */
3278 if (!aarch64_pcrelative_literal_loads)
3279 {
3280 gcc_assert (can_create_pseudo_p ());
3281 base = gen_reg_rtx (ptr_mode);
3282 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3283 if (ptr_mode != Pmode)
3284 base = convert_memory_address (Pmode, base);
3285 mem = gen_rtx_MEM (ptr_mode, base);
3286 }
3287
3288 if (int_mode != ptr_mode)
3289 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3290
3291 emit_insn (gen_rtx_SET (dest, mem));
3292
3293 return;
3294
3295 case SYMBOL_SMALL_TLSGD:
3296 case SYMBOL_SMALL_TLSDESC:
3297 case SYMBOL_SMALL_TLSIE:
3298 case SYMBOL_SMALL_GOT_28K:
3299 case SYMBOL_SMALL_GOT_4G:
3300 case SYMBOL_TINY_GOT:
3301 case SYMBOL_TINY_TLSIE:
3302 if (const_offset != 0)
3303 {
3304 gcc_assert(can_create_pseudo_p ());
3305 base = aarch64_force_temporary (int_mode, dest, base);
3306 aarch64_add_offset (int_mode, dest, base, const_offset,
3307 NULL_RTX, NULL_RTX, false);
3308 return;
3309 }
3310 /* FALLTHRU */
3311
3312 case SYMBOL_SMALL_ABSOLUTE:
3313 case SYMBOL_TINY_ABSOLUTE:
3314 case SYMBOL_TLSLE12:
3315 case SYMBOL_TLSLE24:
3316 case SYMBOL_TLSLE32:
3317 case SYMBOL_TLSLE48:
3318 aarch64_load_symref_appropriately (dest, imm, sty);
3319 return;
3320
3321 default:
3322 gcc_unreachable ();
3323 }
3324 }
3325
3326 if (!CONST_INT_P (imm))
3327 {
3328 rtx base, step, value;
3329 if (GET_CODE (imm) == HIGH
3330 || aarch64_simd_valid_immediate (imm, NULL))
3331 emit_insn (gen_rtx_SET (dest, imm));
3332 else if (const_vec_series_p (imm, &base, &step))
3333 aarch64_expand_vec_series (dest, base, step);
3334 else if (const_vec_duplicate_p (imm, &value))
3335 {
3336 /* If the constant is out of range of an SVE vector move,
3337 load it from memory if we can, otherwise move it into
3338 a register and use a DUP. */
3339 scalar_mode inner_mode = GET_MODE_INNER (mode);
3340 rtx op = force_const_mem (inner_mode, value);
3341 if (!op)
3342 op = force_reg (inner_mode, value);
3343 else if (!aarch64_sve_ld1r_operand_p (op))
3344 {
3345 rtx addr = force_reg (Pmode, XEXP (op, 0));
3346 op = replace_equiv_address (op, addr);
3347 }
3348 emit_insn (gen_vec_duplicate (dest, op));
3349 }
3350 else if (GET_CODE (imm) == CONST_VECTOR
3351 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3352 aarch64_expand_sve_const_vector (dest, imm);
3353 else
3354 {
3355 rtx mem = force_const_mem (mode, imm);
3356 gcc_assert (mem);
3357 emit_move_insn (dest, mem);
3358 }
3359
3360 return;
3361 }
3362
3363 aarch64_internal_mov_immediate (dest, imm, true,
3364 as_a <scalar_int_mode> (mode));
3365 }
3366
3367 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3368 that is known to contain PTRUE. */
3369
3370 void
3371 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3372 {
3373 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3374 gen_rtvec (2, pred, src),
3375 UNSPEC_MERGE_PTRUE)));
3376 }
3377
3378 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3379 operand is in memory. In this case we need to use the predicated LD1
3380 and ST1 instead of LDR and STR, both for correctness on big-endian
3381 targets and because LD1 and ST1 support a wider range of addressing modes.
3382 PRED_MODE is the mode of the predicate.
3383
3384 See the comment at the head of aarch64-sve.md for details about the
3385 big-endian handling. */
3386
3387 void
3388 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3389 {
3390 machine_mode mode = GET_MODE (dest);
3391 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3392 if (!register_operand (src, mode)
3393 && !register_operand (dest, mode))
3394 {
3395 rtx tmp = gen_reg_rtx (mode);
3396 if (MEM_P (src))
3397 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3398 else
3399 emit_move_insn (tmp, src);
3400 src = tmp;
3401 }
3402 aarch64_emit_sve_pred_move (dest, ptrue, src);
3403 }
3404
3405 /* Called only on big-endian targets. See whether an SVE vector move
3406 from SRC to DEST is effectively a REV[BHW] instruction, because at
3407 least one operand is a subreg of an SVE vector that has wider or
3408 narrower elements. Return true and emit the instruction if so.
3409
3410 For example:
3411
3412 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3413
3414 represents a VIEW_CONVERT between the following vectors, viewed
3415 in memory order:
3416
3417 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3418 R1: { [0], [1], [2], [3], ... }
3419
3420 The high part of lane X in R2 should therefore correspond to lane X*2
3421 of R1, but the register representations are:
3422
3423 msb lsb
3424 R2: ...... [1].high [1].low [0].high [0].low
3425 R1: ...... [3] [2] [1] [0]
3426
3427 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3428 We therefore need a reverse operation to swap the high and low values
3429 around.
3430
3431 This is purely an optimization. Without it we would spill the
3432 subreg operand to the stack in one mode and reload it in the
3433 other mode, which has the same effect as the REV. */
3434
3435 bool
3436 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3437 {
3438 gcc_assert (BYTES_BIG_ENDIAN);
3439 if (GET_CODE (dest) == SUBREG)
3440 dest = SUBREG_REG (dest);
3441 if (GET_CODE (src) == SUBREG)
3442 src = SUBREG_REG (src);
3443
3444 /* The optimization handles two single SVE REGs with different element
3445 sizes. */
3446 if (!REG_P (dest)
3447 || !REG_P (src)
3448 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3449 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3450 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3451 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3452 return false;
3453
3454 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3455 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3456 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3457 UNSPEC_REV_SUBREG);
3458 emit_insn (gen_rtx_SET (dest, unspec));
3459 return true;
3460 }
3461
3462 /* Return a copy of X with mode MODE, without changing its other
3463 attributes. Unlike gen_lowpart, this doesn't care whether the
3464 mode change is valid. */
3465
3466 static rtx
3467 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3468 {
3469 if (GET_MODE (x) == mode)
3470 return x;
3471
3472 x = shallow_copy_rtx (x);
3473 set_mode_and_regno (x, mode, REGNO (x));
3474 return x;
3475 }
3476
3477 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3478 operands. */
3479
3480 void
3481 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3482 {
3483 /* Decide which REV operation we need. The mode with narrower elements
3484 determines the mode of the operands and the mode with the wider
3485 elements determines the reverse width. */
3486 machine_mode mode_with_wider_elts = GET_MODE (dest);
3487 machine_mode mode_with_narrower_elts = GET_MODE (src);
3488 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3489 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3490 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3491
3492 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3493 unsigned int unspec;
3494 if (wider_bytes == 8)
3495 unspec = UNSPEC_REV64;
3496 else if (wider_bytes == 4)
3497 unspec = UNSPEC_REV32;
3498 else if (wider_bytes == 2)
3499 unspec = UNSPEC_REV16;
3500 else
3501 gcc_unreachable ();
3502 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3503
3504 /* Emit:
3505
3506 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3507 UNSPEC_MERGE_PTRUE))
3508
3509 with the appropriate modes. */
3510 ptrue = gen_lowpart (pred_mode, ptrue);
3511 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3512 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3513 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3514 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3515 UNSPEC_MERGE_PTRUE);
3516 emit_insn (gen_rtx_SET (dest, src));
3517 }
3518
3519 static bool
3520 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3521 tree exp ATTRIBUTE_UNUSED)
3522 {
3523 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3524 return false;
3525
3526 return true;
3527 }
3528
3529 /* Implement TARGET_PASS_BY_REFERENCE. */
3530
3531 static bool
3532 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3533 machine_mode mode,
3534 const_tree type,
3535 bool named ATTRIBUTE_UNUSED)
3536 {
3537 HOST_WIDE_INT size;
3538 machine_mode dummymode;
3539 int nregs;
3540
3541 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3542 if (mode == BLKmode && type)
3543 size = int_size_in_bytes (type);
3544 else
3545 /* No frontends can create types with variable-sized modes, so we
3546 shouldn't be asked to pass or return them. */
3547 size = GET_MODE_SIZE (mode).to_constant ();
3548
3549 /* Aggregates are passed by reference based on their size. */
3550 if (type && AGGREGATE_TYPE_P (type))
3551 {
3552 size = int_size_in_bytes (type);
3553 }
3554
3555 /* Variable sized arguments are always returned by reference. */
3556 if (size < 0)
3557 return true;
3558
3559 /* Can this be a candidate to be passed in fp/simd register(s)? */
3560 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3561 &dummymode, &nregs,
3562 NULL))
3563 return false;
3564
3565 /* Arguments which are variable sized or larger than 2 registers are
3566 passed by reference unless they are a homogenous floating point
3567 aggregate. */
3568 return size > 2 * UNITS_PER_WORD;
3569 }
3570
3571 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3572 static bool
3573 aarch64_return_in_msb (const_tree valtype)
3574 {
3575 machine_mode dummy_mode;
3576 int dummy_int;
3577
3578 /* Never happens in little-endian mode. */
3579 if (!BYTES_BIG_ENDIAN)
3580 return false;
3581
3582 /* Only composite types smaller than or equal to 16 bytes can
3583 be potentially returned in registers. */
3584 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3585 || int_size_in_bytes (valtype) <= 0
3586 || int_size_in_bytes (valtype) > 16)
3587 return false;
3588
3589 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3590 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3591 is always passed/returned in the least significant bits of fp/simd
3592 register(s). */
3593 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3594 &dummy_mode, &dummy_int, NULL))
3595 return false;
3596
3597 return true;
3598 }
3599
3600 /* Implement TARGET_FUNCTION_VALUE.
3601 Define how to find the value returned by a function. */
3602
3603 static rtx
3604 aarch64_function_value (const_tree type, const_tree func,
3605 bool outgoing ATTRIBUTE_UNUSED)
3606 {
3607 machine_mode mode;
3608 int unsignedp;
3609 int count;
3610 machine_mode ag_mode;
3611
3612 mode = TYPE_MODE (type);
3613 if (INTEGRAL_TYPE_P (type))
3614 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3615
3616 if (aarch64_return_in_msb (type))
3617 {
3618 HOST_WIDE_INT size = int_size_in_bytes (type);
3619
3620 if (size % UNITS_PER_WORD != 0)
3621 {
3622 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3623 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3624 }
3625 }
3626
3627 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3628 &ag_mode, &count, NULL))
3629 {
3630 if (!aarch64_composite_type_p (type, mode))
3631 {
3632 gcc_assert (count == 1 && mode == ag_mode);
3633 return gen_rtx_REG (mode, V0_REGNUM);
3634 }
3635 else
3636 {
3637 int i;
3638 rtx par;
3639
3640 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3641 for (i = 0; i < count; i++)
3642 {
3643 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3644 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3645 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3646 XVECEXP (par, 0, i) = tmp;
3647 }
3648 return par;
3649 }
3650 }
3651 else
3652 return gen_rtx_REG (mode, R0_REGNUM);
3653 }
3654
3655 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3656 Return true if REGNO is the number of a hard register in which the values
3657 of called function may come back. */
3658
3659 static bool
3660 aarch64_function_value_regno_p (const unsigned int regno)
3661 {
3662 /* Maximum of 16 bytes can be returned in the general registers. Examples
3663 of 16-byte return values are: 128-bit integers and 16-byte small
3664 structures (excluding homogeneous floating-point aggregates). */
3665 if (regno == R0_REGNUM || regno == R1_REGNUM)
3666 return true;
3667
3668 /* Up to four fp/simd registers can return a function value, e.g. a
3669 homogeneous floating-point aggregate having four members. */
3670 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3671 return TARGET_FLOAT;
3672
3673 return false;
3674 }
3675
3676 /* Implement TARGET_RETURN_IN_MEMORY.
3677
3678 If the type T of the result of a function is such that
3679 void func (T arg)
3680 would require that arg be passed as a value in a register (or set of
3681 registers) according to the parameter passing rules, then the result
3682 is returned in the same registers as would be used for such an
3683 argument. */
3684
3685 static bool
3686 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3687 {
3688 HOST_WIDE_INT size;
3689 machine_mode ag_mode;
3690 int count;
3691
3692 if (!AGGREGATE_TYPE_P (type)
3693 && TREE_CODE (type) != COMPLEX_TYPE
3694 && TREE_CODE (type) != VECTOR_TYPE)
3695 /* Simple scalar types always returned in registers. */
3696 return false;
3697
3698 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3699 type,
3700 &ag_mode,
3701 &count,
3702 NULL))
3703 return false;
3704
3705 /* Types larger than 2 registers returned in memory. */
3706 size = int_size_in_bytes (type);
3707 return (size < 0 || size > 2 * UNITS_PER_WORD);
3708 }
3709
3710 static bool
3711 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3712 const_tree type, int *nregs)
3713 {
3714 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3715 return aarch64_vfp_is_call_or_return_candidate (mode,
3716 type,
3717 &pcum->aapcs_vfp_rmode,
3718 nregs,
3719 NULL);
3720 }
3721
3722 /* Given MODE and TYPE of a function argument, return the alignment in
3723 bits. The idea is to suppress any stronger alignment requested by
3724 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3725 This is a helper function for local use only. */
3726
3727 static unsigned int
3728 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3729 {
3730 if (!type)
3731 return GET_MODE_ALIGNMENT (mode);
3732
3733 if (integer_zerop (TYPE_SIZE (type)))
3734 return 0;
3735
3736 gcc_assert (TYPE_MODE (type) == mode);
3737
3738 if (!AGGREGATE_TYPE_P (type))
3739 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3740
3741 if (TREE_CODE (type) == ARRAY_TYPE)
3742 return TYPE_ALIGN (TREE_TYPE (type));
3743
3744 unsigned int alignment = 0;
3745 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3746 if (TREE_CODE (field) == FIELD_DECL)
3747 alignment = std::max (alignment, DECL_ALIGN (field));
3748
3749 return alignment;
3750 }
3751
3752 /* Layout a function argument according to the AAPCS64 rules. The rule
3753 numbers refer to the rule numbers in the AAPCS64. */
3754
3755 static void
3756 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3757 const_tree type,
3758 bool named ATTRIBUTE_UNUSED)
3759 {
3760 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3761 int ncrn, nvrn, nregs;
3762 bool allocate_ncrn, allocate_nvrn;
3763 HOST_WIDE_INT size;
3764
3765 /* We need to do this once per argument. */
3766 if (pcum->aapcs_arg_processed)
3767 return;
3768
3769 pcum->aapcs_arg_processed = true;
3770
3771 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3772 if (type)
3773 size = int_size_in_bytes (type);
3774 else
3775 /* No frontends can create types with variable-sized modes, so we
3776 shouldn't be asked to pass or return them. */
3777 size = GET_MODE_SIZE (mode).to_constant ();
3778 size = ROUND_UP (size, UNITS_PER_WORD);
3779
3780 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3781 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3782 mode,
3783 type,
3784 &nregs);
3785
3786 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3787 The following code thus handles passing by SIMD/FP registers first. */
3788
3789 nvrn = pcum->aapcs_nvrn;
3790
3791 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
3792 and homogenous short-vector aggregates (HVA). */
3793 if (allocate_nvrn)
3794 {
3795 if (!TARGET_FLOAT)
3796 aarch64_err_no_fpadvsimd (mode);
3797
3798 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3799 {
3800 pcum->aapcs_nextnvrn = nvrn + nregs;
3801 if (!aarch64_composite_type_p (type, mode))
3802 {
3803 gcc_assert (nregs == 1);
3804 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3805 }
3806 else
3807 {
3808 rtx par;
3809 int i;
3810 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3811 for (i = 0; i < nregs; i++)
3812 {
3813 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3814 V0_REGNUM + nvrn + i);
3815 rtx offset = gen_int_mode
3816 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3817 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3818 XVECEXP (par, 0, i) = tmp;
3819 }
3820 pcum->aapcs_reg = par;
3821 }
3822 return;
3823 }
3824 else
3825 {
3826 /* C.3 NSRN is set to 8. */
3827 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3828 goto on_stack;
3829 }
3830 }
3831
3832 ncrn = pcum->aapcs_ncrn;
3833 nregs = size / UNITS_PER_WORD;
3834
3835 /* C6 - C9. though the sign and zero extension semantics are
3836 handled elsewhere. This is the case where the argument fits
3837 entirely general registers. */
3838 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3839 {
3840
3841 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3842
3843 /* C.8 if the argument has an alignment of 16 then the NGRN is
3844 rounded up to the next even number. */
3845 if (nregs == 2
3846 && ncrn % 2
3847 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3848 comparison is there because for > 16 * BITS_PER_UNIT
3849 alignment nregs should be > 2 and therefore it should be
3850 passed by reference rather than value. */
3851 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3852 {
3853 ++ncrn;
3854 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3855 }
3856
3857 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3858 A reg is still generated for it, but the caller should be smart
3859 enough not to use it. */
3860 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3861 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3862 else
3863 {
3864 rtx par;
3865 int i;
3866
3867 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3868 for (i = 0; i < nregs; i++)
3869 {
3870 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3871 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3872 GEN_INT (i * UNITS_PER_WORD));
3873 XVECEXP (par, 0, i) = tmp;
3874 }
3875 pcum->aapcs_reg = par;
3876 }
3877
3878 pcum->aapcs_nextncrn = ncrn + nregs;
3879 return;
3880 }
3881
3882 /* C.11 */
3883 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3884
3885 /* The argument is passed on stack; record the needed number of words for
3886 this argument and align the total size if necessary. */
3887 on_stack:
3888 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3889
3890 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3891 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3892 16 / UNITS_PER_WORD);
3893 return;
3894 }
3895
3896 /* Implement TARGET_FUNCTION_ARG. */
3897
3898 static rtx
3899 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3900 const_tree type, bool named)
3901 {
3902 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3903 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3904
3905 if (mode == VOIDmode)
3906 return NULL_RTX;
3907
3908 aarch64_layout_arg (pcum_v, mode, type, named);
3909 return pcum->aapcs_reg;
3910 }
3911
3912 void
3913 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3914 const_tree fntype ATTRIBUTE_UNUSED,
3915 rtx libname ATTRIBUTE_UNUSED,
3916 const_tree fndecl ATTRIBUTE_UNUSED,
3917 unsigned n_named ATTRIBUTE_UNUSED)
3918 {
3919 pcum->aapcs_ncrn = 0;
3920 pcum->aapcs_nvrn = 0;
3921 pcum->aapcs_nextncrn = 0;
3922 pcum->aapcs_nextnvrn = 0;
3923 pcum->pcs_variant = ARM_PCS_AAPCS64;
3924 pcum->aapcs_reg = NULL_RTX;
3925 pcum->aapcs_arg_processed = false;
3926 pcum->aapcs_stack_words = 0;
3927 pcum->aapcs_stack_size = 0;
3928
3929 if (!TARGET_FLOAT
3930 && fndecl && TREE_PUBLIC (fndecl)
3931 && fntype && fntype != error_mark_node)
3932 {
3933 const_tree type = TREE_TYPE (fntype);
3934 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3935 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3936 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3937 &mode, &nregs, NULL))
3938 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3939 }
3940 return;
3941 }
3942
3943 static void
3944 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3945 machine_mode mode,
3946 const_tree type,
3947 bool named)
3948 {
3949 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3950 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3951 {
3952 aarch64_layout_arg (pcum_v, mode, type, named);
3953 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3954 != (pcum->aapcs_stack_words != 0));
3955 pcum->aapcs_arg_processed = false;
3956 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3957 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3958 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3959 pcum->aapcs_stack_words = 0;
3960 pcum->aapcs_reg = NULL_RTX;
3961 }
3962 }
3963
3964 bool
3965 aarch64_function_arg_regno_p (unsigned regno)
3966 {
3967 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3968 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3969 }
3970
3971 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3972 PARM_BOUNDARY bits of alignment, but will be given anything up
3973 to STACK_BOUNDARY bits if the type requires it. This makes sure
3974 that both before and after the layout of each argument, the Next
3975 Stacked Argument Address (NSAA) will have a minimum alignment of
3976 8 bytes. */
3977
3978 static unsigned int
3979 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3980 {
3981 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3982 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3983 }
3984
3985 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3986
3987 static fixed_size_mode
3988 aarch64_get_reg_raw_mode (int regno)
3989 {
3990 if (TARGET_SVE && FP_REGNUM_P (regno))
3991 /* Don't use the SVE part of the register for __builtin_apply and
3992 __builtin_return. The SVE registers aren't used by the normal PCS,
3993 so using them there would be a waste of time. The PCS extensions
3994 for SVE types are fundamentally incompatible with the
3995 __builtin_return/__builtin_apply interface. */
3996 return as_a <fixed_size_mode> (V16QImode);
3997 return default_get_reg_raw_mode (regno);
3998 }
3999
4000 /* Implement TARGET_FUNCTION_ARG_PADDING.
4001
4002 Small aggregate types are placed in the lowest memory address.
4003
4004 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4005
4006 static pad_direction
4007 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4008 {
4009 /* On little-endian targets, the least significant byte of every stack
4010 argument is passed at the lowest byte address of the stack slot. */
4011 if (!BYTES_BIG_ENDIAN)
4012 return PAD_UPWARD;
4013
4014 /* Otherwise, integral, floating-point and pointer types are padded downward:
4015 the least significant byte of a stack argument is passed at the highest
4016 byte address of the stack slot. */
4017 if (type
4018 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4019 || POINTER_TYPE_P (type))
4020 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4021 return PAD_DOWNWARD;
4022
4023 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4024 return PAD_UPWARD;
4025 }
4026
4027 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4028
4029 It specifies padding for the last (may also be the only)
4030 element of a block move between registers and memory. If
4031 assuming the block is in the memory, padding upward means that
4032 the last element is padded after its highest significant byte,
4033 while in downward padding, the last element is padded at the
4034 its least significant byte side.
4035
4036 Small aggregates and small complex types are always padded
4037 upwards.
4038
4039 We don't need to worry about homogeneous floating-point or
4040 short-vector aggregates; their move is not affected by the
4041 padding direction determined here. Regardless of endianness,
4042 each element of such an aggregate is put in the least
4043 significant bits of a fp/simd register.
4044
4045 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4046 register has useful data, and return the opposite if the most
4047 significant byte does. */
4048
4049 bool
4050 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4051 bool first ATTRIBUTE_UNUSED)
4052 {
4053
4054 /* Small composite types are always padded upward. */
4055 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4056 {
4057 HOST_WIDE_INT size;
4058 if (type)
4059 size = int_size_in_bytes (type);
4060 else
4061 /* No frontends can create types with variable-sized modes, so we
4062 shouldn't be asked to pass or return them. */
4063 size = GET_MODE_SIZE (mode).to_constant ();
4064 if (size < 2 * UNITS_PER_WORD)
4065 return true;
4066 }
4067
4068 /* Otherwise, use the default padding. */
4069 return !BYTES_BIG_ENDIAN;
4070 }
4071
4072 static scalar_int_mode
4073 aarch64_libgcc_cmp_return_mode (void)
4074 {
4075 return SImode;
4076 }
4077
4078 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4079
4080 /* We use the 12-bit shifted immediate arithmetic instructions so values
4081 must be multiple of (1 << 12), i.e. 4096. */
4082 #define ARITH_FACTOR 4096
4083
4084 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4085 #error Cannot use simple address calculation for stack probing
4086 #endif
4087
4088 /* The pair of scratch registers used for stack probing. */
4089 #define PROBE_STACK_FIRST_REG R9_REGNUM
4090 #define PROBE_STACK_SECOND_REG R10_REGNUM
4091
4092 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4093 inclusive. These are offsets from the current stack pointer. */
4094
4095 static void
4096 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4097 {
4098 HOST_WIDE_INT size;
4099 if (!poly_size.is_constant (&size))
4100 {
4101 sorry ("stack probes for SVE frames");
4102 return;
4103 }
4104
4105 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4106
4107 /* See the same assertion on PROBE_INTERVAL above. */
4108 gcc_assert ((first % ARITH_FACTOR) == 0);
4109
4110 /* See if we have a constant small number of probes to generate. If so,
4111 that's the easy case. */
4112 if (size <= PROBE_INTERVAL)
4113 {
4114 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4115
4116 emit_set_insn (reg1,
4117 plus_constant (Pmode,
4118 stack_pointer_rtx, -(first + base)));
4119 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4120 }
4121
4122 /* The run-time loop is made up of 8 insns in the generic case while the
4123 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4124 else if (size <= 4 * PROBE_INTERVAL)
4125 {
4126 HOST_WIDE_INT i, rem;
4127
4128 emit_set_insn (reg1,
4129 plus_constant (Pmode,
4130 stack_pointer_rtx,
4131 -(first + PROBE_INTERVAL)));
4132 emit_stack_probe (reg1);
4133
4134 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4135 it exceeds SIZE. If only two probes are needed, this will not
4136 generate any code. Then probe at FIRST + SIZE. */
4137 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4138 {
4139 emit_set_insn (reg1,
4140 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4141 emit_stack_probe (reg1);
4142 }
4143
4144 rem = size - (i - PROBE_INTERVAL);
4145 if (rem > 256)
4146 {
4147 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4148
4149 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4150 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4151 }
4152 else
4153 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4154 }
4155
4156 /* Otherwise, do the same as above, but in a loop. Note that we must be
4157 extra careful with variables wrapping around because we might be at
4158 the very top (or the very bottom) of the address space and we have
4159 to be able to handle this case properly; in particular, we use an
4160 equality test for the loop condition. */
4161 else
4162 {
4163 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4164
4165 /* Step 1: round SIZE to the previous multiple of the interval. */
4166
4167 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4168
4169
4170 /* Step 2: compute initial and final value of the loop counter. */
4171
4172 /* TEST_ADDR = SP + FIRST. */
4173 emit_set_insn (reg1,
4174 plus_constant (Pmode, stack_pointer_rtx, -first));
4175
4176 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4177 HOST_WIDE_INT adjustment = - (first + rounded_size);
4178 if (! aarch64_uimm12_shift (adjustment))
4179 {
4180 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4181 true, Pmode);
4182 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4183 }
4184 else
4185 emit_set_insn (reg2,
4186 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4187
4188 /* Step 3: the loop
4189
4190 do
4191 {
4192 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4193 probe at TEST_ADDR
4194 }
4195 while (TEST_ADDR != LAST_ADDR)
4196
4197 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4198 until it is equal to ROUNDED_SIZE. */
4199
4200 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4201
4202
4203 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4204 that SIZE is equal to ROUNDED_SIZE. */
4205
4206 if (size != rounded_size)
4207 {
4208 HOST_WIDE_INT rem = size - rounded_size;
4209
4210 if (rem > 256)
4211 {
4212 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4213
4214 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4215 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4216 }
4217 else
4218 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4219 }
4220 }
4221
4222 /* Make sure nothing is scheduled before we are done. */
4223 emit_insn (gen_blockage ());
4224 }
4225
4226 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4227 absolute addresses. */
4228
4229 const char *
4230 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4231 {
4232 static int labelno = 0;
4233 char loop_lab[32];
4234 rtx xops[2];
4235
4236 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4237
4238 /* Loop. */
4239 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4240
4241 HOST_WIDE_INT stack_clash_probe_interval
4242 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4243
4244 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4245 xops[0] = reg1;
4246 HOST_WIDE_INT interval;
4247 if (flag_stack_clash_protection)
4248 interval = stack_clash_probe_interval;
4249 else
4250 interval = PROBE_INTERVAL;
4251
4252 gcc_assert (aarch64_uimm12_shift (interval));
4253 xops[1] = GEN_INT (interval);
4254
4255 output_asm_insn ("sub\t%0, %0, %1", xops);
4256
4257 /* If doing stack clash protection then we probe up by the ABI specified
4258 amount. We do this because we're dropping full pages at a time in the
4259 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4260 if (flag_stack_clash_protection)
4261 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4262 else
4263 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4264
4265 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4266 by this amount for each iteration. */
4267 output_asm_insn ("str\txzr, [%0, %1]", xops);
4268
4269 /* Test if TEST_ADDR == LAST_ADDR. */
4270 xops[1] = reg2;
4271 output_asm_insn ("cmp\t%0, %1", xops);
4272
4273 /* Branch. */
4274 fputs ("\tb.ne\t", asm_out_file);
4275 assemble_name_raw (asm_out_file, loop_lab);
4276 fputc ('\n', asm_out_file);
4277
4278 return "";
4279 }
4280
4281 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4282 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4283 of GUARD_SIZE. When a probe is emitted it is done at most
4284 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4285 at most MIN_PROBE_THRESHOLD. By the end of this function
4286 BASE = BASE - ADJUSTMENT. */
4287
4288 const char *
4289 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4290 rtx min_probe_threshold, rtx guard_size)
4291 {
4292 /* This function is not allowed to use any instruction generation function
4293 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4294 so instead emit the code you want using output_asm_insn. */
4295 gcc_assert (flag_stack_clash_protection);
4296 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4297 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4298
4299 /* The minimum required allocation before the residual requires probing. */
4300 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4301
4302 /* Clamp the value down to the nearest value that can be used with a cmp. */
4303 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4304 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4305
4306 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4307 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4308
4309 static int labelno = 0;
4310 char loop_start_lab[32];
4311 char loop_end_lab[32];
4312 rtx xops[2];
4313
4314 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4315 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4316
4317 /* Emit loop start label. */
4318 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4319
4320 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4321 xops[0] = adjustment;
4322 xops[1] = probe_offset_value_rtx;
4323 output_asm_insn ("cmp\t%0, %1", xops);
4324
4325 /* Branch to end if not enough adjustment to probe. */
4326 fputs ("\tb.lt\t", asm_out_file);
4327 assemble_name_raw (asm_out_file, loop_end_lab);
4328 fputc ('\n', asm_out_file);
4329
4330 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4331 xops[0] = base;
4332 xops[1] = probe_offset_value_rtx;
4333 output_asm_insn ("sub\t%0, %0, %1", xops);
4334
4335 /* Probe at BASE. */
4336 xops[1] = const0_rtx;
4337 output_asm_insn ("str\txzr, [%0, %1]", xops);
4338
4339 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4340 xops[0] = adjustment;
4341 xops[1] = probe_offset_value_rtx;
4342 output_asm_insn ("sub\t%0, %0, %1", xops);
4343
4344 /* Branch to start if still more bytes to allocate. */
4345 fputs ("\tb\t", asm_out_file);
4346 assemble_name_raw (asm_out_file, loop_start_lab);
4347 fputc ('\n', asm_out_file);
4348
4349 /* No probe leave. */
4350 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4351
4352 /* BASE = BASE - ADJUSTMENT. */
4353 xops[0] = base;
4354 xops[1] = adjustment;
4355 output_asm_insn ("sub\t%0, %0, %1", xops);
4356 return "";
4357 }
4358
4359 /* Determine whether a frame chain needs to be generated. */
4360 static bool
4361 aarch64_needs_frame_chain (void)
4362 {
4363 /* Force a frame chain for EH returns so the return address is at FP+8. */
4364 if (frame_pointer_needed || crtl->calls_eh_return)
4365 return true;
4366
4367 /* A leaf function cannot have calls or write LR. */
4368 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4369
4370 /* Don't use a frame chain in leaf functions if leaf frame pointers
4371 are disabled. */
4372 if (flag_omit_leaf_frame_pointer && is_leaf)
4373 return false;
4374
4375 return aarch64_use_frame_pointer;
4376 }
4377
4378 /* Mark the registers that need to be saved by the callee and calculate
4379 the size of the callee-saved registers area and frame record (both FP
4380 and LR may be omitted). */
4381 static void
4382 aarch64_layout_frame (void)
4383 {
4384 HOST_WIDE_INT offset = 0;
4385 int regno, last_fp_reg = INVALID_REGNUM;
4386 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4387
4388 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4389
4390 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4391 the mid-end is doing. */
4392 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4393
4394 #define SLOT_NOT_REQUIRED (-2)
4395 #define SLOT_REQUIRED (-1)
4396
4397 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4398 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4399
4400 /* If this is a non-leaf simd function with calls we assume that
4401 at least one of those calls is to a non-simd function and thus
4402 we must save V8 to V23 in the prologue. */
4403
4404 if (simd_function && !crtl->is_leaf)
4405 {
4406 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4407 if (FP_SIMD_SAVED_REGNUM_P (regno))
4408 df_set_regs_ever_live (regno, true);
4409 }
4410
4411 /* First mark all the registers that really need to be saved... */
4412 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4413 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4414
4415 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4416 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4417
4418 /* ... that includes the eh data registers (if needed)... */
4419 if (crtl->calls_eh_return)
4420 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4421 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4422 = SLOT_REQUIRED;
4423
4424 /* ... and any callee saved register that dataflow says is live. */
4425 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4426 if (df_regs_ever_live_p (regno)
4427 && (regno == R30_REGNUM
4428 || !call_used_regs[regno]))
4429 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4430
4431 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4432 if (df_regs_ever_live_p (regno)
4433 && (!call_used_regs[regno]
4434 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4435 {
4436 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4437 last_fp_reg = regno;
4438 }
4439
4440 if (cfun->machine->frame.emit_frame_chain)
4441 {
4442 /* FP and LR are placed in the linkage record. */
4443 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4444 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4445 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4446 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4447 offset = 2 * UNITS_PER_WORD;
4448 }
4449
4450 /* With stack-clash, LR must be saved in non-leaf functions. */
4451 gcc_assert (crtl->is_leaf
4452 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4453 != SLOT_NOT_REQUIRED));
4454
4455 /* Now assign stack slots for them. */
4456 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4457 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4458 {
4459 cfun->machine->frame.reg_offset[regno] = offset;
4460 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4461 cfun->machine->frame.wb_candidate1 = regno;
4462 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4463 cfun->machine->frame.wb_candidate2 = regno;
4464 offset += UNITS_PER_WORD;
4465 }
4466
4467 HOST_WIDE_INT max_int_offset = offset;
4468 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4469 bool has_align_gap = offset != max_int_offset;
4470
4471 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4472 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4473 {
4474 /* If there is an alignment gap between integer and fp callee-saves,
4475 allocate the last fp register to it if possible. */
4476 if (regno == last_fp_reg
4477 && has_align_gap
4478 && !simd_function
4479 && (offset & 8) == 0)
4480 {
4481 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4482 break;
4483 }
4484
4485 cfun->machine->frame.reg_offset[regno] = offset;
4486 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4487 cfun->machine->frame.wb_candidate1 = regno;
4488 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4489 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4490 cfun->machine->frame.wb_candidate2 = regno;
4491 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4492 }
4493
4494 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4495
4496 cfun->machine->frame.saved_regs_size = offset;
4497
4498 HOST_WIDE_INT varargs_and_saved_regs_size
4499 = offset + cfun->machine->frame.saved_varargs_size;
4500
4501 cfun->machine->frame.hard_fp_offset
4502 = aligned_upper_bound (varargs_and_saved_regs_size
4503 + get_frame_size (),
4504 STACK_BOUNDARY / BITS_PER_UNIT);
4505
4506 /* Both these values are already aligned. */
4507 gcc_assert (multiple_p (crtl->outgoing_args_size,
4508 STACK_BOUNDARY / BITS_PER_UNIT));
4509 cfun->machine->frame.frame_size
4510 = (cfun->machine->frame.hard_fp_offset
4511 + crtl->outgoing_args_size);
4512
4513 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4514
4515 cfun->machine->frame.initial_adjust = 0;
4516 cfun->machine->frame.final_adjust = 0;
4517 cfun->machine->frame.callee_adjust = 0;
4518 cfun->machine->frame.callee_offset = 0;
4519
4520 HOST_WIDE_INT max_push_offset = 0;
4521 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4522 max_push_offset = 512;
4523 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4524 max_push_offset = 256;
4525
4526 HOST_WIDE_INT const_size, const_fp_offset;
4527 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4528 && const_size < max_push_offset
4529 && known_eq (crtl->outgoing_args_size, 0))
4530 {
4531 /* Simple, small frame with no outgoing arguments:
4532 stp reg1, reg2, [sp, -frame_size]!
4533 stp reg3, reg4, [sp, 16] */
4534 cfun->machine->frame.callee_adjust = const_size;
4535 }
4536 else if (known_lt (crtl->outgoing_args_size
4537 + cfun->machine->frame.saved_regs_size, 512)
4538 && !(cfun->calls_alloca
4539 && known_lt (cfun->machine->frame.hard_fp_offset,
4540 max_push_offset)))
4541 {
4542 /* Frame with small outgoing arguments:
4543 sub sp, sp, frame_size
4544 stp reg1, reg2, [sp, outgoing_args_size]
4545 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4546 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4547 cfun->machine->frame.callee_offset
4548 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4549 }
4550 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4551 && const_fp_offset < max_push_offset)
4552 {
4553 /* Frame with large outgoing arguments but a small local area:
4554 stp reg1, reg2, [sp, -hard_fp_offset]!
4555 stp reg3, reg4, [sp, 16]
4556 sub sp, sp, outgoing_args_size */
4557 cfun->machine->frame.callee_adjust = const_fp_offset;
4558 cfun->machine->frame.final_adjust
4559 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4560 }
4561 else
4562 {
4563 /* Frame with large local area and outgoing arguments using frame pointer:
4564 sub sp, sp, hard_fp_offset
4565 stp x29, x30, [sp, 0]
4566 add x29, sp, 0
4567 stp reg3, reg4, [sp, 16]
4568 sub sp, sp, outgoing_args_size */
4569 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4570 cfun->machine->frame.final_adjust
4571 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4572 }
4573
4574 cfun->machine->frame.laid_out = true;
4575 }
4576
4577 /* Return true if the register REGNO is saved on entry to
4578 the current function. */
4579
4580 static bool
4581 aarch64_register_saved_on_entry (int regno)
4582 {
4583 return cfun->machine->frame.reg_offset[regno] >= 0;
4584 }
4585
4586 /* Return the next register up from REGNO up to LIMIT for the callee
4587 to save. */
4588
4589 static unsigned
4590 aarch64_next_callee_save (unsigned regno, unsigned limit)
4591 {
4592 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4593 regno ++;
4594 return regno;
4595 }
4596
4597 /* Push the register number REGNO of mode MODE to the stack with write-back
4598 adjusting the stack by ADJUSTMENT. */
4599
4600 static void
4601 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4602 HOST_WIDE_INT adjustment)
4603 {
4604 rtx base_rtx = stack_pointer_rtx;
4605 rtx insn, reg, mem;
4606
4607 reg = gen_rtx_REG (mode, regno);
4608 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4609 plus_constant (Pmode, base_rtx, -adjustment));
4610 mem = gen_frame_mem (mode, mem);
4611
4612 insn = emit_move_insn (mem, reg);
4613 RTX_FRAME_RELATED_P (insn) = 1;
4614 }
4615
4616 /* Generate and return an instruction to store the pair of registers
4617 REG and REG2 of mode MODE to location BASE with write-back adjusting
4618 the stack location BASE by ADJUSTMENT. */
4619
4620 static rtx
4621 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4622 HOST_WIDE_INT adjustment)
4623 {
4624 switch (mode)
4625 {
4626 case E_DImode:
4627 return gen_storewb_pairdi_di (base, base, reg, reg2,
4628 GEN_INT (-adjustment),
4629 GEN_INT (UNITS_PER_WORD - adjustment));
4630 case E_DFmode:
4631 return gen_storewb_pairdf_di (base, base, reg, reg2,
4632 GEN_INT (-adjustment),
4633 GEN_INT (UNITS_PER_WORD - adjustment));
4634 case E_TFmode:
4635 return gen_storewb_pairtf_di (base, base, reg, reg2,
4636 GEN_INT (-adjustment),
4637 GEN_INT (UNITS_PER_VREG - adjustment));
4638 default:
4639 gcc_unreachable ();
4640 }
4641 }
4642
4643 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4644 stack pointer by ADJUSTMENT. */
4645
4646 static void
4647 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4648 {
4649 rtx_insn *insn;
4650 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4651
4652 if (regno2 == INVALID_REGNUM)
4653 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4654
4655 rtx reg1 = gen_rtx_REG (mode, regno1);
4656 rtx reg2 = gen_rtx_REG (mode, regno2);
4657
4658 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4659 reg2, adjustment));
4660 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4661 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4662 RTX_FRAME_RELATED_P (insn) = 1;
4663 }
4664
4665 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4666 adjusting it by ADJUSTMENT afterwards. */
4667
4668 static rtx
4669 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4670 HOST_WIDE_INT adjustment)
4671 {
4672 switch (mode)
4673 {
4674 case E_DImode:
4675 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4676 GEN_INT (UNITS_PER_WORD));
4677 case E_DFmode:
4678 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4679 GEN_INT (UNITS_PER_WORD));
4680 case E_TFmode:
4681 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4682 GEN_INT (UNITS_PER_VREG));
4683 default:
4684 gcc_unreachable ();
4685 }
4686 }
4687
4688 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4689 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4690 into CFI_OPS. */
4691
4692 static void
4693 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4694 rtx *cfi_ops)
4695 {
4696 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4697 rtx reg1 = gen_rtx_REG (mode, regno1);
4698
4699 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4700
4701 if (regno2 == INVALID_REGNUM)
4702 {
4703 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4704 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4705 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4706 }
4707 else
4708 {
4709 rtx reg2 = gen_rtx_REG (mode, regno2);
4710 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4711 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4712 reg2, adjustment));
4713 }
4714 }
4715
4716 /* Generate and return a store pair instruction of mode MODE to store
4717 register REG1 to MEM1 and register REG2 to MEM2. */
4718
4719 static rtx
4720 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4721 rtx reg2)
4722 {
4723 switch (mode)
4724 {
4725 case E_DImode:
4726 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4727
4728 case E_DFmode:
4729 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4730
4731 case E_TFmode:
4732 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4733
4734 default:
4735 gcc_unreachable ();
4736 }
4737 }
4738
4739 /* Generate and regurn a load pair isntruction of mode MODE to load register
4740 REG1 from MEM1 and register REG2 from MEM2. */
4741
4742 static rtx
4743 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4744 rtx mem2)
4745 {
4746 switch (mode)
4747 {
4748 case E_DImode:
4749 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4750
4751 case E_DFmode:
4752 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4753
4754 case E_TFmode:
4755 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4756
4757 default:
4758 gcc_unreachable ();
4759 }
4760 }
4761
4762 /* Return TRUE if return address signing should be enabled for the current
4763 function, otherwise return FALSE. */
4764
4765 bool
4766 aarch64_return_address_signing_enabled (void)
4767 {
4768 /* This function should only be called after frame laid out. */
4769 gcc_assert (cfun->machine->frame.laid_out);
4770
4771 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4772 if it's LR is pushed onto stack. */
4773 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4774 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4775 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4776 }
4777
4778 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4779 bool
4780 aarch64_bti_enabled (void)
4781 {
4782 return (aarch64_enable_bti == 1);
4783 }
4784
4785 /* Emit code to save the callee-saved registers from register number START
4786 to LIMIT to the stack at the location starting at offset START_OFFSET,
4787 skipping any write-back candidates if SKIP_WB is true. */
4788
4789 static void
4790 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4791 unsigned start, unsigned limit, bool skip_wb)
4792 {
4793 rtx_insn *insn;
4794 unsigned regno;
4795 unsigned regno2;
4796
4797 for (regno = aarch64_next_callee_save (start, limit);
4798 regno <= limit;
4799 regno = aarch64_next_callee_save (regno + 1, limit))
4800 {
4801 rtx reg, mem;
4802 poly_int64 offset;
4803 int offset_diff;
4804
4805 if (skip_wb
4806 && (regno == cfun->machine->frame.wb_candidate1
4807 || regno == cfun->machine->frame.wb_candidate2))
4808 continue;
4809
4810 if (cfun->machine->reg_is_wrapped_separately[regno])
4811 continue;
4812
4813 reg = gen_rtx_REG (mode, regno);
4814 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4815 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4816 offset));
4817
4818 regno2 = aarch64_next_callee_save (regno + 1, limit);
4819 offset_diff = cfun->machine->frame.reg_offset[regno2]
4820 - cfun->machine->frame.reg_offset[regno];
4821
4822 if (regno2 <= limit
4823 && !cfun->machine->reg_is_wrapped_separately[regno2]
4824 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4825 {
4826 rtx reg2 = gen_rtx_REG (mode, regno2);
4827 rtx mem2;
4828
4829 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4830 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4831 offset));
4832 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4833 reg2));
4834
4835 /* The first part of a frame-related parallel insn is
4836 always assumed to be relevant to the frame
4837 calculations; subsequent parts, are only
4838 frame-related if explicitly marked. */
4839 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4840 regno = regno2;
4841 }
4842 else
4843 insn = emit_move_insn (mem, reg);
4844
4845 RTX_FRAME_RELATED_P (insn) = 1;
4846 }
4847 }
4848
4849 /* Emit code to restore the callee registers of mode MODE from register
4850 number START up to and including LIMIT. Restore from the stack offset
4851 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4852 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4853
4854 static void
4855 aarch64_restore_callee_saves (machine_mode mode,
4856 poly_int64 start_offset, unsigned start,
4857 unsigned limit, bool skip_wb, rtx *cfi_ops)
4858 {
4859 rtx base_rtx = stack_pointer_rtx;
4860 unsigned regno;
4861 unsigned regno2;
4862 poly_int64 offset;
4863
4864 for (regno = aarch64_next_callee_save (start, limit);
4865 regno <= limit;
4866 regno = aarch64_next_callee_save (regno + 1, limit))
4867 {
4868 if (cfun->machine->reg_is_wrapped_separately[regno])
4869 continue;
4870
4871 rtx reg, mem;
4872 int offset_diff;
4873
4874 if (skip_wb
4875 && (regno == cfun->machine->frame.wb_candidate1
4876 || regno == cfun->machine->frame.wb_candidate2))
4877 continue;
4878
4879 reg = gen_rtx_REG (mode, regno);
4880 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4881 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4882
4883 regno2 = aarch64_next_callee_save (regno + 1, limit);
4884 offset_diff = cfun->machine->frame.reg_offset[regno2]
4885 - cfun->machine->frame.reg_offset[regno];
4886
4887 if (regno2 <= limit
4888 && !cfun->machine->reg_is_wrapped_separately[regno2]
4889 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4890 {
4891 rtx reg2 = gen_rtx_REG (mode, regno2);
4892 rtx mem2;
4893
4894 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4895 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4896 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4897
4898 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4899 regno = regno2;
4900 }
4901 else
4902 emit_move_insn (reg, mem);
4903 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4904 }
4905 }
4906
4907 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4908 of MODE. */
4909
4910 static inline bool
4911 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4912 {
4913 HOST_WIDE_INT multiple;
4914 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4915 && IN_RANGE (multiple, -8, 7));
4916 }
4917
4918 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
4919 of MODE. */
4920
4921 static inline bool
4922 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4923 {
4924 HOST_WIDE_INT multiple;
4925 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4926 && IN_RANGE (multiple, 0, 63));
4927 }
4928
4929 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4930 of MODE. */
4931
4932 bool
4933 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4934 {
4935 HOST_WIDE_INT multiple;
4936 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4937 && IN_RANGE (multiple, -64, 63));
4938 }
4939
4940 /* Return true if OFFSET is a signed 9-bit value. */
4941
4942 bool
4943 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4944 poly_int64 offset)
4945 {
4946 HOST_WIDE_INT const_offset;
4947 return (offset.is_constant (&const_offset)
4948 && IN_RANGE (const_offset, -256, 255));
4949 }
4950
4951 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4952 of MODE. */
4953
4954 static inline bool
4955 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4956 {
4957 HOST_WIDE_INT multiple;
4958 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4959 && IN_RANGE (multiple, -256, 255));
4960 }
4961
4962 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4963 of MODE. */
4964
4965 static inline bool
4966 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4967 {
4968 HOST_WIDE_INT multiple;
4969 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4970 && IN_RANGE (multiple, 0, 4095));
4971 }
4972
4973 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4974
4975 static sbitmap
4976 aarch64_get_separate_components (void)
4977 {
4978 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4979 bitmap_clear (components);
4980
4981 /* The registers we need saved to the frame. */
4982 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4983 if (aarch64_register_saved_on_entry (regno))
4984 {
4985 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4986 if (!frame_pointer_needed)
4987 offset += cfun->machine->frame.frame_size
4988 - cfun->machine->frame.hard_fp_offset;
4989 /* Check that we can access the stack slot of the register with one
4990 direct load with no adjustments needed. */
4991 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4992 bitmap_set_bit (components, regno);
4993 }
4994
4995 /* Don't mess with the hard frame pointer. */
4996 if (frame_pointer_needed)
4997 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4998
4999 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5000 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5001 /* If registers have been chosen to be stored/restored with
5002 writeback don't interfere with them to avoid having to output explicit
5003 stack adjustment instructions. */
5004 if (reg2 != INVALID_REGNUM)
5005 bitmap_clear_bit (components, reg2);
5006 if (reg1 != INVALID_REGNUM)
5007 bitmap_clear_bit (components, reg1);
5008
5009 bitmap_clear_bit (components, LR_REGNUM);
5010 bitmap_clear_bit (components, SP_REGNUM);
5011
5012 return components;
5013 }
5014
5015 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5016
5017 static sbitmap
5018 aarch64_components_for_bb (basic_block bb)
5019 {
5020 bitmap in = DF_LIVE_IN (bb);
5021 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5022 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5023 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5024
5025 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5026 bitmap_clear (components);
5027
5028 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5029 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5030 if ((!call_used_regs[regno]
5031 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5032 && (bitmap_bit_p (in, regno)
5033 || bitmap_bit_p (gen, regno)
5034 || bitmap_bit_p (kill, regno)))
5035 {
5036 unsigned regno2, offset, offset2;
5037 bitmap_set_bit (components, regno);
5038
5039 /* If there is a callee-save at an adjacent offset, add it too
5040 to increase the use of LDP/STP. */
5041 offset = cfun->machine->frame.reg_offset[regno];
5042 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5043
5044 if (regno2 <= LAST_SAVED_REGNUM)
5045 {
5046 offset2 = cfun->machine->frame.reg_offset[regno2];
5047 if ((offset & ~8) == (offset2 & ~8))
5048 bitmap_set_bit (components, regno2);
5049 }
5050 }
5051
5052 return components;
5053 }
5054
5055 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5056 Nothing to do for aarch64. */
5057
5058 static void
5059 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5060 {
5061 }
5062
5063 /* Return the next set bit in BMP from START onwards. Return the total number
5064 of bits in BMP if no set bit is found at or after START. */
5065
5066 static unsigned int
5067 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5068 {
5069 unsigned int nbits = SBITMAP_SIZE (bmp);
5070 if (start == nbits)
5071 return start;
5072
5073 gcc_assert (start < nbits);
5074 for (unsigned int i = start; i < nbits; i++)
5075 if (bitmap_bit_p (bmp, i))
5076 return i;
5077
5078 return nbits;
5079 }
5080
5081 /* Do the work for aarch64_emit_prologue_components and
5082 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5083 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5084 for these components or the epilogue sequence. That is, it determines
5085 whether we should emit stores or loads and what kind of CFA notes to attach
5086 to the insns. Otherwise the logic for the two sequences is very
5087 similar. */
5088
5089 static void
5090 aarch64_process_components (sbitmap components, bool prologue_p)
5091 {
5092 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5093 ? HARD_FRAME_POINTER_REGNUM
5094 : STACK_POINTER_REGNUM);
5095
5096 unsigned last_regno = SBITMAP_SIZE (components);
5097 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5098 rtx_insn *insn = NULL;
5099
5100 while (regno != last_regno)
5101 {
5102 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5103 so DFmode for the vector registers is enough. For simd functions
5104 we want to save the low 128 bits. */
5105 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5106
5107 rtx reg = gen_rtx_REG (mode, regno);
5108 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5109 if (!frame_pointer_needed)
5110 offset += cfun->machine->frame.frame_size
5111 - cfun->machine->frame.hard_fp_offset;
5112 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5113 rtx mem = gen_frame_mem (mode, addr);
5114
5115 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5116 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5117 /* No more registers to handle after REGNO.
5118 Emit a single save/restore and exit. */
5119 if (regno2 == last_regno)
5120 {
5121 insn = emit_insn (set);
5122 RTX_FRAME_RELATED_P (insn) = 1;
5123 if (prologue_p)
5124 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5125 else
5126 add_reg_note (insn, REG_CFA_RESTORE, reg);
5127 break;
5128 }
5129
5130 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5131 /* The next register is not of the same class or its offset is not
5132 mergeable with the current one into a pair. */
5133 if (!satisfies_constraint_Ump (mem)
5134 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5135 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5136 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5137 GET_MODE_SIZE (mode)))
5138 {
5139 insn = emit_insn (set);
5140 RTX_FRAME_RELATED_P (insn) = 1;
5141 if (prologue_p)
5142 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5143 else
5144 add_reg_note (insn, REG_CFA_RESTORE, reg);
5145
5146 regno = regno2;
5147 continue;
5148 }
5149
5150 /* REGNO2 can be saved/restored in a pair with REGNO. */
5151 rtx reg2 = gen_rtx_REG (mode, regno2);
5152 if (!frame_pointer_needed)
5153 offset2 += cfun->machine->frame.frame_size
5154 - cfun->machine->frame.hard_fp_offset;
5155 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5156 rtx mem2 = gen_frame_mem (mode, addr2);
5157 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5158 : gen_rtx_SET (reg2, mem2);
5159
5160 if (prologue_p)
5161 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5162 else
5163 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5164
5165 RTX_FRAME_RELATED_P (insn) = 1;
5166 if (prologue_p)
5167 {
5168 add_reg_note (insn, REG_CFA_OFFSET, set);
5169 add_reg_note (insn, REG_CFA_OFFSET, set2);
5170 }
5171 else
5172 {
5173 add_reg_note (insn, REG_CFA_RESTORE, reg);
5174 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5175 }
5176
5177 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5178 }
5179 }
5180
5181 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5182
5183 static void
5184 aarch64_emit_prologue_components (sbitmap components)
5185 {
5186 aarch64_process_components (components, true);
5187 }
5188
5189 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5190
5191 static void
5192 aarch64_emit_epilogue_components (sbitmap components)
5193 {
5194 aarch64_process_components (components, false);
5195 }
5196
5197 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5198
5199 static void
5200 aarch64_set_handled_components (sbitmap components)
5201 {
5202 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5203 if (bitmap_bit_p (components, regno))
5204 cfun->machine->reg_is_wrapped_separately[regno] = true;
5205 }
5206
5207 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
5208 determining the probe offset for alloca. */
5209
5210 static HOST_WIDE_INT
5211 aarch64_stack_clash_protection_alloca_probe_range (void)
5212 {
5213 return STACK_CLASH_CALLER_GUARD;
5214 }
5215
5216
5217 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5218 registers. If POLY_SIZE is not large enough to require a probe this function
5219 will only adjust the stack. When allocating the stack space
5220 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5221 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5222 arguments. If we are then we ensure that any allocation larger than the ABI
5223 defined buffer needs a probe so that the invariant of having a 1KB buffer is
5224 maintained.
5225
5226 We emit barriers after each stack adjustment to prevent optimizations from
5227 breaking the invariant that we never drop the stack more than a page. This
5228 invariant is needed to make it easier to correctly handle asynchronous
5229 events, e.g. if we were to allow the stack to be dropped by more than a page
5230 and then have multiple probes up and we take a signal somewhere in between
5231 then the signal handler doesn't know the state of the stack and can make no
5232 assumptions about which pages have been probed. */
5233
5234 static void
5235 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5236 poly_int64 poly_size,
5237 bool frame_related_p,
5238 bool final_adjustment_p)
5239 {
5240 HOST_WIDE_INT guard_size
5241 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5242 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5243 /* When doing the final adjustment for the outgoing argument size we can't
5244 assume that LR was saved at position 0. So subtract it's offset from the
5245 ABI safe buffer so that we don't accidentally allow an adjustment that
5246 would result in an allocation larger than the ABI buffer without
5247 probing. */
5248 HOST_WIDE_INT min_probe_threshold
5249 = final_adjustment_p
5250 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5251 : guard_size - guard_used_by_caller;
5252
5253 poly_int64 frame_size = cfun->machine->frame.frame_size;
5254
5255 /* We should always have a positive probe threshold. */
5256 gcc_assert (min_probe_threshold > 0);
5257
5258 if (flag_stack_clash_protection && !final_adjustment_p)
5259 {
5260 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5261 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5262
5263 if (known_eq (frame_size, 0))
5264 {
5265 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5266 }
5267 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5268 && known_lt (final_adjust, guard_used_by_caller))
5269 {
5270 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5271 }
5272 }
5273
5274 /* If SIZE is not large enough to require probing, just adjust the stack and
5275 exit. */
5276 if (known_lt (poly_size, min_probe_threshold)
5277 || !flag_stack_clash_protection)
5278 {
5279 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5280 return;
5281 }
5282
5283 HOST_WIDE_INT size;
5284 /* Handle the SVE non-constant case first. */
5285 if (!poly_size.is_constant (&size))
5286 {
5287 if (dump_file)
5288 {
5289 fprintf (dump_file, "Stack clash SVE prologue: ");
5290 print_dec (poly_size, dump_file);
5291 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5292 }
5293
5294 /* First calculate the amount of bytes we're actually spilling. */
5295 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5296 poly_size, temp1, temp2, false, true);
5297
5298 rtx_insn *insn = get_last_insn ();
5299
5300 if (frame_related_p)
5301 {
5302 /* This is done to provide unwinding information for the stack
5303 adjustments we're about to do, however to prevent the optimizers
5304 from removing the R15 move and leaving the CFA note (which would be
5305 very wrong) we tie the old and new stack pointer together.
5306 The tie will expand to nothing but the optimizers will not touch
5307 the instruction. */
5308 rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
5309 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5310 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5311
5312 /* We want the CFA independent of the stack pointer for the
5313 duration of the loop. */
5314 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5315 RTX_FRAME_RELATED_P (insn) = 1;
5316 }
5317
5318 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5319 rtx guard_const = gen_int_mode (guard_size, Pmode);
5320
5321 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5322 stack_pointer_rtx, temp1,
5323 probe_const, guard_const));
5324
5325 /* Now reset the CFA register if needed. */
5326 if (frame_related_p)
5327 {
5328 add_reg_note (insn, REG_CFA_DEF_CFA,
5329 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5330 gen_int_mode (poly_size, Pmode)));
5331 RTX_FRAME_RELATED_P (insn) = 1;
5332 }
5333
5334 return;
5335 }
5336
5337 if (dump_file)
5338 fprintf (dump_file,
5339 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5340 " bytes, probing will be required.\n", size);
5341
5342 /* Round size to the nearest multiple of guard_size, and calculate the
5343 residual as the difference between the original size and the rounded
5344 size. */
5345 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5346 HOST_WIDE_INT residual = size - rounded_size;
5347
5348 /* We can handle a small number of allocations/probes inline. Otherwise
5349 punt to a loop. */
5350 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5351 {
5352 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5353 {
5354 aarch64_sub_sp (NULL, temp2, guard_size, true);
5355 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5356 guard_used_by_caller));
5357 emit_insn (gen_blockage ());
5358 }
5359 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5360 }
5361 else
5362 {
5363 /* Compute the ending address. */
5364 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5365 temp1, NULL, false, true);
5366 rtx_insn *insn = get_last_insn ();
5367
5368 /* For the initial allocation, we don't have a frame pointer
5369 set up, so we always need CFI notes. If we're doing the
5370 final allocation, then we may have a frame pointer, in which
5371 case it is the CFA, otherwise we need CFI notes.
5372
5373 We can determine which allocation we are doing by looking at
5374 the value of FRAME_RELATED_P since the final allocations are not
5375 frame related. */
5376 if (frame_related_p)
5377 {
5378 /* We want the CFA independent of the stack pointer for the
5379 duration of the loop. */
5380 add_reg_note (insn, REG_CFA_DEF_CFA,
5381 plus_constant (Pmode, temp1, rounded_size));
5382 RTX_FRAME_RELATED_P (insn) = 1;
5383 }
5384
5385 /* This allocates and probes the stack. Note that this re-uses some of
5386 the existing Ada stack protection code. However we are guaranteed not
5387 to enter the non loop or residual branches of that code.
5388
5389 The non-loop part won't be entered because if our allocation amount
5390 doesn't require a loop, the case above would handle it.
5391
5392 The residual amount won't be entered because TEMP1 is a mutliple of
5393 the allocation size. The residual will always be 0. As such, the only
5394 part we are actually using from that code is the loop setup. The
5395 actual probing is done in aarch64_output_probe_stack_range. */
5396 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5397 stack_pointer_rtx, temp1));
5398
5399 /* Now reset the CFA register if needed. */
5400 if (frame_related_p)
5401 {
5402 add_reg_note (insn, REG_CFA_DEF_CFA,
5403 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5404 RTX_FRAME_RELATED_P (insn) = 1;
5405 }
5406
5407 emit_insn (gen_blockage ());
5408 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5409 }
5410
5411 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5412 be probed. This maintains the requirement that each page is probed at
5413 least once. For initial probing we probe only if the allocation is
5414 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5415 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5416 GUARD_SIZE. This works that for any allocation that is large enough to
5417 trigger a probe here, we'll have at least one, and if they're not large
5418 enough for this code to emit anything for them, The page would have been
5419 probed by the saving of FP/LR either by this function or any callees. If
5420 we don't have any callees then we won't have more stack adjustments and so
5421 are still safe. */
5422 if (residual)
5423 {
5424 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5425 /* If we're doing final adjustments, and we've done any full page
5426 allocations then any residual needs to be probed. */
5427 if (final_adjustment_p && rounded_size != 0)
5428 min_probe_threshold = 0;
5429 /* If doing a small final adjustment, we always probe at offset 0.
5430 This is done to avoid issues when LR is not at position 0 or when
5431 the final adjustment is smaller than the probing offset. */
5432 else if (final_adjustment_p && rounded_size == 0)
5433 residual_probe_offset = 0;
5434
5435 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5436 if (residual >= min_probe_threshold)
5437 {
5438 if (dump_file)
5439 fprintf (dump_file,
5440 "Stack clash AArch64 prologue residuals: "
5441 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5442 "\n", residual);
5443
5444 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5445 residual_probe_offset));
5446 emit_insn (gen_blockage ());
5447 }
5448 }
5449 }
5450
5451 /* Return 1 if the register is used by the epilogue. We need to say the
5452 return register is used, but only after epilogue generation is complete.
5453 Note that in the case of sibcalls, the values "used by the epilogue" are
5454 considered live at the start of the called function.
5455
5456 For SIMD functions we need to return 1 for FP registers that are saved and
5457 restored by a function but are not zero in call_used_regs. If we do not do
5458 this optimizations may remove the restore of the register. */
5459
5460 int
5461 aarch64_epilogue_uses (int regno)
5462 {
5463 if (epilogue_completed)
5464 {
5465 if (regno == LR_REGNUM)
5466 return 1;
5467 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5468 return 1;
5469 }
5470 return 0;
5471 }
5472
5473 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5474 is saved at BASE + OFFSET. */
5475
5476 static void
5477 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5478 rtx base, poly_int64 offset)
5479 {
5480 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5481 add_reg_note (insn, REG_CFA_EXPRESSION,
5482 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5483 }
5484
5485 /* AArch64 stack frames generated by this compiler look like:
5486
5487 +-------------------------------+
5488 | |
5489 | incoming stack arguments |
5490 | |
5491 +-------------------------------+
5492 | | <-- incoming stack pointer (aligned)
5493 | callee-allocated save area |
5494 | for register varargs |
5495 | |
5496 +-------------------------------+
5497 | local variables | <-- frame_pointer_rtx
5498 | |
5499 +-------------------------------+
5500 | padding | \
5501 +-------------------------------+ |
5502 | callee-saved registers | | frame.saved_regs_size
5503 +-------------------------------+ |
5504 | LR' | |
5505 +-------------------------------+ |
5506 | FP' | / <- hard_frame_pointer_rtx (aligned)
5507 +-------------------------------+
5508 | dynamic allocation |
5509 +-------------------------------+
5510 | padding |
5511 +-------------------------------+
5512 | outgoing stack arguments | <-- arg_pointer
5513 | |
5514 +-------------------------------+
5515 | | <-- stack_pointer_rtx (aligned)
5516
5517 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5518 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5519 unchanged.
5520
5521 By default for stack-clash we assume the guard is at least 64KB, but this
5522 value is configurable to either 4KB or 64KB. We also force the guard size to
5523 be the same as the probing interval and both values are kept in sync.
5524
5525 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5526 on the guard size) of stack space without probing.
5527
5528 When probing is needed, we emit a probe at the start of the prologue
5529 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5530
5531 We have to track how much space has been allocated and the only stores
5532 to the stack we track as implicit probes are the FP/LR stores.
5533
5534 For outgoing arguments we probe if the size is larger than 1KB, such that
5535 the ABI specified buffer is maintained for the next callee. */
5536
5537 /* Generate the prologue instructions for entry into a function.
5538 Establish the stack frame by decreasing the stack pointer with a
5539 properly calculated size and, if necessary, create a frame record
5540 filled with the values of LR and previous frame pointer. The
5541 current FP is also set up if it is in use. */
5542
5543 void
5544 aarch64_expand_prologue (void)
5545 {
5546 poly_int64 frame_size = cfun->machine->frame.frame_size;
5547 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5548 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5549 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5550 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5551 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5552 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5553 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5554 rtx_insn *insn;
5555
5556 /* Sign return address for functions. */
5557 if (aarch64_return_address_signing_enabled ())
5558 {
5559 insn = emit_insn (gen_pacisp ());
5560 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5561 RTX_FRAME_RELATED_P (insn) = 1;
5562 }
5563
5564 if (flag_stack_usage_info)
5565 current_function_static_stack_size = constant_lower_bound (frame_size);
5566
5567 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5568 {
5569 if (crtl->is_leaf && !cfun->calls_alloca)
5570 {
5571 if (maybe_gt (frame_size, PROBE_INTERVAL)
5572 && maybe_gt (frame_size, get_stack_check_protect ()))
5573 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5574 (frame_size
5575 - get_stack_check_protect ()));
5576 }
5577 else if (maybe_gt (frame_size, 0))
5578 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5579 }
5580
5581 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5582 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5583
5584 /* In theory we should never have both an initial adjustment
5585 and a callee save adjustment. Verify that is the case since the
5586 code below does not handle it for -fstack-clash-protection. */
5587 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5588
5589 /* Will only probe if the initial adjustment is larger than the guard
5590 less the amount of the guard reserved for use by the caller's
5591 outgoing args. */
5592 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5593 true, false);
5594
5595 if (callee_adjust != 0)
5596 aarch64_push_regs (reg1, reg2, callee_adjust);
5597
5598 if (emit_frame_chain)
5599 {
5600 poly_int64 reg_offset = callee_adjust;
5601 if (callee_adjust == 0)
5602 {
5603 reg1 = R29_REGNUM;
5604 reg2 = R30_REGNUM;
5605 reg_offset = callee_offset;
5606 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5607 }
5608 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5609 stack_pointer_rtx, callee_offset,
5610 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5611 if (frame_pointer_needed && !frame_size.is_constant ())
5612 {
5613 /* Variable-sized frames need to describe the save slot
5614 address using DW_CFA_expression rather than DW_CFA_offset.
5615 This means that, without taking further action, the
5616 locations of the registers that we've already saved would
5617 remain based on the stack pointer even after we redefine
5618 the CFA based on the frame pointer. We therefore need new
5619 DW_CFA_expressions to re-express the save slots with addresses
5620 based on the frame pointer. */
5621 rtx_insn *insn = get_last_insn ();
5622 gcc_assert (RTX_FRAME_RELATED_P (insn));
5623
5624 /* Add an explicit CFA definition if this was previously
5625 implicit. */
5626 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5627 {
5628 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5629 callee_offset);
5630 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5631 gen_rtx_SET (hard_frame_pointer_rtx, src));
5632 }
5633
5634 /* Change the save slot expressions for the registers that
5635 we've already saved. */
5636 reg_offset -= callee_offset;
5637 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5638 reg_offset + UNITS_PER_WORD);
5639 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5640 reg_offset);
5641 }
5642 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5643 }
5644
5645 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5646 callee_adjust != 0 || emit_frame_chain);
5647 if (aarch64_simd_decl_p (cfun->decl))
5648 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5649 callee_adjust != 0 || emit_frame_chain);
5650 else
5651 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5652 callee_adjust != 0 || emit_frame_chain);
5653
5654 /* We may need to probe the final adjustment if it is larger than the guard
5655 that is assumed by the called. */
5656 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5657 !frame_pointer_needed, true);
5658 }
5659
5660 /* Return TRUE if we can use a simple_return insn.
5661
5662 This function checks whether the callee saved stack is empty, which
5663 means no restore actions are need. The pro_and_epilogue will use
5664 this to check whether shrink-wrapping opt is feasible. */
5665
5666 bool
5667 aarch64_use_return_insn_p (void)
5668 {
5669 if (!reload_completed)
5670 return false;
5671
5672 if (crtl->profile)
5673 return false;
5674
5675 return known_eq (cfun->machine->frame.frame_size, 0);
5676 }
5677
5678 /* Return false for non-leaf SIMD functions in order to avoid
5679 shrink-wrapping them. Doing this will lose the necessary
5680 save/restore of FP registers. */
5681
5682 bool
5683 aarch64_use_simple_return_insn_p (void)
5684 {
5685 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5686 return false;
5687
5688 return true;
5689 }
5690
5691 /* Generate the epilogue instructions for returning from a function.
5692 This is almost exactly the reverse of the prolog sequence, except
5693 that we need to insert barriers to avoid scheduling loads that read
5694 from a deallocated stack, and we optimize the unwind records by
5695 emitting them all together if possible. */
5696 void
5697 aarch64_expand_epilogue (bool for_sibcall)
5698 {
5699 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5700 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5701 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5702 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5703 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5704 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5705 rtx cfi_ops = NULL;
5706 rtx_insn *insn;
5707 /* A stack clash protection prologue may not have left EP0_REGNUM or
5708 EP1_REGNUM in a usable state. The same is true for allocations
5709 with an SVE component, since we then need both temporary registers
5710 for each allocation. For stack clash we are in a usable state if
5711 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5712 HOST_WIDE_INT guard_size
5713 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5714 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5715
5716 /* We can re-use the registers when the allocation amount is smaller than
5717 guard_size - guard_used_by_caller because we won't be doing any probes
5718 then. In such situations the register should remain live with the correct
5719 value. */
5720 bool can_inherit_p = (initial_adjust.is_constant ()
5721 && final_adjust.is_constant ())
5722 && (!flag_stack_clash_protection
5723 || known_lt (initial_adjust,
5724 guard_size - guard_used_by_caller));
5725
5726 /* We need to add memory barrier to prevent read from deallocated stack. */
5727 bool need_barrier_p
5728 = maybe_ne (get_frame_size ()
5729 + cfun->machine->frame.saved_varargs_size, 0);
5730
5731 /* Emit a barrier to prevent loads from a deallocated stack. */
5732 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5733 || cfun->calls_alloca
5734 || crtl->calls_eh_return)
5735 {
5736 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5737 need_barrier_p = false;
5738 }
5739
5740 /* Restore the stack pointer from the frame pointer if it may not
5741 be the same as the stack pointer. */
5742 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5743 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5744 if (frame_pointer_needed
5745 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5746 /* If writeback is used when restoring callee-saves, the CFA
5747 is restored on the instruction doing the writeback. */
5748 aarch64_add_offset (Pmode, stack_pointer_rtx,
5749 hard_frame_pointer_rtx, -callee_offset,
5750 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5751 else
5752 /* The case where we need to re-use the register here is very rare, so
5753 avoid the complicated condition and just always emit a move if the
5754 immediate doesn't fit. */
5755 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5756
5757 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5758 callee_adjust != 0, &cfi_ops);
5759 if (aarch64_simd_decl_p (cfun->decl))
5760 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5761 callee_adjust != 0, &cfi_ops);
5762 else
5763 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5764 callee_adjust != 0, &cfi_ops);
5765
5766 if (need_barrier_p)
5767 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5768
5769 if (callee_adjust != 0)
5770 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5771
5772 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5773 {
5774 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5775 insn = get_last_insn ();
5776 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5777 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5778 RTX_FRAME_RELATED_P (insn) = 1;
5779 cfi_ops = NULL;
5780 }
5781
5782 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
5783 add restriction on emit_move optimization to leaf functions. */
5784 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5785 (!can_inherit_p || !crtl->is_leaf
5786 || df_regs_ever_live_p (EP0_REGNUM)));
5787
5788 if (cfi_ops)
5789 {
5790 /* Emit delayed restores and reset the CFA to be SP. */
5791 insn = get_last_insn ();
5792 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5793 REG_NOTES (insn) = cfi_ops;
5794 RTX_FRAME_RELATED_P (insn) = 1;
5795 }
5796
5797 /* We prefer to emit the combined return/authenticate instruction RETAA,
5798 however there are three cases in which we must instead emit an explicit
5799 authentication instruction.
5800
5801 1) Sibcalls don't return in a normal way, so if we're about to call one
5802 we must authenticate.
5803
5804 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5805 generating code for !TARGET_ARMV8_3 we can't use it and must
5806 explicitly authenticate.
5807
5808 3) On an eh_return path we make extra stack adjustments to update the
5809 canonical frame address to be the exception handler's CFA. We want
5810 to authenticate using the CFA of the function which calls eh_return.
5811 */
5812 if (aarch64_return_address_signing_enabled ()
5813 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5814 {
5815 insn = emit_insn (gen_autisp ());
5816 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5817 RTX_FRAME_RELATED_P (insn) = 1;
5818 }
5819
5820 /* Stack adjustment for exception handler. */
5821 if (crtl->calls_eh_return)
5822 {
5823 /* We need to unwind the stack by the offset computed by
5824 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5825 to be SP; letting the CFA move during this adjustment
5826 is just as correct as retaining the CFA from the body
5827 of the function. Therefore, do nothing special. */
5828 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5829 }
5830
5831 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5832 if (!for_sibcall)
5833 emit_jump_insn (ret_rtx);
5834 }
5835
5836 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5837 normally or return to a previous frame after unwinding.
5838
5839 An EH return uses a single shared return sequence. The epilogue is
5840 exactly like a normal epilogue except that it has an extra input
5841 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5842 that must be applied after the frame has been destroyed. An extra label
5843 is inserted before the epilogue which initializes this register to zero,
5844 and this is the entry point for a normal return.
5845
5846 An actual EH return updates the return address, initializes the stack
5847 adjustment and jumps directly into the epilogue (bypassing the zeroing
5848 of the adjustment). Since the return address is typically saved on the
5849 stack when a function makes a call, the saved LR must be updated outside
5850 the epilogue.
5851
5852 This poses problems as the store is generated well before the epilogue,
5853 so the offset of LR is not known yet. Also optimizations will remove the
5854 store as it appears dead, even after the epilogue is generated (as the
5855 base or offset for loading LR is different in many cases).
5856
5857 To avoid these problems this implementation forces the frame pointer
5858 in eh_return functions so that the location of LR is fixed and known early.
5859 It also marks the store volatile, so no optimization is permitted to
5860 remove the store. */
5861 rtx
5862 aarch64_eh_return_handler_rtx (void)
5863 {
5864 rtx tmp = gen_frame_mem (Pmode,
5865 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5866
5867 /* Mark the store volatile, so no optimization is permitted to remove it. */
5868 MEM_VOLATILE_P (tmp) = true;
5869 return tmp;
5870 }
5871
5872 /* Output code to add DELTA to the first argument, and then jump
5873 to FUNCTION. Used for C++ multiple inheritance. */
5874 static void
5875 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5876 HOST_WIDE_INT delta,
5877 HOST_WIDE_INT vcall_offset,
5878 tree function)
5879 {
5880 /* The this pointer is always in x0. Note that this differs from
5881 Arm where the this pointer maybe bumped to r1 if r0 is required
5882 to return a pointer to an aggregate. On AArch64 a result value
5883 pointer will be in x8. */
5884 int this_regno = R0_REGNUM;
5885 rtx this_rtx, temp0, temp1, addr, funexp;
5886 rtx_insn *insn;
5887
5888 reload_completed = 1;
5889 emit_note (NOTE_INSN_PROLOGUE_END);
5890
5891 this_rtx = gen_rtx_REG (Pmode, this_regno);
5892 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
5893 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
5894
5895 if (vcall_offset == 0)
5896 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5897 else
5898 {
5899 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5900
5901 addr = this_rtx;
5902 if (delta != 0)
5903 {
5904 if (delta >= -256 && delta < 256)
5905 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5906 plus_constant (Pmode, this_rtx, delta));
5907 else
5908 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5909 temp1, temp0, false);
5910 }
5911
5912 if (Pmode == ptr_mode)
5913 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5914 else
5915 aarch64_emit_move (temp0,
5916 gen_rtx_ZERO_EXTEND (Pmode,
5917 gen_rtx_MEM (ptr_mode, addr)));
5918
5919 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5920 addr = plus_constant (Pmode, temp0, vcall_offset);
5921 else
5922 {
5923 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5924 Pmode);
5925 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5926 }
5927
5928 if (Pmode == ptr_mode)
5929 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5930 else
5931 aarch64_emit_move (temp1,
5932 gen_rtx_SIGN_EXTEND (Pmode,
5933 gen_rtx_MEM (ptr_mode, addr)));
5934
5935 emit_insn (gen_add2_insn (this_rtx, temp1));
5936 }
5937
5938 /* Generate a tail call to the target function. */
5939 if (!TREE_USED (function))
5940 {
5941 assemble_external (function);
5942 TREE_USED (function) = 1;
5943 }
5944 funexp = XEXP (DECL_RTL (function), 0);
5945 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5946 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5947 SIBLING_CALL_P (insn) = 1;
5948
5949 insn = get_insns ();
5950 shorten_branches (insn);
5951 final_start_function (insn, file, 1);
5952 final (insn, file, 1);
5953 final_end_function ();
5954
5955 /* Stop pretending to be a post-reload pass. */
5956 reload_completed = 0;
5957 }
5958
5959 static bool
5960 aarch64_tls_referenced_p (rtx x)
5961 {
5962 if (!TARGET_HAVE_TLS)
5963 return false;
5964 subrtx_iterator::array_type array;
5965 FOR_EACH_SUBRTX (iter, array, x, ALL)
5966 {
5967 const_rtx x = *iter;
5968 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5969 return true;
5970 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5971 TLS offsets, not real symbol references. */
5972 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5973 iter.skip_subrtxes ();
5974 }
5975 return false;
5976 }
5977
5978
5979 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5980 a left shift of 0 or 12 bits. */
5981 bool
5982 aarch64_uimm12_shift (HOST_WIDE_INT val)
5983 {
5984 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5985 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5986 );
5987 }
5988
5989 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
5990 that can be created with a left shift of 0 or 12. */
5991 static HOST_WIDE_INT
5992 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
5993 {
5994 /* Check to see if the value fits in 24 bits, as that is the maximum we can
5995 handle correctly. */
5996 gcc_assert ((val & 0xffffff) == val);
5997
5998 if (((val & 0xfff) << 0) == val)
5999 return val;
6000
6001 return val & (0xfff << 12);
6002 }
6003
6004 /* Return true if val is an immediate that can be loaded into a
6005 register by a MOVZ instruction. */
6006 static bool
6007 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6008 {
6009 if (GET_MODE_SIZE (mode) > 4)
6010 {
6011 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6012 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6013 return 1;
6014 }
6015 else
6016 {
6017 /* Ignore sign extension. */
6018 val &= (HOST_WIDE_INT) 0xffffffff;
6019 }
6020 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6021 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6022 }
6023
6024 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6025 64-bit (DImode) integer. */
6026
6027 static unsigned HOST_WIDE_INT
6028 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6029 {
6030 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6031 while (size < 64)
6032 {
6033 val &= (HOST_WIDE_INT_1U << size) - 1;
6034 val |= val << size;
6035 size *= 2;
6036 }
6037 return val;
6038 }
6039
6040 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6041
6042 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6043 {
6044 0x0000000100000001ull,
6045 0x0001000100010001ull,
6046 0x0101010101010101ull,
6047 0x1111111111111111ull,
6048 0x5555555555555555ull,
6049 };
6050
6051
6052 /* Return true if val is a valid bitmask immediate. */
6053
6054 bool
6055 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6056 {
6057 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6058 int bits;
6059
6060 /* Check for a single sequence of one bits and return quickly if so.
6061 The special cases of all ones and all zeroes returns false. */
6062 val = aarch64_replicate_bitmask_imm (val_in, mode);
6063 tmp = val + (val & -val);
6064
6065 if (tmp == (tmp & -tmp))
6066 return (val + 1) > 1;
6067
6068 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6069 if (mode == SImode)
6070 val = (val << 32) | (val & 0xffffffff);
6071
6072 /* Invert if the immediate doesn't start with a zero bit - this means we
6073 only need to search for sequences of one bits. */
6074 if (val & 1)
6075 val = ~val;
6076
6077 /* Find the first set bit and set tmp to val with the first sequence of one
6078 bits removed. Return success if there is a single sequence of ones. */
6079 first_one = val & -val;
6080 tmp = val & (val + first_one);
6081
6082 if (tmp == 0)
6083 return true;
6084
6085 /* Find the next set bit and compute the difference in bit position. */
6086 next_one = tmp & -tmp;
6087 bits = clz_hwi (first_one) - clz_hwi (next_one);
6088 mask = val ^ tmp;
6089
6090 /* Check the bit position difference is a power of 2, and that the first
6091 sequence of one bits fits within 'bits' bits. */
6092 if ((mask >> bits) != 0 || bits != (bits & -bits))
6093 return false;
6094
6095 /* Check the sequence of one bits is repeated 64/bits times. */
6096 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6097 }
6098
6099 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6100 Assumed precondition: VAL_IN Is not zero. */
6101
6102 unsigned HOST_WIDE_INT
6103 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6104 {
6105 int lowest_bit_set = ctz_hwi (val_in);
6106 int highest_bit_set = floor_log2 (val_in);
6107 gcc_assert (val_in != 0);
6108
6109 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6110 (HOST_WIDE_INT_1U << lowest_bit_set));
6111 }
6112
6113 /* Create constant where bits outside of lowest bit set to highest bit set
6114 are set to 1. */
6115
6116 unsigned HOST_WIDE_INT
6117 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6118 {
6119 return val_in | ~aarch64_and_split_imm1 (val_in);
6120 }
6121
6122 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6123
6124 bool
6125 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6126 {
6127 scalar_int_mode int_mode;
6128 if (!is_a <scalar_int_mode> (mode, &int_mode))
6129 return false;
6130
6131 if (aarch64_bitmask_imm (val_in, int_mode))
6132 return false;
6133
6134 if (aarch64_move_imm (val_in, int_mode))
6135 return false;
6136
6137 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6138
6139 return aarch64_bitmask_imm (imm2, int_mode);
6140 }
6141
6142 /* Return true if val is an immediate that can be loaded into a
6143 register in a single instruction. */
6144 bool
6145 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6146 {
6147 scalar_int_mode int_mode;
6148 if (!is_a <scalar_int_mode> (mode, &int_mode))
6149 return false;
6150
6151 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6152 return 1;
6153 return aarch64_bitmask_imm (val, int_mode);
6154 }
6155
6156 static bool
6157 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6158 {
6159 rtx base, offset;
6160
6161 if (GET_CODE (x) == HIGH)
6162 return true;
6163
6164 /* There's no way to calculate VL-based values using relocations. */
6165 subrtx_iterator::array_type array;
6166 FOR_EACH_SUBRTX (iter, array, x, ALL)
6167 if (GET_CODE (*iter) == CONST_POLY_INT)
6168 return true;
6169
6170 split_const (x, &base, &offset);
6171 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6172 {
6173 if (aarch64_classify_symbol (base, INTVAL (offset))
6174 != SYMBOL_FORCE_TO_MEM)
6175 return true;
6176 else
6177 /* Avoid generating a 64-bit relocation in ILP32; leave
6178 to aarch64_expand_mov_immediate to handle it properly. */
6179 return mode != ptr_mode;
6180 }
6181
6182 return aarch64_tls_referenced_p (x);
6183 }
6184
6185 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6186 The expansion for a table switch is quite expensive due to the number
6187 of instructions, the table lookup and hard to predict indirect jump.
6188 When optimizing for speed, and -O3 enabled, use the per-core tuning if
6189 set, otherwise use tables for > 16 cases as a tradeoff between size and
6190 performance. When optimizing for size, use the default setting. */
6191
6192 static unsigned int
6193 aarch64_case_values_threshold (void)
6194 {
6195 /* Use the specified limit for the number of cases before using jump
6196 tables at higher optimization levels. */
6197 if (optimize > 2
6198 && selected_cpu->tune->max_case_values != 0)
6199 return selected_cpu->tune->max_case_values;
6200 else
6201 return optimize_size ? default_case_values_threshold () : 17;
6202 }
6203
6204 /* Return true if register REGNO is a valid index register.
6205 STRICT_P is true if REG_OK_STRICT is in effect. */
6206
6207 bool
6208 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6209 {
6210 if (!HARD_REGISTER_NUM_P (regno))
6211 {
6212 if (!strict_p)
6213 return true;
6214
6215 if (!reg_renumber)
6216 return false;
6217
6218 regno = reg_renumber[regno];
6219 }
6220 return GP_REGNUM_P (regno);
6221 }
6222
6223 /* Return true if register REGNO is a valid base register for mode MODE.
6224 STRICT_P is true if REG_OK_STRICT is in effect. */
6225
6226 bool
6227 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6228 {
6229 if (!HARD_REGISTER_NUM_P (regno))
6230 {
6231 if (!strict_p)
6232 return true;
6233
6234 if (!reg_renumber)
6235 return false;
6236
6237 regno = reg_renumber[regno];
6238 }
6239
6240 /* The fake registers will be eliminated to either the stack or
6241 hard frame pointer, both of which are usually valid base registers.
6242 Reload deals with the cases where the eliminated form isn't valid. */
6243 return (GP_REGNUM_P (regno)
6244 || regno == SP_REGNUM
6245 || regno == FRAME_POINTER_REGNUM
6246 || regno == ARG_POINTER_REGNUM);
6247 }
6248
6249 /* Return true if X is a valid base register for mode MODE.
6250 STRICT_P is true if REG_OK_STRICT is in effect. */
6251
6252 static bool
6253 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6254 {
6255 if (!strict_p
6256 && GET_CODE (x) == SUBREG
6257 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6258 x = SUBREG_REG (x);
6259
6260 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6261 }
6262
6263 /* Return true if address offset is a valid index. If it is, fill in INFO
6264 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6265
6266 static bool
6267 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6268 machine_mode mode, bool strict_p)
6269 {
6270 enum aarch64_address_type type;
6271 rtx index;
6272 int shift;
6273
6274 /* (reg:P) */
6275 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6276 && GET_MODE (x) == Pmode)
6277 {
6278 type = ADDRESS_REG_REG;
6279 index = x;
6280 shift = 0;
6281 }
6282 /* (sign_extend:DI (reg:SI)) */
6283 else if ((GET_CODE (x) == SIGN_EXTEND
6284 || GET_CODE (x) == ZERO_EXTEND)
6285 && GET_MODE (x) == DImode
6286 && GET_MODE (XEXP (x, 0)) == SImode)
6287 {
6288 type = (GET_CODE (x) == SIGN_EXTEND)
6289 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6290 index = XEXP (x, 0);
6291 shift = 0;
6292 }
6293 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6294 else if (GET_CODE (x) == MULT
6295 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6296 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6297 && GET_MODE (XEXP (x, 0)) == DImode
6298 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6299 && CONST_INT_P (XEXP (x, 1)))
6300 {
6301 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6302 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6303 index = XEXP (XEXP (x, 0), 0);
6304 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6305 }
6306 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6307 else if (GET_CODE (x) == ASHIFT
6308 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6309 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6310 && GET_MODE (XEXP (x, 0)) == DImode
6311 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6312 && CONST_INT_P (XEXP (x, 1)))
6313 {
6314 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6315 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6316 index = XEXP (XEXP (x, 0), 0);
6317 shift = INTVAL (XEXP (x, 1));
6318 }
6319 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6320 else if ((GET_CODE (x) == SIGN_EXTRACT
6321 || GET_CODE (x) == ZERO_EXTRACT)
6322 && GET_MODE (x) == DImode
6323 && GET_CODE (XEXP (x, 0)) == MULT
6324 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6325 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6326 {
6327 type = (GET_CODE (x) == SIGN_EXTRACT)
6328 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6329 index = XEXP (XEXP (x, 0), 0);
6330 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6331 if (INTVAL (XEXP (x, 1)) != 32 + shift
6332 || INTVAL (XEXP (x, 2)) != 0)
6333 shift = -1;
6334 }
6335 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6336 (const_int 0xffffffff<<shift)) */
6337 else if (GET_CODE (x) == AND
6338 && GET_MODE (x) == DImode
6339 && GET_CODE (XEXP (x, 0)) == MULT
6340 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6341 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6342 && CONST_INT_P (XEXP (x, 1)))
6343 {
6344 type = ADDRESS_REG_UXTW;
6345 index = XEXP (XEXP (x, 0), 0);
6346 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6347 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6348 shift = -1;
6349 }
6350 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6351 else if ((GET_CODE (x) == SIGN_EXTRACT
6352 || GET_CODE (x) == ZERO_EXTRACT)
6353 && GET_MODE (x) == DImode
6354 && GET_CODE (XEXP (x, 0)) == ASHIFT
6355 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6356 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6357 {
6358 type = (GET_CODE (x) == SIGN_EXTRACT)
6359 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6360 index = XEXP (XEXP (x, 0), 0);
6361 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6362 if (INTVAL (XEXP (x, 1)) != 32 + shift
6363 || INTVAL (XEXP (x, 2)) != 0)
6364 shift = -1;
6365 }
6366 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6367 (const_int 0xffffffff<<shift)) */
6368 else if (GET_CODE (x) == AND
6369 && GET_MODE (x) == DImode
6370 && GET_CODE (XEXP (x, 0)) == ASHIFT
6371 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6372 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6373 && CONST_INT_P (XEXP (x, 1)))
6374 {
6375 type = ADDRESS_REG_UXTW;
6376 index = XEXP (XEXP (x, 0), 0);
6377 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6378 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6379 shift = -1;
6380 }
6381 /* (mult:P (reg:P) (const_int scale)) */
6382 else if (GET_CODE (x) == MULT
6383 && GET_MODE (x) == Pmode
6384 && GET_MODE (XEXP (x, 0)) == Pmode
6385 && CONST_INT_P (XEXP (x, 1)))
6386 {
6387 type = ADDRESS_REG_REG;
6388 index = XEXP (x, 0);
6389 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6390 }
6391 /* (ashift:P (reg:P) (const_int shift)) */
6392 else if (GET_CODE (x) == ASHIFT
6393 && GET_MODE (x) == Pmode
6394 && GET_MODE (XEXP (x, 0)) == Pmode
6395 && CONST_INT_P (XEXP (x, 1)))
6396 {
6397 type = ADDRESS_REG_REG;
6398 index = XEXP (x, 0);
6399 shift = INTVAL (XEXP (x, 1));
6400 }
6401 else
6402 return false;
6403
6404 if (!strict_p
6405 && GET_CODE (index) == SUBREG
6406 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6407 index = SUBREG_REG (index);
6408
6409 if (aarch64_sve_data_mode_p (mode))
6410 {
6411 if (type != ADDRESS_REG_REG
6412 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6413 return false;
6414 }
6415 else
6416 {
6417 if (shift != 0
6418 && !(IN_RANGE (shift, 1, 3)
6419 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6420 return false;
6421 }
6422
6423 if (REG_P (index)
6424 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6425 {
6426 info->type = type;
6427 info->offset = index;
6428 info->shift = shift;
6429 return true;
6430 }
6431
6432 return false;
6433 }
6434
6435 /* Return true if MODE is one of the modes for which we
6436 support LDP/STP operations. */
6437
6438 static bool
6439 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6440 {
6441 return mode == SImode || mode == DImode
6442 || mode == SFmode || mode == DFmode
6443 || (aarch64_vector_mode_supported_p (mode)
6444 && (known_eq (GET_MODE_SIZE (mode), 8)
6445 || (known_eq (GET_MODE_SIZE (mode), 16)
6446 && (aarch64_tune_params.extra_tuning_flags
6447 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6448 }
6449
6450 /* Return true if REGNO is a virtual pointer register, or an eliminable
6451 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6452 include stack_pointer or hard_frame_pointer. */
6453 static bool
6454 virt_or_elim_regno_p (unsigned regno)
6455 {
6456 return ((regno >= FIRST_VIRTUAL_REGISTER
6457 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6458 || regno == FRAME_POINTER_REGNUM
6459 || regno == ARG_POINTER_REGNUM);
6460 }
6461
6462 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6463 If it is, fill in INFO appropriately. STRICT_P is true if
6464 REG_OK_STRICT is in effect. */
6465
6466 bool
6467 aarch64_classify_address (struct aarch64_address_info *info,
6468 rtx x, machine_mode mode, bool strict_p,
6469 aarch64_addr_query_type type)
6470 {
6471 enum rtx_code code = GET_CODE (x);
6472 rtx op0, op1;
6473 poly_int64 offset;
6474
6475 HOST_WIDE_INT const_size;
6476
6477 /* On BE, we use load/store pair for all large int mode load/stores.
6478 TI/TFmode may also use a load/store pair. */
6479 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6480 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6481 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6482 || type == ADDR_QUERY_LDP_STP_N
6483 || mode == TImode
6484 || mode == TFmode
6485 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6486
6487 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
6488 corresponds to the actual size of the memory being loaded/stored and the
6489 mode of the corresponding addressing mode is half of that. */
6490 if (type == ADDR_QUERY_LDP_STP_N
6491 && known_eq (GET_MODE_SIZE (mode), 16))
6492 mode = DFmode;
6493
6494 bool allow_reg_index_p = (!load_store_pair_p
6495 && (known_lt (GET_MODE_SIZE (mode), 16)
6496 || vec_flags == VEC_ADVSIMD
6497 || vec_flags == VEC_SVE_DATA));
6498
6499 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6500 [Rn, #offset, MUL VL]. */
6501 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6502 && (code != REG && code != PLUS))
6503 return false;
6504
6505 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6506 REG addressing. */
6507 if (advsimd_struct_p
6508 && !BYTES_BIG_ENDIAN
6509 && (code != POST_INC && code != REG))
6510 return false;
6511
6512 gcc_checking_assert (GET_MODE (x) == VOIDmode
6513 || SCALAR_INT_MODE_P (GET_MODE (x)));
6514
6515 switch (code)
6516 {
6517 case REG:
6518 case SUBREG:
6519 info->type = ADDRESS_REG_IMM;
6520 info->base = x;
6521 info->offset = const0_rtx;
6522 info->const_offset = 0;
6523 return aarch64_base_register_rtx_p (x, strict_p);
6524
6525 case PLUS:
6526 op0 = XEXP (x, 0);
6527 op1 = XEXP (x, 1);
6528
6529 if (! strict_p
6530 && REG_P (op0)
6531 && virt_or_elim_regno_p (REGNO (op0))
6532 && poly_int_rtx_p (op1, &offset))
6533 {
6534 info->type = ADDRESS_REG_IMM;
6535 info->base = op0;
6536 info->offset = op1;
6537 info->const_offset = offset;
6538
6539 return true;
6540 }
6541
6542 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6543 && aarch64_base_register_rtx_p (op0, strict_p)
6544 && poly_int_rtx_p (op1, &offset))
6545 {
6546 info->type = ADDRESS_REG_IMM;
6547 info->base = op0;
6548 info->offset = op1;
6549 info->const_offset = offset;
6550
6551 /* TImode and TFmode values are allowed in both pairs of X
6552 registers and individual Q registers. The available
6553 address modes are:
6554 X,X: 7-bit signed scaled offset
6555 Q: 9-bit signed offset
6556 We conservatively require an offset representable in either mode.
6557 When performing the check for pairs of X registers i.e. LDP/STP
6558 pass down DImode since that is the natural size of the LDP/STP
6559 instruction memory accesses. */
6560 if (mode == TImode || mode == TFmode)
6561 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6562 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6563 || offset_12bit_unsigned_scaled_p (mode, offset)));
6564
6565 /* A 7bit offset check because OImode will emit a ldp/stp
6566 instruction (only big endian will get here).
6567 For ldp/stp instructions, the offset is scaled for the size of a
6568 single element of the pair. */
6569 if (mode == OImode)
6570 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6571
6572 /* Three 9/12 bit offsets checks because CImode will emit three
6573 ldr/str instructions (only big endian will get here). */
6574 if (mode == CImode)
6575 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6576 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6577 offset + 32)
6578 || offset_12bit_unsigned_scaled_p (V16QImode,
6579 offset + 32)));
6580
6581 /* Two 7bit offsets checks because XImode will emit two ldp/stp
6582 instructions (only big endian will get here). */
6583 if (mode == XImode)
6584 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6585 && aarch64_offset_7bit_signed_scaled_p (TImode,
6586 offset + 32));
6587
6588 /* Make "m" use the LD1 offset range for SVE data modes, so
6589 that pre-RTL optimizers like ivopts will work to that
6590 instead of the wider LDR/STR range. */
6591 if (vec_flags == VEC_SVE_DATA)
6592 return (type == ADDR_QUERY_M
6593 ? offset_4bit_signed_scaled_p (mode, offset)
6594 : offset_9bit_signed_scaled_p (mode, offset));
6595
6596 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6597 {
6598 poly_int64 end_offset = (offset
6599 + GET_MODE_SIZE (mode)
6600 - BYTES_PER_SVE_VECTOR);
6601 return (type == ADDR_QUERY_M
6602 ? offset_4bit_signed_scaled_p (mode, offset)
6603 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6604 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6605 end_offset)));
6606 }
6607
6608 if (vec_flags == VEC_SVE_PRED)
6609 return offset_9bit_signed_scaled_p (mode, offset);
6610
6611 if (load_store_pair_p)
6612 return ((known_eq (GET_MODE_SIZE (mode), 4)
6613 || known_eq (GET_MODE_SIZE (mode), 8)
6614 || known_eq (GET_MODE_SIZE (mode), 16))
6615 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6616 else
6617 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6618 || offset_12bit_unsigned_scaled_p (mode, offset));
6619 }
6620
6621 if (allow_reg_index_p)
6622 {
6623 /* Look for base + (scaled/extended) index register. */
6624 if (aarch64_base_register_rtx_p (op0, strict_p)
6625 && aarch64_classify_index (info, op1, mode, strict_p))
6626 {
6627 info->base = op0;
6628 return true;
6629 }
6630 if (aarch64_base_register_rtx_p (op1, strict_p)
6631 && aarch64_classify_index (info, op0, mode, strict_p))
6632 {
6633 info->base = op1;
6634 return true;
6635 }
6636 }
6637
6638 return false;
6639
6640 case POST_INC:
6641 case POST_DEC:
6642 case PRE_INC:
6643 case PRE_DEC:
6644 info->type = ADDRESS_REG_WB;
6645 info->base = XEXP (x, 0);
6646 info->offset = NULL_RTX;
6647 return aarch64_base_register_rtx_p (info->base, strict_p);
6648
6649 case POST_MODIFY:
6650 case PRE_MODIFY:
6651 info->type = ADDRESS_REG_WB;
6652 info->base = XEXP (x, 0);
6653 if (GET_CODE (XEXP (x, 1)) == PLUS
6654 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6655 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6656 && aarch64_base_register_rtx_p (info->base, strict_p))
6657 {
6658 info->offset = XEXP (XEXP (x, 1), 1);
6659 info->const_offset = offset;
6660
6661 /* TImode and TFmode values are allowed in both pairs of X
6662 registers and individual Q registers. The available
6663 address modes are:
6664 X,X: 7-bit signed scaled offset
6665 Q: 9-bit signed offset
6666 We conservatively require an offset representable in either mode.
6667 */
6668 if (mode == TImode || mode == TFmode)
6669 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6670 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6671
6672 if (load_store_pair_p)
6673 return ((known_eq (GET_MODE_SIZE (mode), 4)
6674 || known_eq (GET_MODE_SIZE (mode), 8)
6675 || known_eq (GET_MODE_SIZE (mode), 16))
6676 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6677 else
6678 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6679 }
6680 return false;
6681
6682 case CONST:
6683 case SYMBOL_REF:
6684 case LABEL_REF:
6685 /* load literal: pc-relative constant pool entry. Only supported
6686 for SI mode or larger. */
6687 info->type = ADDRESS_SYMBOLIC;
6688
6689 if (!load_store_pair_p
6690 && GET_MODE_SIZE (mode).is_constant (&const_size)
6691 && const_size >= 4)
6692 {
6693 rtx sym, addend;
6694
6695 split_const (x, &sym, &addend);
6696 return ((GET_CODE (sym) == LABEL_REF
6697 || (GET_CODE (sym) == SYMBOL_REF
6698 && CONSTANT_POOL_ADDRESS_P (sym)
6699 && aarch64_pcrelative_literal_loads)));
6700 }
6701 return false;
6702
6703 case LO_SUM:
6704 info->type = ADDRESS_LO_SUM;
6705 info->base = XEXP (x, 0);
6706 info->offset = XEXP (x, 1);
6707 if (allow_reg_index_p
6708 && aarch64_base_register_rtx_p (info->base, strict_p))
6709 {
6710 rtx sym, offs;
6711 split_const (info->offset, &sym, &offs);
6712 if (GET_CODE (sym) == SYMBOL_REF
6713 && (aarch64_classify_symbol (sym, INTVAL (offs))
6714 == SYMBOL_SMALL_ABSOLUTE))
6715 {
6716 /* The symbol and offset must be aligned to the access size. */
6717 unsigned int align;
6718
6719 if (CONSTANT_POOL_ADDRESS_P (sym))
6720 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6721 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6722 {
6723 tree exp = SYMBOL_REF_DECL (sym);
6724 align = TYPE_ALIGN (TREE_TYPE (exp));
6725 align = aarch64_constant_alignment (exp, align);
6726 }
6727 else if (SYMBOL_REF_DECL (sym))
6728 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6729 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6730 && SYMBOL_REF_BLOCK (sym) != NULL)
6731 align = SYMBOL_REF_BLOCK (sym)->alignment;
6732 else
6733 align = BITS_PER_UNIT;
6734
6735 poly_int64 ref_size = GET_MODE_SIZE (mode);
6736 if (known_eq (ref_size, 0))
6737 ref_size = GET_MODE_SIZE (DImode);
6738
6739 return (multiple_p (INTVAL (offs), ref_size)
6740 && multiple_p (align / BITS_PER_UNIT, ref_size));
6741 }
6742 }
6743 return false;
6744
6745 default:
6746 return false;
6747 }
6748 }
6749
6750 /* Return true if the address X is valid for a PRFM instruction.
6751 STRICT_P is true if we should do strict checking with
6752 aarch64_classify_address. */
6753
6754 bool
6755 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6756 {
6757 struct aarch64_address_info addr;
6758
6759 /* PRFM accepts the same addresses as DImode... */
6760 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6761 if (!res)
6762 return false;
6763
6764 /* ... except writeback forms. */
6765 return addr.type != ADDRESS_REG_WB;
6766 }
6767
6768 bool
6769 aarch64_symbolic_address_p (rtx x)
6770 {
6771 rtx offset;
6772
6773 split_const (x, &x, &offset);
6774 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6775 }
6776
6777 /* Classify the base of symbolic expression X. */
6778
6779 enum aarch64_symbol_type
6780 aarch64_classify_symbolic_expression (rtx x)
6781 {
6782 rtx offset;
6783
6784 split_const (x, &x, &offset);
6785 return aarch64_classify_symbol (x, INTVAL (offset));
6786 }
6787
6788
6789 /* Return TRUE if X is a legitimate address for accessing memory in
6790 mode MODE. */
6791 static bool
6792 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6793 {
6794 struct aarch64_address_info addr;
6795
6796 return aarch64_classify_address (&addr, x, mode, strict_p);
6797 }
6798
6799 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6800 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6801 bool
6802 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6803 aarch64_addr_query_type type)
6804 {
6805 struct aarch64_address_info addr;
6806
6807 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6808 }
6809
6810 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6811
6812 static bool
6813 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6814 poly_int64 orig_offset,
6815 machine_mode mode)
6816 {
6817 HOST_WIDE_INT size;
6818 if (GET_MODE_SIZE (mode).is_constant (&size))
6819 {
6820 HOST_WIDE_INT const_offset, second_offset;
6821
6822 /* A general SVE offset is A * VQ + B. Remove the A component from
6823 coefficient 0 in order to get the constant B. */
6824 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6825
6826 /* Split an out-of-range address displacement into a base and
6827 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6828 range otherwise to increase opportunities for sharing the base
6829 address of different sizes. Unaligned accesses use the signed
6830 9-bit range, TImode/TFmode use the intersection of signed
6831 scaled 7-bit and signed 9-bit offset. */
6832 if (mode == TImode || mode == TFmode)
6833 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6834 else if ((const_offset & (size - 1)) != 0)
6835 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6836 else
6837 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6838
6839 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6840 return false;
6841
6842 /* Split the offset into second_offset and the rest. */
6843 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6844 *offset2 = gen_int_mode (second_offset, Pmode);
6845 return true;
6846 }
6847 else
6848 {
6849 /* Get the mode we should use as the basis of the range. For structure
6850 modes this is the mode of one vector. */
6851 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6852 machine_mode step_mode
6853 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6854
6855 /* Get the "mul vl" multiplier we'd like to use. */
6856 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6857 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6858 if (vec_flags & VEC_SVE_DATA)
6859 /* LDR supports a 9-bit range, but the move patterns for
6860 structure modes require all vectors to be in range of the
6861 same base. The simplest way of accomodating that while still
6862 promoting reuse of anchor points between different modes is
6863 to use an 8-bit range unconditionally. */
6864 vnum = ((vnum + 128) & 255) - 128;
6865 else
6866 /* Predicates are only handled singly, so we might as well use
6867 the full range. */
6868 vnum = ((vnum + 256) & 511) - 256;
6869 if (vnum == 0)
6870 return false;
6871
6872 /* Convert the "mul vl" multiplier into a byte offset. */
6873 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6874 if (known_eq (second_offset, orig_offset))
6875 return false;
6876
6877 /* Split the offset into second_offset and the rest. */
6878 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6879 *offset2 = gen_int_mode (second_offset, Pmode);
6880 return true;
6881 }
6882 }
6883
6884 /* Return the binary representation of floating point constant VALUE in INTVAL.
6885 If the value cannot be converted, return false without setting INTVAL.
6886 The conversion is done in the given MODE. */
6887 bool
6888 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6889 {
6890
6891 /* We make a general exception for 0. */
6892 if (aarch64_float_const_zero_rtx_p (value))
6893 {
6894 *intval = 0;
6895 return true;
6896 }
6897
6898 scalar_float_mode mode;
6899 if (GET_CODE (value) != CONST_DOUBLE
6900 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6901 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6902 /* Only support up to DF mode. */
6903 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6904 return false;
6905
6906 unsigned HOST_WIDE_INT ival = 0;
6907
6908 long res[2];
6909 real_to_target (res,
6910 CONST_DOUBLE_REAL_VALUE (value),
6911 REAL_MODE_FORMAT (mode));
6912
6913 if (mode == DFmode)
6914 {
6915 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6916 ival = zext_hwi (res[order], 32);
6917 ival |= (zext_hwi (res[1 - order], 32) << 32);
6918 }
6919 else
6920 ival = zext_hwi (res[0], 32);
6921
6922 *intval = ival;
6923 return true;
6924 }
6925
6926 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6927 single MOV(+MOVK) followed by an FMOV. */
6928 bool
6929 aarch64_float_const_rtx_p (rtx x)
6930 {
6931 machine_mode mode = GET_MODE (x);
6932 if (mode == VOIDmode)
6933 return false;
6934
6935 /* Determine whether it's cheaper to write float constants as
6936 mov/movk pairs over ldr/adrp pairs. */
6937 unsigned HOST_WIDE_INT ival;
6938
6939 if (GET_CODE (x) == CONST_DOUBLE
6940 && SCALAR_FLOAT_MODE_P (mode)
6941 && aarch64_reinterpret_float_as_int (x, &ival))
6942 {
6943 scalar_int_mode imode = (mode == HFmode
6944 ? SImode
6945 : int_mode_for_mode (mode).require ());
6946 int num_instr = aarch64_internal_mov_immediate
6947 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6948 return num_instr < 3;
6949 }
6950
6951 return false;
6952 }
6953
6954 /* Return TRUE if rtx X is immediate constant 0.0 */
6955 bool
6956 aarch64_float_const_zero_rtx_p (rtx x)
6957 {
6958 if (GET_MODE (x) == VOIDmode)
6959 return false;
6960
6961 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6962 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6963 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6964 }
6965
6966 /* Return TRUE if rtx X is immediate constant that fits in a single
6967 MOVI immediate operation. */
6968 bool
6969 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6970 {
6971 if (!TARGET_SIMD)
6972 return false;
6973
6974 machine_mode vmode;
6975 scalar_int_mode imode;
6976 unsigned HOST_WIDE_INT ival;
6977
6978 if (GET_CODE (x) == CONST_DOUBLE
6979 && SCALAR_FLOAT_MODE_P (mode))
6980 {
6981 if (!aarch64_reinterpret_float_as_int (x, &ival))
6982 return false;
6983
6984 /* We make a general exception for 0. */
6985 if (aarch64_float_const_zero_rtx_p (x))
6986 return true;
6987
6988 imode = int_mode_for_mode (mode).require ();
6989 }
6990 else if (GET_CODE (x) == CONST_INT
6991 && is_a <scalar_int_mode> (mode, &imode))
6992 ival = INTVAL (x);
6993 else
6994 return false;
6995
6996 /* use a 64 bit mode for everything except for DI/DF mode, where we use
6997 a 128 bit vector mode. */
6998 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6999
7000 vmode = aarch64_simd_container_mode (imode, width);
7001 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7002
7003 return aarch64_simd_valid_immediate (v_op, NULL);
7004 }
7005
7006
7007 /* Return the fixed registers used for condition codes. */
7008
7009 static bool
7010 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7011 {
7012 *p1 = CC_REGNUM;
7013 *p2 = INVALID_REGNUM;
7014 return true;
7015 }
7016
7017 /* This function is used by the call expanders of the machine description.
7018 RESULT is the register in which the result is returned. It's NULL for
7019 "call" and "sibcall".
7020 MEM is the location of the function call.
7021 SIBCALL indicates whether this function call is normal call or sibling call.
7022 It will generate different pattern accordingly. */
7023
7024 void
7025 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7026 {
7027 rtx call, callee, tmp;
7028 rtvec vec;
7029 machine_mode mode;
7030
7031 gcc_assert (MEM_P (mem));
7032 callee = XEXP (mem, 0);
7033 mode = GET_MODE (callee);
7034 gcc_assert (mode == Pmode);
7035
7036 /* Decide if we should generate indirect calls by loading the
7037 address of the callee into a register before performing
7038 the branch-and-link. */
7039 if (SYMBOL_REF_P (callee)
7040 ? (aarch64_is_long_call_p (callee)
7041 || aarch64_is_noplt_call_p (callee))
7042 : !REG_P (callee))
7043 XEXP (mem, 0) = force_reg (mode, callee);
7044
7045 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7046
7047 if (result != NULL_RTX)
7048 call = gen_rtx_SET (result, call);
7049
7050 if (sibcall)
7051 tmp = ret_rtx;
7052 else
7053 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7054
7055 vec = gen_rtvec (2, call, tmp);
7056 call = gen_rtx_PARALLEL (VOIDmode, vec);
7057
7058 aarch64_emit_call_insn (call);
7059 }
7060
7061 /* Emit call insn with PAT and do aarch64-specific handling. */
7062
7063 void
7064 aarch64_emit_call_insn (rtx pat)
7065 {
7066 rtx insn = emit_call_insn (pat);
7067
7068 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7069 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7070 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7071 }
7072
7073 machine_mode
7074 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7075 {
7076 /* All floating point compares return CCFP if it is an equality
7077 comparison, and CCFPE otherwise. */
7078 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
7079 {
7080 switch (code)
7081 {
7082 case EQ:
7083 case NE:
7084 case UNORDERED:
7085 case ORDERED:
7086 case UNLT:
7087 case UNLE:
7088 case UNGT:
7089 case UNGE:
7090 case UNEQ:
7091 return CCFPmode;
7092
7093 case LT:
7094 case LE:
7095 case GT:
7096 case GE:
7097 case LTGT:
7098 return CCFPEmode;
7099
7100 default:
7101 gcc_unreachable ();
7102 }
7103 }
7104
7105 /* Equality comparisons of short modes against zero can be performed
7106 using the TST instruction with the appropriate bitmask. */
7107 if (y == const0_rtx && REG_P (x)
7108 && (code == EQ || code == NE)
7109 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
7110 return CC_NZmode;
7111
7112 /* Similarly, comparisons of zero_extends from shorter modes can
7113 be performed using an ANDS with an immediate mask. */
7114 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
7115 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7116 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7117 && (code == EQ || code == NE))
7118 return CC_NZmode;
7119
7120 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7121 && y == const0_rtx
7122 && (code == EQ || code == NE || code == LT || code == GE)
7123 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
7124 || GET_CODE (x) == NEG
7125 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7126 && CONST_INT_P (XEXP (x, 2)))))
7127 return CC_NZmode;
7128
7129 /* A compare with a shifted operand. Because of canonicalization,
7130 the comparison will have to be swapped when we emit the assembly
7131 code. */
7132 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7133 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7134 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
7135 || GET_CODE (x) == LSHIFTRT
7136 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
7137 return CC_SWPmode;
7138
7139 /* Similarly for a negated operand, but we can only do this for
7140 equalities. */
7141 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
7142 && (REG_P (y) || GET_CODE (y) == SUBREG)
7143 && (code == EQ || code == NE)
7144 && GET_CODE (x) == NEG)
7145 return CC_Zmode;
7146
7147 /* A test for unsigned overflow. */
7148 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
7149 && code == NE
7150 && GET_CODE (x) == PLUS
7151 && GET_CODE (y) == ZERO_EXTEND)
7152 return CC_Cmode;
7153
7154 /* A test for signed overflow. */
7155 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
7156 && code == NE
7157 && GET_CODE (x) == PLUS
7158 && GET_CODE (y) == SIGN_EXTEND)
7159 return CC_Vmode;
7160
7161 /* For everything else, return CCmode. */
7162 return CCmode;
7163 }
7164
7165 static int
7166 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7167
7168 int
7169 aarch64_get_condition_code (rtx x)
7170 {
7171 machine_mode mode = GET_MODE (XEXP (x, 0));
7172 enum rtx_code comp_code = GET_CODE (x);
7173
7174 if (GET_MODE_CLASS (mode) != MODE_CC)
7175 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7176 return aarch64_get_condition_code_1 (mode, comp_code);
7177 }
7178
7179 static int
7180 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7181 {
7182 switch (mode)
7183 {
7184 case E_CCFPmode:
7185 case E_CCFPEmode:
7186 switch (comp_code)
7187 {
7188 case GE: return AARCH64_GE;
7189 case GT: return AARCH64_GT;
7190 case LE: return AARCH64_LS;
7191 case LT: return AARCH64_MI;
7192 case NE: return AARCH64_NE;
7193 case EQ: return AARCH64_EQ;
7194 case ORDERED: return AARCH64_VC;
7195 case UNORDERED: return AARCH64_VS;
7196 case UNLT: return AARCH64_LT;
7197 case UNLE: return AARCH64_LE;
7198 case UNGT: return AARCH64_HI;
7199 case UNGE: return AARCH64_PL;
7200 default: return -1;
7201 }
7202 break;
7203
7204 case E_CCmode:
7205 switch (comp_code)
7206 {
7207 case NE: return AARCH64_NE;
7208 case EQ: return AARCH64_EQ;
7209 case GE: return AARCH64_GE;
7210 case GT: return AARCH64_GT;
7211 case LE: return AARCH64_LE;
7212 case LT: return AARCH64_LT;
7213 case GEU: return AARCH64_CS;
7214 case GTU: return AARCH64_HI;
7215 case LEU: return AARCH64_LS;
7216 case LTU: return AARCH64_CC;
7217 default: return -1;
7218 }
7219 break;
7220
7221 case E_CC_SWPmode:
7222 switch (comp_code)
7223 {
7224 case NE: return AARCH64_NE;
7225 case EQ: return AARCH64_EQ;
7226 case GE: return AARCH64_LE;
7227 case GT: return AARCH64_LT;
7228 case LE: return AARCH64_GE;
7229 case LT: return AARCH64_GT;
7230 case GEU: return AARCH64_LS;
7231 case GTU: return AARCH64_CC;
7232 case LEU: return AARCH64_CS;
7233 case LTU: return AARCH64_HI;
7234 default: return -1;
7235 }
7236 break;
7237
7238 case E_CC_NZmode:
7239 switch (comp_code)
7240 {
7241 case NE: return AARCH64_NE;
7242 case EQ: return AARCH64_EQ;
7243 case GE: return AARCH64_PL;
7244 case LT: return AARCH64_MI;
7245 default: return -1;
7246 }
7247 break;
7248
7249 case E_CC_Zmode:
7250 switch (comp_code)
7251 {
7252 case NE: return AARCH64_NE;
7253 case EQ: return AARCH64_EQ;
7254 default: return -1;
7255 }
7256 break;
7257
7258 case E_CC_Cmode:
7259 switch (comp_code)
7260 {
7261 case NE: return AARCH64_CS;
7262 case EQ: return AARCH64_CC;
7263 default: return -1;
7264 }
7265 break;
7266
7267 case E_CC_Vmode:
7268 switch (comp_code)
7269 {
7270 case NE: return AARCH64_VS;
7271 case EQ: return AARCH64_VC;
7272 default: return -1;
7273 }
7274 break;
7275
7276 default:
7277 return -1;
7278 }
7279
7280 return -1;
7281 }
7282
7283 bool
7284 aarch64_const_vec_all_same_in_range_p (rtx x,
7285 HOST_WIDE_INT minval,
7286 HOST_WIDE_INT maxval)
7287 {
7288 rtx elt;
7289 return (const_vec_duplicate_p (x, &elt)
7290 && CONST_INT_P (elt)
7291 && IN_RANGE (INTVAL (elt), minval, maxval));
7292 }
7293
7294 bool
7295 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7296 {
7297 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7298 }
7299
7300 /* Return true if VEC is a constant in which every element is in the range
7301 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7302
7303 static bool
7304 aarch64_const_vec_all_in_range_p (rtx vec,
7305 HOST_WIDE_INT minval,
7306 HOST_WIDE_INT maxval)
7307 {
7308 if (GET_CODE (vec) != CONST_VECTOR
7309 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7310 return false;
7311
7312 int nunits;
7313 if (!CONST_VECTOR_STEPPED_P (vec))
7314 nunits = const_vector_encoded_nelts (vec);
7315 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7316 return false;
7317
7318 for (int i = 0; i < nunits; i++)
7319 {
7320 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7321 if (!CONST_INT_P (vec_elem)
7322 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7323 return false;
7324 }
7325 return true;
7326 }
7327
7328 /* N Z C V. */
7329 #define AARCH64_CC_V 1
7330 #define AARCH64_CC_C (1 << 1)
7331 #define AARCH64_CC_Z (1 << 2)
7332 #define AARCH64_CC_N (1 << 3)
7333
7334 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7335 static const int aarch64_nzcv_codes[] =
7336 {
7337 0, /* EQ, Z == 1. */
7338 AARCH64_CC_Z, /* NE, Z == 0. */
7339 0, /* CS, C == 1. */
7340 AARCH64_CC_C, /* CC, C == 0. */
7341 0, /* MI, N == 1. */
7342 AARCH64_CC_N, /* PL, N == 0. */
7343 0, /* VS, V == 1. */
7344 AARCH64_CC_V, /* VC, V == 0. */
7345 0, /* HI, C ==1 && Z == 0. */
7346 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7347 AARCH64_CC_V, /* GE, N == V. */
7348 0, /* LT, N != V. */
7349 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7350 0, /* LE, !(Z == 0 && N == V). */
7351 0, /* AL, Any. */
7352 0 /* NV, Any. */
7353 };
7354
7355 /* Print floating-point vector immediate operand X to F, negating it
7356 first if NEGATE is true. Return true on success, false if it isn't
7357 a constant we can handle. */
7358
7359 static bool
7360 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7361 {
7362 rtx elt;
7363
7364 if (!const_vec_duplicate_p (x, &elt))
7365 return false;
7366
7367 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7368 if (negate)
7369 r = real_value_negate (&r);
7370
7371 /* We only handle the SVE single-bit immediates here. */
7372 if (real_equal (&r, &dconst0))
7373 asm_fprintf (f, "0.0");
7374 else if (real_equal (&r, &dconst1))
7375 asm_fprintf (f, "1.0");
7376 else if (real_equal (&r, &dconsthalf))
7377 asm_fprintf (f, "0.5");
7378 else
7379 return false;
7380
7381 return true;
7382 }
7383
7384 /* Return the equivalent letter for size. */
7385 static char
7386 sizetochar (int size)
7387 {
7388 switch (size)
7389 {
7390 case 64: return 'd';
7391 case 32: return 's';
7392 case 16: return 'h';
7393 case 8 : return 'b';
7394 default: gcc_unreachable ();
7395 }
7396 }
7397
7398 /* Print operand X to file F in a target specific manner according to CODE.
7399 The acceptable formatting commands given by CODE are:
7400 'c': An integer or symbol address without a preceding #
7401 sign.
7402 'C': Take the duplicated element in a vector constant
7403 and print it in hex.
7404 'D': Take the duplicated element in a vector constant
7405 and print it as an unsigned integer, in decimal.
7406 'e': Print the sign/zero-extend size as a character 8->b,
7407 16->h, 32->w.
7408 'p': Prints N such that 2^N == X (X must be power of 2 and
7409 const int).
7410 'P': Print the number of non-zero bits in X (a const_int).
7411 'H': Print the higher numbered register of a pair (TImode)
7412 of regs.
7413 'm': Print a condition (eq, ne, etc).
7414 'M': Same as 'm', but invert condition.
7415 'N': Take the duplicated element in a vector constant
7416 and print the negative of it in decimal.
7417 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7418 'S/T/U/V': Print a FP/SIMD register name for a register list.
7419 The register printed is the FP/SIMD register name
7420 of X + 0/1/2/3 for S/T/U/V.
7421 'R': Print a scalar FP/SIMD register name + 1.
7422 'X': Print bottom 16 bits of integer constant in hex.
7423 'w/x': Print a general register name or the zero register
7424 (32-bit or 64-bit).
7425 '0': Print a normal operand, if it's a general register,
7426 then we assume DImode.
7427 'k': Print NZCV for conditional compare instructions.
7428 'A': Output address constant representing the first
7429 argument of X, specifying a relocation offset
7430 if appropriate.
7431 'L': Output constant address specified by X
7432 with a relocation offset if appropriate.
7433 'G': Prints address of X, specifying a PC relative
7434 relocation mode if appropriate.
7435 'y': Output address of LDP or STP - this is used for
7436 some LDP/STPs which don't use a PARALLEL in their
7437 pattern (so the mode needs to be adjusted).
7438 'z': Output address of a typical LDP or STP. */
7439
7440 static void
7441 aarch64_print_operand (FILE *f, rtx x, int code)
7442 {
7443 rtx elt;
7444 switch (code)
7445 {
7446 case 'c':
7447 switch (GET_CODE (x))
7448 {
7449 case CONST_INT:
7450 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7451 break;
7452
7453 case SYMBOL_REF:
7454 output_addr_const (f, x);
7455 break;
7456
7457 case CONST:
7458 if (GET_CODE (XEXP (x, 0)) == PLUS
7459 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7460 {
7461 output_addr_const (f, x);
7462 break;
7463 }
7464 /* Fall through. */
7465
7466 default:
7467 output_operand_lossage ("unsupported operand for code '%c'", code);
7468 }
7469 break;
7470
7471 case 'e':
7472 {
7473 int n;
7474
7475 if (!CONST_INT_P (x)
7476 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7477 {
7478 output_operand_lossage ("invalid operand for '%%%c'", code);
7479 return;
7480 }
7481
7482 switch (n)
7483 {
7484 case 3:
7485 fputc ('b', f);
7486 break;
7487 case 4:
7488 fputc ('h', f);
7489 break;
7490 case 5:
7491 fputc ('w', f);
7492 break;
7493 default:
7494 output_operand_lossage ("invalid operand for '%%%c'", code);
7495 return;
7496 }
7497 }
7498 break;
7499
7500 case 'p':
7501 {
7502 int n;
7503
7504 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7505 {
7506 output_operand_lossage ("invalid operand for '%%%c'", code);
7507 return;
7508 }
7509
7510 asm_fprintf (f, "%d", n);
7511 }
7512 break;
7513
7514 case 'P':
7515 if (!CONST_INT_P (x))
7516 {
7517 output_operand_lossage ("invalid operand for '%%%c'", code);
7518 return;
7519 }
7520
7521 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7522 break;
7523
7524 case 'H':
7525 if (x == const0_rtx)
7526 {
7527 asm_fprintf (f, "xzr");
7528 break;
7529 }
7530
7531 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7532 {
7533 output_operand_lossage ("invalid operand for '%%%c'", code);
7534 return;
7535 }
7536
7537 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7538 break;
7539
7540 case 'M':
7541 case 'm':
7542 {
7543 int cond_code;
7544 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7545 if (x == const_true_rtx)
7546 {
7547 if (code == 'M')
7548 fputs ("nv", f);
7549 return;
7550 }
7551
7552 if (!COMPARISON_P (x))
7553 {
7554 output_operand_lossage ("invalid operand for '%%%c'", code);
7555 return;
7556 }
7557
7558 cond_code = aarch64_get_condition_code (x);
7559 gcc_assert (cond_code >= 0);
7560 if (code == 'M')
7561 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7562 fputs (aarch64_condition_codes[cond_code], f);
7563 }
7564 break;
7565
7566 case 'N':
7567 if (!const_vec_duplicate_p (x, &elt))
7568 {
7569 output_operand_lossage ("invalid vector constant");
7570 return;
7571 }
7572
7573 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7574 asm_fprintf (f, "%wd", -INTVAL (elt));
7575 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7576 && aarch64_print_vector_float_operand (f, x, true))
7577 ;
7578 else
7579 {
7580 output_operand_lossage ("invalid vector constant");
7581 return;
7582 }
7583 break;
7584
7585 case 'b':
7586 case 'h':
7587 case 's':
7588 case 'd':
7589 case 'q':
7590 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7591 {
7592 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7593 return;
7594 }
7595 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7596 break;
7597
7598 case 'S':
7599 case 'T':
7600 case 'U':
7601 case 'V':
7602 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7603 {
7604 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7605 return;
7606 }
7607 asm_fprintf (f, "%c%d",
7608 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7609 REGNO (x) - V0_REGNUM + (code - 'S'));
7610 break;
7611
7612 case 'R':
7613 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7614 {
7615 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7616 return;
7617 }
7618 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7619 break;
7620
7621 case 'X':
7622 if (!CONST_INT_P (x))
7623 {
7624 output_operand_lossage ("invalid operand for '%%%c'", code);
7625 return;
7626 }
7627 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7628 break;
7629
7630 case 'C':
7631 {
7632 /* Print a replicated constant in hex. */
7633 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7634 {
7635 output_operand_lossage ("invalid operand for '%%%c'", code);
7636 return;
7637 }
7638 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7639 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7640 }
7641 break;
7642
7643 case 'D':
7644 {
7645 /* Print a replicated constant in decimal, treating it as
7646 unsigned. */
7647 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7648 {
7649 output_operand_lossage ("invalid operand for '%%%c'", code);
7650 return;
7651 }
7652 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7653 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7654 }
7655 break;
7656
7657 case 'w':
7658 case 'x':
7659 if (x == const0_rtx
7660 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7661 {
7662 asm_fprintf (f, "%czr", code);
7663 break;
7664 }
7665
7666 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7667 {
7668 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7669 break;
7670 }
7671
7672 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7673 {
7674 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7675 break;
7676 }
7677
7678 /* Fall through */
7679
7680 case 0:
7681 if (x == NULL)
7682 {
7683 output_operand_lossage ("missing operand");
7684 return;
7685 }
7686
7687 switch (GET_CODE (x))
7688 {
7689 case REG:
7690 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7691 {
7692 if (REG_NREGS (x) == 1)
7693 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7694 else
7695 {
7696 char suffix
7697 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7698 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7699 REGNO (x) - V0_REGNUM, suffix,
7700 END_REGNO (x) - V0_REGNUM - 1, suffix);
7701 }
7702 }
7703 else
7704 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7705 break;
7706
7707 case MEM:
7708 output_address (GET_MODE (x), XEXP (x, 0));
7709 break;
7710
7711 case LABEL_REF:
7712 case SYMBOL_REF:
7713 output_addr_const (asm_out_file, x);
7714 break;
7715
7716 case CONST_INT:
7717 asm_fprintf (f, "%wd", INTVAL (x));
7718 break;
7719
7720 case CONST:
7721 if (!VECTOR_MODE_P (GET_MODE (x)))
7722 {
7723 output_addr_const (asm_out_file, x);
7724 break;
7725 }
7726 /* fall through */
7727
7728 case CONST_VECTOR:
7729 if (!const_vec_duplicate_p (x, &elt))
7730 {
7731 output_operand_lossage ("invalid vector constant");
7732 return;
7733 }
7734
7735 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7736 asm_fprintf (f, "%wd", INTVAL (elt));
7737 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7738 && aarch64_print_vector_float_operand (f, x, false))
7739 ;
7740 else
7741 {
7742 output_operand_lossage ("invalid vector constant");
7743 return;
7744 }
7745 break;
7746
7747 case CONST_DOUBLE:
7748 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7749 be getting CONST_DOUBLEs holding integers. */
7750 gcc_assert (GET_MODE (x) != VOIDmode);
7751 if (aarch64_float_const_zero_rtx_p (x))
7752 {
7753 fputc ('0', f);
7754 break;
7755 }
7756 else if (aarch64_float_const_representable_p (x))
7757 {
7758 #define buf_size 20
7759 char float_buf[buf_size] = {'\0'};
7760 real_to_decimal_for_mode (float_buf,
7761 CONST_DOUBLE_REAL_VALUE (x),
7762 buf_size, buf_size,
7763 1, GET_MODE (x));
7764 asm_fprintf (asm_out_file, "%s", float_buf);
7765 break;
7766 #undef buf_size
7767 }
7768 output_operand_lossage ("invalid constant");
7769 return;
7770 default:
7771 output_operand_lossage ("invalid operand");
7772 return;
7773 }
7774 break;
7775
7776 case 'A':
7777 if (GET_CODE (x) == HIGH)
7778 x = XEXP (x, 0);
7779
7780 switch (aarch64_classify_symbolic_expression (x))
7781 {
7782 case SYMBOL_SMALL_GOT_4G:
7783 asm_fprintf (asm_out_file, ":got:");
7784 break;
7785
7786 case SYMBOL_SMALL_TLSGD:
7787 asm_fprintf (asm_out_file, ":tlsgd:");
7788 break;
7789
7790 case SYMBOL_SMALL_TLSDESC:
7791 asm_fprintf (asm_out_file, ":tlsdesc:");
7792 break;
7793
7794 case SYMBOL_SMALL_TLSIE:
7795 asm_fprintf (asm_out_file, ":gottprel:");
7796 break;
7797
7798 case SYMBOL_TLSLE24:
7799 asm_fprintf (asm_out_file, ":tprel:");
7800 break;
7801
7802 case SYMBOL_TINY_GOT:
7803 gcc_unreachable ();
7804 break;
7805
7806 default:
7807 break;
7808 }
7809 output_addr_const (asm_out_file, x);
7810 break;
7811
7812 case 'L':
7813 switch (aarch64_classify_symbolic_expression (x))
7814 {
7815 case SYMBOL_SMALL_GOT_4G:
7816 asm_fprintf (asm_out_file, ":lo12:");
7817 break;
7818
7819 case SYMBOL_SMALL_TLSGD:
7820 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7821 break;
7822
7823 case SYMBOL_SMALL_TLSDESC:
7824 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7825 break;
7826
7827 case SYMBOL_SMALL_TLSIE:
7828 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7829 break;
7830
7831 case SYMBOL_TLSLE12:
7832 asm_fprintf (asm_out_file, ":tprel_lo12:");
7833 break;
7834
7835 case SYMBOL_TLSLE24:
7836 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7837 break;
7838
7839 case SYMBOL_TINY_GOT:
7840 asm_fprintf (asm_out_file, ":got:");
7841 break;
7842
7843 case SYMBOL_TINY_TLSIE:
7844 asm_fprintf (asm_out_file, ":gottprel:");
7845 break;
7846
7847 default:
7848 break;
7849 }
7850 output_addr_const (asm_out_file, x);
7851 break;
7852
7853 case 'G':
7854 switch (aarch64_classify_symbolic_expression (x))
7855 {
7856 case SYMBOL_TLSLE24:
7857 asm_fprintf (asm_out_file, ":tprel_hi12:");
7858 break;
7859 default:
7860 break;
7861 }
7862 output_addr_const (asm_out_file, x);
7863 break;
7864
7865 case 'k':
7866 {
7867 HOST_WIDE_INT cond_code;
7868
7869 if (!CONST_INT_P (x))
7870 {
7871 output_operand_lossage ("invalid operand for '%%%c'", code);
7872 return;
7873 }
7874
7875 cond_code = INTVAL (x);
7876 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7877 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7878 }
7879 break;
7880
7881 case 'y':
7882 case 'z':
7883 {
7884 machine_mode mode = GET_MODE (x);
7885
7886 if (GET_CODE (x) != MEM
7887 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7888 {
7889 output_operand_lossage ("invalid operand for '%%%c'", code);
7890 return;
7891 }
7892
7893 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7894 code == 'y'
7895 ? ADDR_QUERY_LDP_STP_N
7896 : ADDR_QUERY_LDP_STP))
7897 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7898 }
7899 break;
7900
7901 default:
7902 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7903 return;
7904 }
7905 }
7906
7907 /* Print address 'x' of a memory access with mode 'mode'.
7908 'op' is the context required by aarch64_classify_address. It can either be
7909 MEM for a normal memory access or PARALLEL for LDP/STP. */
7910 static bool
7911 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7912 aarch64_addr_query_type type)
7913 {
7914 struct aarch64_address_info addr;
7915 unsigned int size;
7916
7917 /* Check all addresses are Pmode - including ILP32. */
7918 if (GET_MODE (x) != Pmode
7919 && (!CONST_INT_P (x)
7920 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
7921 {
7922 output_operand_lossage ("invalid address mode");
7923 return false;
7924 }
7925
7926 if (aarch64_classify_address (&addr, x, mode, true, type))
7927 switch (addr.type)
7928 {
7929 case ADDRESS_REG_IMM:
7930 if (known_eq (addr.const_offset, 0))
7931 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7932 else if (aarch64_sve_data_mode_p (mode))
7933 {
7934 HOST_WIDE_INT vnum
7935 = exact_div (addr.const_offset,
7936 BYTES_PER_SVE_VECTOR).to_constant ();
7937 asm_fprintf (f, "[%s, #%wd, mul vl]",
7938 reg_names[REGNO (addr.base)], vnum);
7939 }
7940 else if (aarch64_sve_pred_mode_p (mode))
7941 {
7942 HOST_WIDE_INT vnum
7943 = exact_div (addr.const_offset,
7944 BYTES_PER_SVE_PRED).to_constant ();
7945 asm_fprintf (f, "[%s, #%wd, mul vl]",
7946 reg_names[REGNO (addr.base)], vnum);
7947 }
7948 else
7949 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7950 INTVAL (addr.offset));
7951 return true;
7952
7953 case ADDRESS_REG_REG:
7954 if (addr.shift == 0)
7955 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7956 reg_names [REGNO (addr.offset)]);
7957 else
7958 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7959 reg_names [REGNO (addr.offset)], addr.shift);
7960 return true;
7961
7962 case ADDRESS_REG_UXTW:
7963 if (addr.shift == 0)
7964 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7965 REGNO (addr.offset) - R0_REGNUM);
7966 else
7967 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7968 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7969 return true;
7970
7971 case ADDRESS_REG_SXTW:
7972 if (addr.shift == 0)
7973 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7974 REGNO (addr.offset) - R0_REGNUM);
7975 else
7976 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7977 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7978 return true;
7979
7980 case ADDRESS_REG_WB:
7981 /* Writeback is only supported for fixed-width modes. */
7982 size = GET_MODE_SIZE (mode).to_constant ();
7983 switch (GET_CODE (x))
7984 {
7985 case PRE_INC:
7986 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7987 return true;
7988 case POST_INC:
7989 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7990 return true;
7991 case PRE_DEC:
7992 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7993 return true;
7994 case POST_DEC:
7995 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7996 return true;
7997 case PRE_MODIFY:
7998 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7999 INTVAL (addr.offset));
8000 return true;
8001 case POST_MODIFY:
8002 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8003 INTVAL (addr.offset));
8004 return true;
8005 default:
8006 break;
8007 }
8008 break;
8009
8010 case ADDRESS_LO_SUM:
8011 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8012 output_addr_const (f, addr.offset);
8013 asm_fprintf (f, "]");
8014 return true;
8015
8016 case ADDRESS_SYMBOLIC:
8017 output_addr_const (f, x);
8018 return true;
8019 }
8020
8021 return false;
8022 }
8023
8024 /* Print address 'x' of a memory access with mode 'mode'. */
8025 static void
8026 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8027 {
8028 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8029 output_addr_const (f, x);
8030 }
8031
8032 bool
8033 aarch64_label_mentioned_p (rtx x)
8034 {
8035 const char *fmt;
8036 int i;
8037
8038 if (GET_CODE (x) == LABEL_REF)
8039 return true;
8040
8041 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8042 referencing instruction, but they are constant offsets, not
8043 symbols. */
8044 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8045 return false;
8046
8047 fmt = GET_RTX_FORMAT (GET_CODE (x));
8048 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8049 {
8050 if (fmt[i] == 'E')
8051 {
8052 int j;
8053
8054 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8055 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8056 return 1;
8057 }
8058 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8059 return 1;
8060 }
8061
8062 return 0;
8063 }
8064
8065 /* Implement REGNO_REG_CLASS. */
8066
8067 enum reg_class
8068 aarch64_regno_regclass (unsigned regno)
8069 {
8070 if (GP_REGNUM_P (regno))
8071 return GENERAL_REGS;
8072
8073 if (regno == SP_REGNUM)
8074 return STACK_REG;
8075
8076 if (regno == FRAME_POINTER_REGNUM
8077 || regno == ARG_POINTER_REGNUM)
8078 return POINTER_REGS;
8079
8080 if (FP_REGNUM_P (regno))
8081 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
8082
8083 if (PR_REGNUM_P (regno))
8084 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8085
8086 return NO_REGS;
8087 }
8088
8089 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8090 If OFFSET is out of range, return an offset of an anchor point
8091 that is in range. Return 0 otherwise. */
8092
8093 static HOST_WIDE_INT
8094 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8095 machine_mode mode)
8096 {
8097 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8098 if (size > 16)
8099 return (offset + 0x400) & ~0x7f0;
8100
8101 /* For offsets that aren't a multiple of the access size, the limit is
8102 -256...255. */
8103 if (offset & (size - 1))
8104 {
8105 /* BLKmode typically uses LDP of X-registers. */
8106 if (mode == BLKmode)
8107 return (offset + 512) & ~0x3ff;
8108 return (offset + 0x100) & ~0x1ff;
8109 }
8110
8111 /* Small negative offsets are supported. */
8112 if (IN_RANGE (offset, -256, 0))
8113 return 0;
8114
8115 if (mode == TImode || mode == TFmode)
8116 return (offset + 0x100) & ~0x1ff;
8117
8118 /* Use 12-bit offset by access size. */
8119 return offset & (~0xfff * size);
8120 }
8121
8122 static rtx
8123 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8124 {
8125 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8126 where mask is selected by alignment and size of the offset.
8127 We try to pick as large a range for the offset as possible to
8128 maximize the chance of a CSE. However, for aligned addresses
8129 we limit the range to 4k so that structures with different sized
8130 elements are likely to use the same base. We need to be careful
8131 not to split a CONST for some forms of address expression, otherwise
8132 it will generate sub-optimal code. */
8133
8134 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8135 {
8136 rtx base = XEXP (x, 0);
8137 rtx offset_rtx = XEXP (x, 1);
8138 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8139
8140 if (GET_CODE (base) == PLUS)
8141 {
8142 rtx op0 = XEXP (base, 0);
8143 rtx op1 = XEXP (base, 1);
8144
8145 /* Force any scaling into a temp for CSE. */
8146 op0 = force_reg (Pmode, op0);
8147 op1 = force_reg (Pmode, op1);
8148
8149 /* Let the pointer register be in op0. */
8150 if (REG_POINTER (op1))
8151 std::swap (op0, op1);
8152
8153 /* If the pointer is virtual or frame related, then we know that
8154 virtual register instantiation or register elimination is going
8155 to apply a second constant. We want the two constants folded
8156 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8157 if (virt_or_elim_regno_p (REGNO (op0)))
8158 {
8159 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8160 NULL_RTX, true, OPTAB_DIRECT);
8161 return gen_rtx_PLUS (Pmode, base, op1);
8162 }
8163
8164 /* Otherwise, in order to encourage CSE (and thence loop strength
8165 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8166 base = expand_binop (Pmode, add_optab, op0, op1,
8167 NULL_RTX, true, OPTAB_DIRECT);
8168 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8169 }
8170
8171 HOST_WIDE_INT size;
8172 if (GET_MODE_SIZE (mode).is_constant (&size))
8173 {
8174 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8175 mode);
8176 if (base_offset != 0)
8177 {
8178 base = plus_constant (Pmode, base, base_offset);
8179 base = force_operand (base, NULL_RTX);
8180 return plus_constant (Pmode, base, offset - base_offset);
8181 }
8182 }
8183 }
8184
8185 return x;
8186 }
8187
8188 static reg_class_t
8189 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8190 reg_class_t rclass,
8191 machine_mode mode,
8192 secondary_reload_info *sri)
8193 {
8194 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8195 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8196 comment at the head of aarch64-sve.md for more details about the
8197 big-endian handling. */
8198 if (BYTES_BIG_ENDIAN
8199 && reg_class_subset_p (rclass, FP_REGS)
8200 && !((REG_P (x) && HARD_REGISTER_P (x))
8201 || aarch64_simd_valid_immediate (x, NULL))
8202 && aarch64_sve_data_mode_p (mode))
8203 {
8204 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8205 return NO_REGS;
8206 }
8207
8208 /* If we have to disable direct literal pool loads and stores because the
8209 function is too big, then we need a scratch register. */
8210 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8211 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8212 || targetm.vector_mode_supported_p (GET_MODE (x)))
8213 && !aarch64_pcrelative_literal_loads)
8214 {
8215 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8216 return NO_REGS;
8217 }
8218
8219 /* Without the TARGET_SIMD instructions we cannot move a Q register
8220 to a Q register directly. We need a scratch. */
8221 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8222 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8223 && reg_class_subset_p (rclass, FP_REGS))
8224 {
8225 sri->icode = code_for_aarch64_reload_mov (mode);
8226 return NO_REGS;
8227 }
8228
8229 /* A TFmode or TImode memory access should be handled via an FP_REGS
8230 because AArch64 has richer addressing modes for LDR/STR instructions
8231 than LDP/STP instructions. */
8232 if (TARGET_FLOAT && rclass == GENERAL_REGS
8233 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8234 return FP_REGS;
8235
8236 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8237 return GENERAL_REGS;
8238
8239 return NO_REGS;
8240 }
8241
8242 static bool
8243 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8244 {
8245 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8246
8247 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8248 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8249 if (frame_pointer_needed)
8250 return to == HARD_FRAME_POINTER_REGNUM;
8251 return true;
8252 }
8253
8254 poly_int64
8255 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8256 {
8257 if (to == HARD_FRAME_POINTER_REGNUM)
8258 {
8259 if (from == ARG_POINTER_REGNUM)
8260 return cfun->machine->frame.hard_fp_offset;
8261
8262 if (from == FRAME_POINTER_REGNUM)
8263 return cfun->machine->frame.hard_fp_offset
8264 - cfun->machine->frame.locals_offset;
8265 }
8266
8267 if (to == STACK_POINTER_REGNUM)
8268 {
8269 if (from == FRAME_POINTER_REGNUM)
8270 return cfun->machine->frame.frame_size
8271 - cfun->machine->frame.locals_offset;
8272 }
8273
8274 return cfun->machine->frame.frame_size;
8275 }
8276
8277 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8278 previous frame. */
8279
8280 rtx
8281 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8282 {
8283 if (count != 0)
8284 return const0_rtx;
8285 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8286 }
8287
8288
8289 static void
8290 aarch64_asm_trampoline_template (FILE *f)
8291 {
8292 int offset1 = 16;
8293 int offset2 = 20;
8294
8295 if (aarch64_bti_enabled ())
8296 {
8297 asm_fprintf (f, "\thint\t34 // bti c\n");
8298 offset1 -= 4;
8299 offset2 -= 4;
8300 }
8301
8302 if (TARGET_ILP32)
8303 {
8304 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8305 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8306 offset1);
8307 }
8308 else
8309 {
8310 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8311 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8312 offset2);
8313 }
8314 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8315
8316 /* The trampoline needs an extra padding instruction. In case if BTI is
8317 enabled the padding instruction is replaced by the BTI instruction at
8318 the beginning. */
8319 if (!aarch64_bti_enabled ())
8320 assemble_aligned_integer (4, const0_rtx);
8321
8322 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8323 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8324 }
8325
8326 static void
8327 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8328 {
8329 rtx fnaddr, mem, a_tramp;
8330 const int tramp_code_sz = 16;
8331
8332 /* Don't need to copy the trailing D-words, we fill those in below. */
8333 emit_block_move (m_tramp, assemble_trampoline_template (),
8334 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8335 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8336 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8337 if (GET_MODE (fnaddr) != ptr_mode)
8338 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8339 emit_move_insn (mem, fnaddr);
8340
8341 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8342 emit_move_insn (mem, chain_value);
8343
8344 /* XXX We should really define a "clear_cache" pattern and use
8345 gen_clear_cache(). */
8346 a_tramp = XEXP (m_tramp, 0);
8347 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8348 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8349 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8350 ptr_mode);
8351 }
8352
8353 static unsigned char
8354 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8355 {
8356 /* ??? Logically we should only need to provide a value when
8357 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8358 can hold MODE, but at the moment we need to handle all modes.
8359 Just ignore any runtime parts for registers that can't store them. */
8360 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8361 unsigned int nregs;
8362 switch (regclass)
8363 {
8364 case TAILCALL_ADDR_REGS:
8365 case POINTER_REGS:
8366 case GENERAL_REGS:
8367 case ALL_REGS:
8368 case POINTER_AND_FP_REGS:
8369 case FP_REGS:
8370 case FP_LO_REGS:
8371 if (aarch64_sve_data_mode_p (mode)
8372 && constant_multiple_p (GET_MODE_SIZE (mode),
8373 BYTES_PER_SVE_VECTOR, &nregs))
8374 return nregs;
8375 return (aarch64_vector_data_mode_p (mode)
8376 ? CEIL (lowest_size, UNITS_PER_VREG)
8377 : CEIL (lowest_size, UNITS_PER_WORD));
8378 case STACK_REG:
8379 case PR_REGS:
8380 case PR_LO_REGS:
8381 case PR_HI_REGS:
8382 return 1;
8383
8384 case NO_REGS:
8385 return 0;
8386
8387 default:
8388 break;
8389 }
8390 gcc_unreachable ();
8391 }
8392
8393 static reg_class_t
8394 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8395 {
8396 if (regclass == POINTER_REGS)
8397 return GENERAL_REGS;
8398
8399 if (regclass == STACK_REG)
8400 {
8401 if (REG_P(x)
8402 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8403 return regclass;
8404
8405 return NO_REGS;
8406 }
8407
8408 /* Register eliminiation can result in a request for
8409 SP+constant->FP_REGS. We cannot support such operations which
8410 use SP as source and an FP_REG as destination, so reject out
8411 right now. */
8412 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8413 {
8414 rtx lhs = XEXP (x, 0);
8415
8416 /* Look through a possible SUBREG introduced by ILP32. */
8417 if (GET_CODE (lhs) == SUBREG)
8418 lhs = SUBREG_REG (lhs);
8419
8420 gcc_assert (REG_P (lhs));
8421 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8422 POINTER_REGS));
8423 return NO_REGS;
8424 }
8425
8426 return regclass;
8427 }
8428
8429 void
8430 aarch64_asm_output_labelref (FILE* f, const char *name)
8431 {
8432 asm_fprintf (f, "%U%s", name);
8433 }
8434
8435 static void
8436 aarch64_elf_asm_constructor (rtx symbol, int priority)
8437 {
8438 if (priority == DEFAULT_INIT_PRIORITY)
8439 default_ctor_section_asm_out_constructor (symbol, priority);
8440 else
8441 {
8442 section *s;
8443 /* While priority is known to be in range [0, 65535], so 18 bytes
8444 would be enough, the compiler might not know that. To avoid
8445 -Wformat-truncation false positive, use a larger size. */
8446 char buf[23];
8447 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8448 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8449 switch_to_section (s);
8450 assemble_align (POINTER_SIZE);
8451 assemble_aligned_integer (POINTER_BYTES, symbol);
8452 }
8453 }
8454
8455 static void
8456 aarch64_elf_asm_destructor (rtx symbol, int priority)
8457 {
8458 if (priority == DEFAULT_INIT_PRIORITY)
8459 default_dtor_section_asm_out_destructor (symbol, priority);
8460 else
8461 {
8462 section *s;
8463 /* While priority is known to be in range [0, 65535], so 18 bytes
8464 would be enough, the compiler might not know that. To avoid
8465 -Wformat-truncation false positive, use a larger size. */
8466 char buf[23];
8467 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8468 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8469 switch_to_section (s);
8470 assemble_align (POINTER_SIZE);
8471 assemble_aligned_integer (POINTER_BYTES, symbol);
8472 }
8473 }
8474
8475 const char*
8476 aarch64_output_casesi (rtx *operands)
8477 {
8478 char buf[100];
8479 char label[100];
8480 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8481 int index;
8482 static const char *const patterns[4][2] =
8483 {
8484 {
8485 "ldrb\t%w3, [%0,%w1,uxtw]",
8486 "add\t%3, %4, %w3, sxtb #2"
8487 },
8488 {
8489 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8490 "add\t%3, %4, %w3, sxth #2"
8491 },
8492 {
8493 "ldr\t%w3, [%0,%w1,uxtw #2]",
8494 "add\t%3, %4, %w3, sxtw #2"
8495 },
8496 /* We assume that DImode is only generated when not optimizing and
8497 that we don't really need 64-bit address offsets. That would
8498 imply an object file with 8GB of code in a single function! */
8499 {
8500 "ldr\t%w3, [%0,%w1,uxtw #2]",
8501 "add\t%3, %4, %w3, sxtw #2"
8502 }
8503 };
8504
8505 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8506
8507 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8508 index = exact_log2 (GET_MODE_SIZE (mode));
8509
8510 gcc_assert (index >= 0 && index <= 3);
8511
8512 /* Need to implement table size reduction, by chaning the code below. */
8513 output_asm_insn (patterns[index][0], operands);
8514 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8515 snprintf (buf, sizeof (buf),
8516 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8517 output_asm_insn (buf, operands);
8518 output_asm_insn (patterns[index][1], operands);
8519 output_asm_insn ("br\t%3", operands);
8520 assemble_label (asm_out_file, label);
8521 return "";
8522 }
8523
8524
8525 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8526 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8527 operator. */
8528
8529 int
8530 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8531 {
8532 if (shift >= 0 && shift <= 3)
8533 {
8534 int size;
8535 for (size = 8; size <= 32; size *= 2)
8536 {
8537 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8538 if (mask == bits << shift)
8539 return size;
8540 }
8541 }
8542 return 0;
8543 }
8544
8545 /* Constant pools are per function only when PC relative
8546 literal loads are true or we are in the large memory
8547 model. */
8548
8549 static inline bool
8550 aarch64_can_use_per_function_literal_pools_p (void)
8551 {
8552 return (aarch64_pcrelative_literal_loads
8553 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8554 }
8555
8556 static bool
8557 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8558 {
8559 /* We can't use blocks for constants when we're using a per-function
8560 constant pool. */
8561 return !aarch64_can_use_per_function_literal_pools_p ();
8562 }
8563
8564 /* Select appropriate section for constants depending
8565 on where we place literal pools. */
8566
8567 static section *
8568 aarch64_select_rtx_section (machine_mode mode,
8569 rtx x,
8570 unsigned HOST_WIDE_INT align)
8571 {
8572 if (aarch64_can_use_per_function_literal_pools_p ())
8573 return function_section (current_function_decl);
8574
8575 return default_elf_select_rtx_section (mode, x, align);
8576 }
8577
8578 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8579 void
8580 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8581 HOST_WIDE_INT offset)
8582 {
8583 /* When using per-function literal pools, we must ensure that any code
8584 section is aligned to the minimal instruction length, lest we get
8585 errors from the assembler re "unaligned instructions". */
8586 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8587 ASM_OUTPUT_ALIGN (f, 2);
8588 }
8589
8590 /* Costs. */
8591
8592 /* Helper function for rtx cost calculation. Strip a shift expression
8593 from X. Returns the inner operand if successful, or the original
8594 expression on failure. */
8595 static rtx
8596 aarch64_strip_shift (rtx x)
8597 {
8598 rtx op = x;
8599
8600 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8601 we can convert both to ROR during final output. */
8602 if ((GET_CODE (op) == ASHIFT
8603 || GET_CODE (op) == ASHIFTRT
8604 || GET_CODE (op) == LSHIFTRT
8605 || GET_CODE (op) == ROTATERT
8606 || GET_CODE (op) == ROTATE)
8607 && CONST_INT_P (XEXP (op, 1)))
8608 return XEXP (op, 0);
8609
8610 if (GET_CODE (op) == MULT
8611 && CONST_INT_P (XEXP (op, 1))
8612 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8613 return XEXP (op, 0);
8614
8615 return x;
8616 }
8617
8618 /* Helper function for rtx cost calculation. Strip an extend
8619 expression from X. Returns the inner operand if successful, or the
8620 original expression on failure. We deal with a number of possible
8621 canonicalization variations here. If STRIP_SHIFT is true, then
8622 we can strip off a shift also. */
8623 static rtx
8624 aarch64_strip_extend (rtx x, bool strip_shift)
8625 {
8626 scalar_int_mode mode;
8627 rtx op = x;
8628
8629 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8630 return op;
8631
8632 /* Zero and sign extraction of a widened value. */
8633 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8634 && XEXP (op, 2) == const0_rtx
8635 && GET_CODE (XEXP (op, 0)) == MULT
8636 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8637 XEXP (op, 1)))
8638 return XEXP (XEXP (op, 0), 0);
8639
8640 /* It can also be represented (for zero-extend) as an AND with an
8641 immediate. */
8642 if (GET_CODE (op) == AND
8643 && GET_CODE (XEXP (op, 0)) == MULT
8644 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8645 && CONST_INT_P (XEXP (op, 1))
8646 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8647 INTVAL (XEXP (op, 1))) != 0)
8648 return XEXP (XEXP (op, 0), 0);
8649
8650 /* Now handle extended register, as this may also have an optional
8651 left shift by 1..4. */
8652 if (strip_shift
8653 && GET_CODE (op) == ASHIFT
8654 && CONST_INT_P (XEXP (op, 1))
8655 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8656 op = XEXP (op, 0);
8657
8658 if (GET_CODE (op) == ZERO_EXTEND
8659 || GET_CODE (op) == SIGN_EXTEND)
8660 op = XEXP (op, 0);
8661
8662 if (op != x)
8663 return op;
8664
8665 return x;
8666 }
8667
8668 /* Return true iff CODE is a shift supported in combination
8669 with arithmetic instructions. */
8670
8671 static bool
8672 aarch64_shift_p (enum rtx_code code)
8673 {
8674 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8675 }
8676
8677
8678 /* Return true iff X is a cheap shift without a sign extend. */
8679
8680 static bool
8681 aarch64_cheap_mult_shift_p (rtx x)
8682 {
8683 rtx op0, op1;
8684
8685 op0 = XEXP (x, 0);
8686 op1 = XEXP (x, 1);
8687
8688 if (!(aarch64_tune_params.extra_tuning_flags
8689 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8690 return false;
8691
8692 if (GET_CODE (op0) == SIGN_EXTEND)
8693 return false;
8694
8695 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8696 && UINTVAL (op1) <= 4)
8697 return true;
8698
8699 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8700 return false;
8701
8702 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8703
8704 if (l2 > 0 && l2 <= 4)
8705 return true;
8706
8707 return false;
8708 }
8709
8710 /* Helper function for rtx cost calculation. Calculate the cost of
8711 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8712 Return the calculated cost of the expression, recursing manually in to
8713 operands where needed. */
8714
8715 static int
8716 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8717 {
8718 rtx op0, op1;
8719 const struct cpu_cost_table *extra_cost
8720 = aarch64_tune_params.insn_extra_cost;
8721 int cost = 0;
8722 bool compound_p = (outer == PLUS || outer == MINUS);
8723 machine_mode mode = GET_MODE (x);
8724
8725 gcc_checking_assert (code == MULT);
8726
8727 op0 = XEXP (x, 0);
8728 op1 = XEXP (x, 1);
8729
8730 if (VECTOR_MODE_P (mode))
8731 mode = GET_MODE_INNER (mode);
8732
8733 /* Integer multiply/fma. */
8734 if (GET_MODE_CLASS (mode) == MODE_INT)
8735 {
8736 /* The multiply will be canonicalized as a shift, cost it as such. */
8737 if (aarch64_shift_p (GET_CODE (x))
8738 || (CONST_INT_P (op1)
8739 && exact_log2 (INTVAL (op1)) > 0))
8740 {
8741 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8742 || GET_CODE (op0) == SIGN_EXTEND;
8743 if (speed)
8744 {
8745 if (compound_p)
8746 {
8747 /* If the shift is considered cheap,
8748 then don't add any cost. */
8749 if (aarch64_cheap_mult_shift_p (x))
8750 ;
8751 else if (REG_P (op1))
8752 /* ARITH + shift-by-register. */
8753 cost += extra_cost->alu.arith_shift_reg;
8754 else if (is_extend)
8755 /* ARITH + extended register. We don't have a cost field
8756 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8757 cost += extra_cost->alu.extend_arith;
8758 else
8759 /* ARITH + shift-by-immediate. */
8760 cost += extra_cost->alu.arith_shift;
8761 }
8762 else
8763 /* LSL (immediate). */
8764 cost += extra_cost->alu.shift;
8765
8766 }
8767 /* Strip extends as we will have costed them in the case above. */
8768 if (is_extend)
8769 op0 = aarch64_strip_extend (op0, true);
8770
8771 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8772
8773 return cost;
8774 }
8775
8776 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8777 compound and let the below cases handle it. After all, MNEG is a
8778 special-case alias of MSUB. */
8779 if (GET_CODE (op0) == NEG)
8780 {
8781 op0 = XEXP (op0, 0);
8782 compound_p = true;
8783 }
8784
8785 /* Integer multiplies or FMAs have zero/sign extending variants. */
8786 if ((GET_CODE (op0) == ZERO_EXTEND
8787 && GET_CODE (op1) == ZERO_EXTEND)
8788 || (GET_CODE (op0) == SIGN_EXTEND
8789 && GET_CODE (op1) == SIGN_EXTEND))
8790 {
8791 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8792 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8793
8794 if (speed)
8795 {
8796 if (compound_p)
8797 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8798 cost += extra_cost->mult[0].extend_add;
8799 else
8800 /* MUL/SMULL/UMULL. */
8801 cost += extra_cost->mult[0].extend;
8802 }
8803
8804 return cost;
8805 }
8806
8807 /* This is either an integer multiply or a MADD. In both cases
8808 we want to recurse and cost the operands. */
8809 cost += rtx_cost (op0, mode, MULT, 0, speed);
8810 cost += rtx_cost (op1, mode, MULT, 1, speed);
8811
8812 if (speed)
8813 {
8814 if (compound_p)
8815 /* MADD/MSUB. */
8816 cost += extra_cost->mult[mode == DImode].add;
8817 else
8818 /* MUL. */
8819 cost += extra_cost->mult[mode == DImode].simple;
8820 }
8821
8822 return cost;
8823 }
8824 else
8825 {
8826 if (speed)
8827 {
8828 /* Floating-point FMA/FMUL can also support negations of the
8829 operands, unless the rounding mode is upward or downward in
8830 which case FNMUL is different than FMUL with operand negation. */
8831 bool neg0 = GET_CODE (op0) == NEG;
8832 bool neg1 = GET_CODE (op1) == NEG;
8833 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8834 {
8835 if (neg0)
8836 op0 = XEXP (op0, 0);
8837 if (neg1)
8838 op1 = XEXP (op1, 0);
8839 }
8840
8841 if (compound_p)
8842 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8843 cost += extra_cost->fp[mode == DFmode].fma;
8844 else
8845 /* FMUL/FNMUL. */
8846 cost += extra_cost->fp[mode == DFmode].mult;
8847 }
8848
8849 cost += rtx_cost (op0, mode, MULT, 0, speed);
8850 cost += rtx_cost (op1, mode, MULT, 1, speed);
8851 return cost;
8852 }
8853 }
8854
8855 static int
8856 aarch64_address_cost (rtx x,
8857 machine_mode mode,
8858 addr_space_t as ATTRIBUTE_UNUSED,
8859 bool speed)
8860 {
8861 enum rtx_code c = GET_CODE (x);
8862 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8863 struct aarch64_address_info info;
8864 int cost = 0;
8865 info.shift = 0;
8866
8867 if (!aarch64_classify_address (&info, x, mode, false))
8868 {
8869 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8870 {
8871 /* This is a CONST or SYMBOL ref which will be split
8872 in a different way depending on the code model in use.
8873 Cost it through the generic infrastructure. */
8874 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8875 /* Divide through by the cost of one instruction to
8876 bring it to the same units as the address costs. */
8877 cost_symbol_ref /= COSTS_N_INSNS (1);
8878 /* The cost is then the cost of preparing the address,
8879 followed by an immediate (possibly 0) offset. */
8880 return cost_symbol_ref + addr_cost->imm_offset;
8881 }
8882 else
8883 {
8884 /* This is most likely a jump table from a case
8885 statement. */
8886 return addr_cost->register_offset;
8887 }
8888 }
8889
8890 switch (info.type)
8891 {
8892 case ADDRESS_LO_SUM:
8893 case ADDRESS_SYMBOLIC:
8894 case ADDRESS_REG_IMM:
8895 cost += addr_cost->imm_offset;
8896 break;
8897
8898 case ADDRESS_REG_WB:
8899 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8900 cost += addr_cost->pre_modify;
8901 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8902 cost += addr_cost->post_modify;
8903 else
8904 gcc_unreachable ();
8905
8906 break;
8907
8908 case ADDRESS_REG_REG:
8909 cost += addr_cost->register_offset;
8910 break;
8911
8912 case ADDRESS_REG_SXTW:
8913 cost += addr_cost->register_sextend;
8914 break;
8915
8916 case ADDRESS_REG_UXTW:
8917 cost += addr_cost->register_zextend;
8918 break;
8919
8920 default:
8921 gcc_unreachable ();
8922 }
8923
8924
8925 if (info.shift > 0)
8926 {
8927 /* For the sake of calculating the cost of the shifted register
8928 component, we can treat same sized modes in the same way. */
8929 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8930 cost += addr_cost->addr_scale_costs.hi;
8931 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8932 cost += addr_cost->addr_scale_costs.si;
8933 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8934 cost += addr_cost->addr_scale_costs.di;
8935 else
8936 /* We can't tell, or this is a 128-bit vector. */
8937 cost += addr_cost->addr_scale_costs.ti;
8938 }
8939
8940 return cost;
8941 }
8942
8943 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8944 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8945 to be taken. */
8946
8947 int
8948 aarch64_branch_cost (bool speed_p, bool predictable_p)
8949 {
8950 /* When optimizing for speed, use the cost of unpredictable branches. */
8951 const struct cpu_branch_cost *branch_costs =
8952 aarch64_tune_params.branch_costs;
8953
8954 if (!speed_p || predictable_p)
8955 return branch_costs->predictable;
8956 else
8957 return branch_costs->unpredictable;
8958 }
8959
8960 /* Return true if the RTX X in mode MODE is a zero or sign extract
8961 usable in an ADD or SUB (extended register) instruction. */
8962 static bool
8963 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8964 {
8965 /* Catch add with a sign extract.
8966 This is add_<optab><mode>_multp2. */
8967 if (GET_CODE (x) == SIGN_EXTRACT
8968 || GET_CODE (x) == ZERO_EXTRACT)
8969 {
8970 rtx op0 = XEXP (x, 0);
8971 rtx op1 = XEXP (x, 1);
8972 rtx op2 = XEXP (x, 2);
8973
8974 if (GET_CODE (op0) == MULT
8975 && CONST_INT_P (op1)
8976 && op2 == const0_rtx
8977 && CONST_INT_P (XEXP (op0, 1))
8978 && aarch64_is_extend_from_extract (mode,
8979 XEXP (op0, 1),
8980 op1))
8981 {
8982 return true;
8983 }
8984 }
8985 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8986 No shift. */
8987 else if (GET_CODE (x) == SIGN_EXTEND
8988 || GET_CODE (x) == ZERO_EXTEND)
8989 return REG_P (XEXP (x, 0));
8990
8991 return false;
8992 }
8993
8994 static bool
8995 aarch64_frint_unspec_p (unsigned int u)
8996 {
8997 switch (u)
8998 {
8999 case UNSPEC_FRINTZ:
9000 case UNSPEC_FRINTP:
9001 case UNSPEC_FRINTM:
9002 case UNSPEC_FRINTA:
9003 case UNSPEC_FRINTN:
9004 case UNSPEC_FRINTX:
9005 case UNSPEC_FRINTI:
9006 return true;
9007
9008 default:
9009 return false;
9010 }
9011 }
9012
9013 /* Return true iff X is an rtx that will match an extr instruction
9014 i.e. as described in the *extr<mode>5_insn family of patterns.
9015 OP0 and OP1 will be set to the operands of the shifts involved
9016 on success and will be NULL_RTX otherwise. */
9017
9018 static bool
9019 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9020 {
9021 rtx op0, op1;
9022 scalar_int_mode mode;
9023 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9024 return false;
9025
9026 *res_op0 = NULL_RTX;
9027 *res_op1 = NULL_RTX;
9028
9029 if (GET_CODE (x) != IOR)
9030 return false;
9031
9032 op0 = XEXP (x, 0);
9033 op1 = XEXP (x, 1);
9034
9035 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9036 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9037 {
9038 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9039 if (GET_CODE (op1) == ASHIFT)
9040 std::swap (op0, op1);
9041
9042 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9043 return false;
9044
9045 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9046 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9047
9048 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9049 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9050 {
9051 *res_op0 = XEXP (op0, 0);
9052 *res_op1 = XEXP (op1, 0);
9053 return true;
9054 }
9055 }
9056
9057 return false;
9058 }
9059
9060 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9061 storing it in *COST. Result is true if the total cost of the operation
9062 has now been calculated. */
9063 static bool
9064 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9065 {
9066 rtx inner;
9067 rtx comparator;
9068 enum rtx_code cmpcode;
9069
9070 if (COMPARISON_P (op0))
9071 {
9072 inner = XEXP (op0, 0);
9073 comparator = XEXP (op0, 1);
9074 cmpcode = GET_CODE (op0);
9075 }
9076 else
9077 {
9078 inner = op0;
9079 comparator = const0_rtx;
9080 cmpcode = NE;
9081 }
9082
9083 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9084 {
9085 /* Conditional branch. */
9086 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9087 return true;
9088 else
9089 {
9090 if (cmpcode == NE || cmpcode == EQ)
9091 {
9092 if (comparator == const0_rtx)
9093 {
9094 /* TBZ/TBNZ/CBZ/CBNZ. */
9095 if (GET_CODE (inner) == ZERO_EXTRACT)
9096 /* TBZ/TBNZ. */
9097 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9098 ZERO_EXTRACT, 0, speed);
9099 else
9100 /* CBZ/CBNZ. */
9101 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9102
9103 return true;
9104 }
9105 }
9106 else if (cmpcode == LT || cmpcode == GE)
9107 {
9108 /* TBZ/TBNZ. */
9109 if (comparator == const0_rtx)
9110 return true;
9111 }
9112 }
9113 }
9114 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9115 {
9116 /* CCMP. */
9117 if (GET_CODE (op1) == COMPARE)
9118 {
9119 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9120 if (XEXP (op1, 1) == const0_rtx)
9121 *cost += 1;
9122 if (speed)
9123 {
9124 machine_mode mode = GET_MODE (XEXP (op1, 0));
9125 const struct cpu_cost_table *extra_cost
9126 = aarch64_tune_params.insn_extra_cost;
9127
9128 if (GET_MODE_CLASS (mode) == MODE_INT)
9129 *cost += extra_cost->alu.arith;
9130 else
9131 *cost += extra_cost->fp[mode == DFmode].compare;
9132 }
9133 return true;
9134 }
9135
9136 /* It's a conditional operation based on the status flags,
9137 so it must be some flavor of CSEL. */
9138
9139 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9140 if (GET_CODE (op1) == NEG
9141 || GET_CODE (op1) == NOT
9142 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9143 op1 = XEXP (op1, 0);
9144 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9145 {
9146 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9147 op1 = XEXP (op1, 0);
9148 op2 = XEXP (op2, 0);
9149 }
9150
9151 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9152 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9153 return true;
9154 }
9155
9156 /* We don't know what this is, cost all operands. */
9157 return false;
9158 }
9159
9160 /* Check whether X is a bitfield operation of the form shift + extend that
9161 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9162 operand to which the bitfield operation is applied. Otherwise return
9163 NULL_RTX. */
9164
9165 static rtx
9166 aarch64_extend_bitfield_pattern_p (rtx x)
9167 {
9168 rtx_code outer_code = GET_CODE (x);
9169 machine_mode outer_mode = GET_MODE (x);
9170
9171 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9172 && outer_mode != SImode && outer_mode != DImode)
9173 return NULL_RTX;
9174
9175 rtx inner = XEXP (x, 0);
9176 rtx_code inner_code = GET_CODE (inner);
9177 machine_mode inner_mode = GET_MODE (inner);
9178 rtx op = NULL_RTX;
9179
9180 switch (inner_code)
9181 {
9182 case ASHIFT:
9183 if (CONST_INT_P (XEXP (inner, 1))
9184 && (inner_mode == QImode || inner_mode == HImode))
9185 op = XEXP (inner, 0);
9186 break;
9187 case LSHIFTRT:
9188 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9189 && (inner_mode == QImode || inner_mode == HImode))
9190 op = XEXP (inner, 0);
9191 break;
9192 case ASHIFTRT:
9193 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9194 && (inner_mode == QImode || inner_mode == HImode))
9195 op = XEXP (inner, 0);
9196 break;
9197 default:
9198 break;
9199 }
9200
9201 return op;
9202 }
9203
9204 /* Return true if the mask and a shift amount from an RTX of the form
9205 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9206 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9207
9208 bool
9209 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9210 rtx shft_amnt)
9211 {
9212 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9213 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9214 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9215 && (INTVAL (mask)
9216 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9217 }
9218
9219 /* Calculate the cost of calculating X, storing it in *COST. Result
9220 is true if the total cost of the operation has now been calculated. */
9221 static bool
9222 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9223 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9224 {
9225 rtx op0, op1, op2;
9226 const struct cpu_cost_table *extra_cost
9227 = aarch64_tune_params.insn_extra_cost;
9228 int code = GET_CODE (x);
9229 scalar_int_mode int_mode;
9230
9231 /* By default, assume that everything has equivalent cost to the
9232 cheapest instruction. Any additional costs are applied as a delta
9233 above this default. */
9234 *cost = COSTS_N_INSNS (1);
9235
9236 switch (code)
9237 {
9238 case SET:
9239 /* The cost depends entirely on the operands to SET. */
9240 *cost = 0;
9241 op0 = SET_DEST (x);
9242 op1 = SET_SRC (x);
9243
9244 switch (GET_CODE (op0))
9245 {
9246 case MEM:
9247 if (speed)
9248 {
9249 rtx address = XEXP (op0, 0);
9250 if (VECTOR_MODE_P (mode))
9251 *cost += extra_cost->ldst.storev;
9252 else if (GET_MODE_CLASS (mode) == MODE_INT)
9253 *cost += extra_cost->ldst.store;
9254 else if (mode == SFmode)
9255 *cost += extra_cost->ldst.storef;
9256 else if (mode == DFmode)
9257 *cost += extra_cost->ldst.stored;
9258
9259 *cost +=
9260 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9261 0, speed));
9262 }
9263
9264 *cost += rtx_cost (op1, mode, SET, 1, speed);
9265 return true;
9266
9267 case SUBREG:
9268 if (! REG_P (SUBREG_REG (op0)))
9269 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9270
9271 /* Fall through. */
9272 case REG:
9273 /* The cost is one per vector-register copied. */
9274 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9275 {
9276 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9277 *cost = COSTS_N_INSNS (nregs);
9278 }
9279 /* const0_rtx is in general free, but we will use an
9280 instruction to set a register to 0. */
9281 else if (REG_P (op1) || op1 == const0_rtx)
9282 {
9283 /* The cost is 1 per register copied. */
9284 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9285 *cost = COSTS_N_INSNS (nregs);
9286 }
9287 else
9288 /* Cost is just the cost of the RHS of the set. */
9289 *cost += rtx_cost (op1, mode, SET, 1, speed);
9290 return true;
9291
9292 case ZERO_EXTRACT:
9293 case SIGN_EXTRACT:
9294 /* Bit-field insertion. Strip any redundant widening of
9295 the RHS to meet the width of the target. */
9296 if (GET_CODE (op1) == SUBREG)
9297 op1 = SUBREG_REG (op1);
9298 if ((GET_CODE (op1) == ZERO_EXTEND
9299 || GET_CODE (op1) == SIGN_EXTEND)
9300 && CONST_INT_P (XEXP (op0, 1))
9301 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9302 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9303 op1 = XEXP (op1, 0);
9304
9305 if (CONST_INT_P (op1))
9306 {
9307 /* MOV immediate is assumed to always be cheap. */
9308 *cost = COSTS_N_INSNS (1);
9309 }
9310 else
9311 {
9312 /* BFM. */
9313 if (speed)
9314 *cost += extra_cost->alu.bfi;
9315 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9316 }
9317
9318 return true;
9319
9320 default:
9321 /* We can't make sense of this, assume default cost. */
9322 *cost = COSTS_N_INSNS (1);
9323 return false;
9324 }
9325 return false;
9326
9327 case CONST_INT:
9328 /* If an instruction can incorporate a constant within the
9329 instruction, the instruction's expression avoids calling
9330 rtx_cost() on the constant. If rtx_cost() is called on a
9331 constant, then it is usually because the constant must be
9332 moved into a register by one or more instructions.
9333
9334 The exception is constant 0, which can be expressed
9335 as XZR/WZR and is therefore free. The exception to this is
9336 if we have (set (reg) (const0_rtx)) in which case we must cost
9337 the move. However, we can catch that when we cost the SET, so
9338 we don't need to consider that here. */
9339 if (x == const0_rtx)
9340 *cost = 0;
9341 else
9342 {
9343 /* To an approximation, building any other constant is
9344 proportionally expensive to the number of instructions
9345 required to build that constant. This is true whether we
9346 are compiling for SPEED or otherwise. */
9347 if (!is_a <scalar_int_mode> (mode, &int_mode))
9348 int_mode = word_mode;
9349 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9350 (NULL_RTX, x, false, int_mode));
9351 }
9352 return true;
9353
9354 case CONST_DOUBLE:
9355
9356 /* First determine number of instructions to do the move
9357 as an integer constant. */
9358 if (!aarch64_float_const_representable_p (x)
9359 && !aarch64_can_const_movi_rtx_p (x, mode)
9360 && aarch64_float_const_rtx_p (x))
9361 {
9362 unsigned HOST_WIDE_INT ival;
9363 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9364 gcc_assert (succeed);
9365
9366 scalar_int_mode imode = (mode == HFmode
9367 ? SImode
9368 : int_mode_for_mode (mode).require ());
9369 int ncost = aarch64_internal_mov_immediate
9370 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9371 *cost += COSTS_N_INSNS (ncost);
9372 return true;
9373 }
9374
9375 if (speed)
9376 {
9377 /* mov[df,sf]_aarch64. */
9378 if (aarch64_float_const_representable_p (x))
9379 /* FMOV (scalar immediate). */
9380 *cost += extra_cost->fp[mode == DFmode].fpconst;
9381 else if (!aarch64_float_const_zero_rtx_p (x))
9382 {
9383 /* This will be a load from memory. */
9384 if (mode == DFmode)
9385 *cost += extra_cost->ldst.loadd;
9386 else
9387 *cost += extra_cost->ldst.loadf;
9388 }
9389 else
9390 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9391 or MOV v0.s[0], wzr - neither of which are modeled by the
9392 cost tables. Just use the default cost. */
9393 {
9394 }
9395 }
9396
9397 return true;
9398
9399 case MEM:
9400 if (speed)
9401 {
9402 /* For loads we want the base cost of a load, plus an
9403 approximation for the additional cost of the addressing
9404 mode. */
9405 rtx address = XEXP (x, 0);
9406 if (VECTOR_MODE_P (mode))
9407 *cost += extra_cost->ldst.loadv;
9408 else if (GET_MODE_CLASS (mode) == MODE_INT)
9409 *cost += extra_cost->ldst.load;
9410 else if (mode == SFmode)
9411 *cost += extra_cost->ldst.loadf;
9412 else if (mode == DFmode)
9413 *cost += extra_cost->ldst.loadd;
9414
9415 *cost +=
9416 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9417 0, speed));
9418 }
9419
9420 return true;
9421
9422 case NEG:
9423 op0 = XEXP (x, 0);
9424
9425 if (VECTOR_MODE_P (mode))
9426 {
9427 if (speed)
9428 {
9429 /* FNEG. */
9430 *cost += extra_cost->vect.alu;
9431 }
9432 return false;
9433 }
9434
9435 if (GET_MODE_CLASS (mode) == MODE_INT)
9436 {
9437 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9438 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9439 {
9440 /* CSETM. */
9441 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9442 return true;
9443 }
9444
9445 /* Cost this as SUB wzr, X. */
9446 op0 = CONST0_RTX (mode);
9447 op1 = XEXP (x, 0);
9448 goto cost_minus;
9449 }
9450
9451 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9452 {
9453 /* Support (neg(fma...)) as a single instruction only if
9454 sign of zeros is unimportant. This matches the decision
9455 making in aarch64.md. */
9456 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9457 {
9458 /* FNMADD. */
9459 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9460 return true;
9461 }
9462 if (GET_CODE (op0) == MULT)
9463 {
9464 /* FNMUL. */
9465 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9466 return true;
9467 }
9468 if (speed)
9469 /* FNEG. */
9470 *cost += extra_cost->fp[mode == DFmode].neg;
9471 return false;
9472 }
9473
9474 return false;
9475
9476 case CLRSB:
9477 case CLZ:
9478 if (speed)
9479 {
9480 if (VECTOR_MODE_P (mode))
9481 *cost += extra_cost->vect.alu;
9482 else
9483 *cost += extra_cost->alu.clz;
9484 }
9485
9486 return false;
9487
9488 case COMPARE:
9489 op0 = XEXP (x, 0);
9490 op1 = XEXP (x, 1);
9491
9492 if (op1 == const0_rtx
9493 && GET_CODE (op0) == AND)
9494 {
9495 x = op0;
9496 mode = GET_MODE (op0);
9497 goto cost_logic;
9498 }
9499
9500 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9501 {
9502 /* TODO: A write to the CC flags possibly costs extra, this
9503 needs encoding in the cost tables. */
9504
9505 mode = GET_MODE (op0);
9506 /* ANDS. */
9507 if (GET_CODE (op0) == AND)
9508 {
9509 x = op0;
9510 goto cost_logic;
9511 }
9512
9513 if (GET_CODE (op0) == PLUS)
9514 {
9515 /* ADDS (and CMN alias). */
9516 x = op0;
9517 goto cost_plus;
9518 }
9519
9520 if (GET_CODE (op0) == MINUS)
9521 {
9522 /* SUBS. */
9523 x = op0;
9524 goto cost_minus;
9525 }
9526
9527 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9528 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9529 && CONST_INT_P (XEXP (op0, 2)))
9530 {
9531 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9532 Handle it here directly rather than going to cost_logic
9533 since we know the immediate generated for the TST is valid
9534 so we can avoid creating an intermediate rtx for it only
9535 for costing purposes. */
9536 if (speed)
9537 *cost += extra_cost->alu.logical;
9538
9539 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9540 ZERO_EXTRACT, 0, speed);
9541 return true;
9542 }
9543
9544 if (GET_CODE (op1) == NEG)
9545 {
9546 /* CMN. */
9547 if (speed)
9548 *cost += extra_cost->alu.arith;
9549
9550 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9551 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9552 return true;
9553 }
9554
9555 /* CMP.
9556
9557 Compare can freely swap the order of operands, and
9558 canonicalization puts the more complex operation first.
9559 But the integer MINUS logic expects the shift/extend
9560 operation in op1. */
9561 if (! (REG_P (op0)
9562 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9563 {
9564 op0 = XEXP (x, 1);
9565 op1 = XEXP (x, 0);
9566 }
9567 goto cost_minus;
9568 }
9569
9570 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9571 {
9572 /* FCMP. */
9573 if (speed)
9574 *cost += extra_cost->fp[mode == DFmode].compare;
9575
9576 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9577 {
9578 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9579 /* FCMP supports constant 0.0 for no extra cost. */
9580 return true;
9581 }
9582 return false;
9583 }
9584
9585 if (VECTOR_MODE_P (mode))
9586 {
9587 /* Vector compare. */
9588 if (speed)
9589 *cost += extra_cost->vect.alu;
9590
9591 if (aarch64_float_const_zero_rtx_p (op1))
9592 {
9593 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9594 cost. */
9595 return true;
9596 }
9597 return false;
9598 }
9599 return false;
9600
9601 case MINUS:
9602 {
9603 op0 = XEXP (x, 0);
9604 op1 = XEXP (x, 1);
9605
9606 cost_minus:
9607 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9608
9609 /* Detect valid immediates. */
9610 if ((GET_MODE_CLASS (mode) == MODE_INT
9611 || (GET_MODE_CLASS (mode) == MODE_CC
9612 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9613 && CONST_INT_P (op1)
9614 && aarch64_uimm12_shift (INTVAL (op1)))
9615 {
9616 if (speed)
9617 /* SUB(S) (immediate). */
9618 *cost += extra_cost->alu.arith;
9619 return true;
9620 }
9621
9622 /* Look for SUB (extended register). */
9623 if (is_a <scalar_int_mode> (mode, &int_mode)
9624 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9625 {
9626 if (speed)
9627 *cost += extra_cost->alu.extend_arith;
9628
9629 op1 = aarch64_strip_extend (op1, true);
9630 *cost += rtx_cost (op1, VOIDmode,
9631 (enum rtx_code) GET_CODE (op1), 0, speed);
9632 return true;
9633 }
9634
9635 rtx new_op1 = aarch64_strip_extend (op1, false);
9636
9637 /* Cost this as an FMA-alike operation. */
9638 if ((GET_CODE (new_op1) == MULT
9639 || aarch64_shift_p (GET_CODE (new_op1)))
9640 && code != COMPARE)
9641 {
9642 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9643 (enum rtx_code) code,
9644 speed);
9645 return true;
9646 }
9647
9648 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9649
9650 if (speed)
9651 {
9652 if (VECTOR_MODE_P (mode))
9653 {
9654 /* Vector SUB. */
9655 *cost += extra_cost->vect.alu;
9656 }
9657 else if (GET_MODE_CLASS (mode) == MODE_INT)
9658 {
9659 /* SUB(S). */
9660 *cost += extra_cost->alu.arith;
9661 }
9662 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9663 {
9664 /* FSUB. */
9665 *cost += extra_cost->fp[mode == DFmode].addsub;
9666 }
9667 }
9668 return true;
9669 }
9670
9671 case PLUS:
9672 {
9673 rtx new_op0;
9674
9675 op0 = XEXP (x, 0);
9676 op1 = XEXP (x, 1);
9677
9678 cost_plus:
9679 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9680 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9681 {
9682 /* CSINC. */
9683 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9684 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9685 return true;
9686 }
9687
9688 if (GET_MODE_CLASS (mode) == MODE_INT
9689 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9690 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9691 {
9692 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9693
9694 if (speed)
9695 /* ADD (immediate). */
9696 *cost += extra_cost->alu.arith;
9697 return true;
9698 }
9699
9700 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9701
9702 /* Look for ADD (extended register). */
9703 if (is_a <scalar_int_mode> (mode, &int_mode)
9704 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9705 {
9706 if (speed)
9707 *cost += extra_cost->alu.extend_arith;
9708
9709 op0 = aarch64_strip_extend (op0, true);
9710 *cost += rtx_cost (op0, VOIDmode,
9711 (enum rtx_code) GET_CODE (op0), 0, speed);
9712 return true;
9713 }
9714
9715 /* Strip any extend, leave shifts behind as we will
9716 cost them through mult_cost. */
9717 new_op0 = aarch64_strip_extend (op0, false);
9718
9719 if (GET_CODE (new_op0) == MULT
9720 || aarch64_shift_p (GET_CODE (new_op0)))
9721 {
9722 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9723 speed);
9724 return true;
9725 }
9726
9727 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9728
9729 if (speed)
9730 {
9731 if (VECTOR_MODE_P (mode))
9732 {
9733 /* Vector ADD. */
9734 *cost += extra_cost->vect.alu;
9735 }
9736 else if (GET_MODE_CLASS (mode) == MODE_INT)
9737 {
9738 /* ADD. */
9739 *cost += extra_cost->alu.arith;
9740 }
9741 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9742 {
9743 /* FADD. */
9744 *cost += extra_cost->fp[mode == DFmode].addsub;
9745 }
9746 }
9747 return true;
9748 }
9749
9750 case BSWAP:
9751 *cost = COSTS_N_INSNS (1);
9752
9753 if (speed)
9754 {
9755 if (VECTOR_MODE_P (mode))
9756 *cost += extra_cost->vect.alu;
9757 else
9758 *cost += extra_cost->alu.rev;
9759 }
9760 return false;
9761
9762 case IOR:
9763 if (aarch_rev16_p (x))
9764 {
9765 *cost = COSTS_N_INSNS (1);
9766
9767 if (speed)
9768 {
9769 if (VECTOR_MODE_P (mode))
9770 *cost += extra_cost->vect.alu;
9771 else
9772 *cost += extra_cost->alu.rev;
9773 }
9774 return true;
9775 }
9776
9777 if (aarch64_extr_rtx_p (x, &op0, &op1))
9778 {
9779 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9780 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9781 if (speed)
9782 *cost += extra_cost->alu.shift;
9783
9784 return true;
9785 }
9786 /* Fall through. */
9787 case XOR:
9788 case AND:
9789 cost_logic:
9790 op0 = XEXP (x, 0);
9791 op1 = XEXP (x, 1);
9792
9793 if (VECTOR_MODE_P (mode))
9794 {
9795 if (speed)
9796 *cost += extra_cost->vect.alu;
9797 return true;
9798 }
9799
9800 if (code == AND
9801 && GET_CODE (op0) == MULT
9802 && CONST_INT_P (XEXP (op0, 1))
9803 && CONST_INT_P (op1)
9804 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9805 INTVAL (op1)) != 0)
9806 {
9807 /* This is a UBFM/SBFM. */
9808 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9809 if (speed)
9810 *cost += extra_cost->alu.bfx;
9811 return true;
9812 }
9813
9814 if (is_int_mode (mode, &int_mode))
9815 {
9816 if (CONST_INT_P (op1))
9817 {
9818 /* We have a mask + shift version of a UBFIZ
9819 i.e. the *andim_ashift<mode>_bfiz pattern. */
9820 if (GET_CODE (op0) == ASHIFT
9821 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9822 XEXP (op0, 1)))
9823 {
9824 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9825 (enum rtx_code) code, 0, speed);
9826 if (speed)
9827 *cost += extra_cost->alu.bfx;
9828
9829 return true;
9830 }
9831 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9832 {
9833 /* We possibly get the immediate for free, this is not
9834 modelled. */
9835 *cost += rtx_cost (op0, int_mode,
9836 (enum rtx_code) code, 0, speed);
9837 if (speed)
9838 *cost += extra_cost->alu.logical;
9839
9840 return true;
9841 }
9842 }
9843 else
9844 {
9845 rtx new_op0 = op0;
9846
9847 /* Handle ORN, EON, or BIC. */
9848 if (GET_CODE (op0) == NOT)
9849 op0 = XEXP (op0, 0);
9850
9851 new_op0 = aarch64_strip_shift (op0);
9852
9853 /* If we had a shift on op0 then this is a logical-shift-
9854 by-register/immediate operation. Otherwise, this is just
9855 a logical operation. */
9856 if (speed)
9857 {
9858 if (new_op0 != op0)
9859 {
9860 /* Shift by immediate. */
9861 if (CONST_INT_P (XEXP (op0, 1)))
9862 *cost += extra_cost->alu.log_shift;
9863 else
9864 *cost += extra_cost->alu.log_shift_reg;
9865 }
9866 else
9867 *cost += extra_cost->alu.logical;
9868 }
9869
9870 /* In both cases we want to cost both operands. */
9871 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9872 0, speed);
9873 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9874 1, speed);
9875
9876 return true;
9877 }
9878 }
9879 return false;
9880
9881 case NOT:
9882 x = XEXP (x, 0);
9883 op0 = aarch64_strip_shift (x);
9884
9885 if (VECTOR_MODE_P (mode))
9886 {
9887 /* Vector NOT. */
9888 *cost += extra_cost->vect.alu;
9889 return false;
9890 }
9891
9892 /* MVN-shifted-reg. */
9893 if (op0 != x)
9894 {
9895 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9896
9897 if (speed)
9898 *cost += extra_cost->alu.log_shift;
9899
9900 return true;
9901 }
9902 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9903 Handle the second form here taking care that 'a' in the above can
9904 be a shift. */
9905 else if (GET_CODE (op0) == XOR)
9906 {
9907 rtx newop0 = XEXP (op0, 0);
9908 rtx newop1 = XEXP (op0, 1);
9909 rtx op0_stripped = aarch64_strip_shift (newop0);
9910
9911 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9912 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9913
9914 if (speed)
9915 {
9916 if (op0_stripped != newop0)
9917 *cost += extra_cost->alu.log_shift;
9918 else
9919 *cost += extra_cost->alu.logical;
9920 }
9921
9922 return true;
9923 }
9924 /* MVN. */
9925 if (speed)
9926 *cost += extra_cost->alu.logical;
9927
9928 return false;
9929
9930 case ZERO_EXTEND:
9931
9932 op0 = XEXP (x, 0);
9933 /* If a value is written in SI mode, then zero extended to DI
9934 mode, the operation will in general be free as a write to
9935 a 'w' register implicitly zeroes the upper bits of an 'x'
9936 register. However, if this is
9937
9938 (set (reg) (zero_extend (reg)))
9939
9940 we must cost the explicit register move. */
9941 if (mode == DImode
9942 && GET_MODE (op0) == SImode
9943 && outer == SET)
9944 {
9945 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9946
9947 /* If OP_COST is non-zero, then the cost of the zero extend
9948 is effectively the cost of the inner operation. Otherwise
9949 we have a MOV instruction and we take the cost from the MOV
9950 itself. This is true independently of whether we are
9951 optimizing for space or time. */
9952 if (op_cost)
9953 *cost = op_cost;
9954
9955 return true;
9956 }
9957 else if (MEM_P (op0))
9958 {
9959 /* All loads can zero extend to any size for free. */
9960 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9961 return true;
9962 }
9963
9964 op0 = aarch64_extend_bitfield_pattern_p (x);
9965 if (op0)
9966 {
9967 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9968 if (speed)
9969 *cost += extra_cost->alu.bfx;
9970 return true;
9971 }
9972
9973 if (speed)
9974 {
9975 if (VECTOR_MODE_P (mode))
9976 {
9977 /* UMOV. */
9978 *cost += extra_cost->vect.alu;
9979 }
9980 else
9981 {
9982 /* We generate an AND instead of UXTB/UXTH. */
9983 *cost += extra_cost->alu.logical;
9984 }
9985 }
9986 return false;
9987
9988 case SIGN_EXTEND:
9989 if (MEM_P (XEXP (x, 0)))
9990 {
9991 /* LDRSH. */
9992 if (speed)
9993 {
9994 rtx address = XEXP (XEXP (x, 0), 0);
9995 *cost += extra_cost->ldst.load_sign_extend;
9996
9997 *cost +=
9998 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9999 0, speed));
10000 }
10001 return true;
10002 }
10003
10004 op0 = aarch64_extend_bitfield_pattern_p (x);
10005 if (op0)
10006 {
10007 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10008 if (speed)
10009 *cost += extra_cost->alu.bfx;
10010 return true;
10011 }
10012
10013 if (speed)
10014 {
10015 if (VECTOR_MODE_P (mode))
10016 *cost += extra_cost->vect.alu;
10017 else
10018 *cost += extra_cost->alu.extend;
10019 }
10020 return false;
10021
10022 case ASHIFT:
10023 op0 = XEXP (x, 0);
10024 op1 = XEXP (x, 1);
10025
10026 if (CONST_INT_P (op1))
10027 {
10028 if (speed)
10029 {
10030 if (VECTOR_MODE_P (mode))
10031 {
10032 /* Vector shift (immediate). */
10033 *cost += extra_cost->vect.alu;
10034 }
10035 else
10036 {
10037 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
10038 aliases. */
10039 *cost += extra_cost->alu.shift;
10040 }
10041 }
10042
10043 /* We can incorporate zero/sign extend for free. */
10044 if (GET_CODE (op0) == ZERO_EXTEND
10045 || GET_CODE (op0) == SIGN_EXTEND)
10046 op0 = XEXP (op0, 0);
10047
10048 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10049 return true;
10050 }
10051 else
10052 {
10053 if (VECTOR_MODE_P (mode))
10054 {
10055 if (speed)
10056 /* Vector shift (register). */
10057 *cost += extra_cost->vect.alu;
10058 }
10059 else
10060 {
10061 if (speed)
10062 /* LSLV. */
10063 *cost += extra_cost->alu.shift_reg;
10064
10065 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10066 && CONST_INT_P (XEXP (op1, 1))
10067 && known_eq (INTVAL (XEXP (op1, 1)),
10068 GET_MODE_BITSIZE (mode) - 1))
10069 {
10070 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10071 /* We already demanded XEXP (op1, 0) to be REG_P, so
10072 don't recurse into it. */
10073 return true;
10074 }
10075 }
10076 return false; /* All arguments need to be in registers. */
10077 }
10078
10079 case ROTATE:
10080 case ROTATERT:
10081 case LSHIFTRT:
10082 case ASHIFTRT:
10083 op0 = XEXP (x, 0);
10084 op1 = XEXP (x, 1);
10085
10086 if (CONST_INT_P (op1))
10087 {
10088 /* ASR (immediate) and friends. */
10089 if (speed)
10090 {
10091 if (VECTOR_MODE_P (mode))
10092 *cost += extra_cost->vect.alu;
10093 else
10094 *cost += extra_cost->alu.shift;
10095 }
10096
10097 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10098 return true;
10099 }
10100 else
10101 {
10102 if (VECTOR_MODE_P (mode))
10103 {
10104 if (speed)
10105 /* Vector shift (register). */
10106 *cost += extra_cost->vect.alu;
10107 }
10108 else
10109 {
10110 if (speed)
10111 /* ASR (register) and friends. */
10112 *cost += extra_cost->alu.shift_reg;
10113
10114 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10115 && CONST_INT_P (XEXP (op1, 1))
10116 && known_eq (INTVAL (XEXP (op1, 1)),
10117 GET_MODE_BITSIZE (mode) - 1))
10118 {
10119 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10120 /* We already demanded XEXP (op1, 0) to be REG_P, so
10121 don't recurse into it. */
10122 return true;
10123 }
10124 }
10125 return false; /* All arguments need to be in registers. */
10126 }
10127
10128 case SYMBOL_REF:
10129
10130 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10131 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10132 {
10133 /* LDR. */
10134 if (speed)
10135 *cost += extra_cost->ldst.load;
10136 }
10137 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10138 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10139 {
10140 /* ADRP, followed by ADD. */
10141 *cost += COSTS_N_INSNS (1);
10142 if (speed)
10143 *cost += 2 * extra_cost->alu.arith;
10144 }
10145 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10146 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10147 {
10148 /* ADR. */
10149 if (speed)
10150 *cost += extra_cost->alu.arith;
10151 }
10152
10153 if (flag_pic)
10154 {
10155 /* One extra load instruction, after accessing the GOT. */
10156 *cost += COSTS_N_INSNS (1);
10157 if (speed)
10158 *cost += extra_cost->ldst.load;
10159 }
10160 return true;
10161
10162 case HIGH:
10163 case LO_SUM:
10164 /* ADRP/ADD (immediate). */
10165 if (speed)
10166 *cost += extra_cost->alu.arith;
10167 return true;
10168
10169 case ZERO_EXTRACT:
10170 case SIGN_EXTRACT:
10171 /* UBFX/SBFX. */
10172 if (speed)
10173 {
10174 if (VECTOR_MODE_P (mode))
10175 *cost += extra_cost->vect.alu;
10176 else
10177 *cost += extra_cost->alu.bfx;
10178 }
10179
10180 /* We can trust that the immediates used will be correct (there
10181 are no by-register forms), so we need only cost op0. */
10182 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10183 return true;
10184
10185 case MULT:
10186 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10187 /* aarch64_rtx_mult_cost always handles recursion to its
10188 operands. */
10189 return true;
10190
10191 case MOD:
10192 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10193 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
10194 an unconditional negate. This case should only ever be reached through
10195 the set_smod_pow2_cheap check in expmed.c. */
10196 if (CONST_INT_P (XEXP (x, 1))
10197 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10198 && (mode == SImode || mode == DImode))
10199 {
10200 /* We expand to 4 instructions. Reset the baseline. */
10201 *cost = COSTS_N_INSNS (4);
10202
10203 if (speed)
10204 *cost += 2 * extra_cost->alu.logical
10205 + 2 * extra_cost->alu.arith;
10206
10207 return true;
10208 }
10209
10210 /* Fall-through. */
10211 case UMOD:
10212 if (speed)
10213 {
10214 /* Slighly prefer UMOD over SMOD. */
10215 if (VECTOR_MODE_P (mode))
10216 *cost += extra_cost->vect.alu;
10217 else if (GET_MODE_CLASS (mode) == MODE_INT)
10218 *cost += (extra_cost->mult[mode == DImode].add
10219 + extra_cost->mult[mode == DImode].idiv
10220 + (code == MOD ? 1 : 0));
10221 }
10222 return false; /* All arguments need to be in registers. */
10223
10224 case DIV:
10225 case UDIV:
10226 case SQRT:
10227 if (speed)
10228 {
10229 if (VECTOR_MODE_P (mode))
10230 *cost += extra_cost->vect.alu;
10231 else if (GET_MODE_CLASS (mode) == MODE_INT)
10232 /* There is no integer SQRT, so only DIV and UDIV can get
10233 here. */
10234 *cost += (extra_cost->mult[mode == DImode].idiv
10235 /* Slighly prefer UDIV over SDIV. */
10236 + (code == DIV ? 1 : 0));
10237 else
10238 *cost += extra_cost->fp[mode == DFmode].div;
10239 }
10240 return false; /* All arguments need to be in registers. */
10241
10242 case IF_THEN_ELSE:
10243 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10244 XEXP (x, 2), cost, speed);
10245
10246 case EQ:
10247 case NE:
10248 case GT:
10249 case GTU:
10250 case LT:
10251 case LTU:
10252 case GE:
10253 case GEU:
10254 case LE:
10255 case LEU:
10256
10257 return false; /* All arguments must be in registers. */
10258
10259 case FMA:
10260 op0 = XEXP (x, 0);
10261 op1 = XEXP (x, 1);
10262 op2 = XEXP (x, 2);
10263
10264 if (speed)
10265 {
10266 if (VECTOR_MODE_P (mode))
10267 *cost += extra_cost->vect.alu;
10268 else
10269 *cost += extra_cost->fp[mode == DFmode].fma;
10270 }
10271
10272 /* FMSUB, FNMADD, and FNMSUB are free. */
10273 if (GET_CODE (op0) == NEG)
10274 op0 = XEXP (op0, 0);
10275
10276 if (GET_CODE (op2) == NEG)
10277 op2 = XEXP (op2, 0);
10278
10279 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10280 and the by-element operand as operand 0. */
10281 if (GET_CODE (op1) == NEG)
10282 op1 = XEXP (op1, 0);
10283
10284 /* Catch vector-by-element operations. The by-element operand can
10285 either be (vec_duplicate (vec_select (x))) or just
10286 (vec_select (x)), depending on whether we are multiplying by
10287 a vector or a scalar.
10288
10289 Canonicalization is not very good in these cases, FMA4 will put the
10290 by-element operand as operand 0, FNMA4 will have it as operand 1. */
10291 if (GET_CODE (op0) == VEC_DUPLICATE)
10292 op0 = XEXP (op0, 0);
10293 else if (GET_CODE (op1) == VEC_DUPLICATE)
10294 op1 = XEXP (op1, 0);
10295
10296 if (GET_CODE (op0) == VEC_SELECT)
10297 op0 = XEXP (op0, 0);
10298 else if (GET_CODE (op1) == VEC_SELECT)
10299 op1 = XEXP (op1, 0);
10300
10301 /* If the remaining parameters are not registers,
10302 get the cost to put them into registers. */
10303 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10304 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10305 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10306 return true;
10307
10308 case FLOAT:
10309 case UNSIGNED_FLOAT:
10310 if (speed)
10311 *cost += extra_cost->fp[mode == DFmode].fromint;
10312 return false;
10313
10314 case FLOAT_EXTEND:
10315 if (speed)
10316 {
10317 if (VECTOR_MODE_P (mode))
10318 {
10319 /*Vector truncate. */
10320 *cost += extra_cost->vect.alu;
10321 }
10322 else
10323 *cost += extra_cost->fp[mode == DFmode].widen;
10324 }
10325 return false;
10326
10327 case FLOAT_TRUNCATE:
10328 if (speed)
10329 {
10330 if (VECTOR_MODE_P (mode))
10331 {
10332 /*Vector conversion. */
10333 *cost += extra_cost->vect.alu;
10334 }
10335 else
10336 *cost += extra_cost->fp[mode == DFmode].narrow;
10337 }
10338 return false;
10339
10340 case FIX:
10341 case UNSIGNED_FIX:
10342 x = XEXP (x, 0);
10343 /* Strip the rounding part. They will all be implemented
10344 by the fcvt* family of instructions anyway. */
10345 if (GET_CODE (x) == UNSPEC)
10346 {
10347 unsigned int uns_code = XINT (x, 1);
10348
10349 if (uns_code == UNSPEC_FRINTA
10350 || uns_code == UNSPEC_FRINTM
10351 || uns_code == UNSPEC_FRINTN
10352 || uns_code == UNSPEC_FRINTP
10353 || uns_code == UNSPEC_FRINTZ)
10354 x = XVECEXP (x, 0, 0);
10355 }
10356
10357 if (speed)
10358 {
10359 if (VECTOR_MODE_P (mode))
10360 *cost += extra_cost->vect.alu;
10361 else
10362 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10363 }
10364
10365 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10366 fixed-point fcvt. */
10367 if (GET_CODE (x) == MULT
10368 && ((VECTOR_MODE_P (mode)
10369 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10370 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10371 {
10372 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10373 0, speed);
10374 return true;
10375 }
10376
10377 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10378 return true;
10379
10380 case ABS:
10381 if (VECTOR_MODE_P (mode))
10382 {
10383 /* ABS (vector). */
10384 if (speed)
10385 *cost += extra_cost->vect.alu;
10386 }
10387 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10388 {
10389 op0 = XEXP (x, 0);
10390
10391 /* FABD, which is analogous to FADD. */
10392 if (GET_CODE (op0) == MINUS)
10393 {
10394 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10395 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10396 if (speed)
10397 *cost += extra_cost->fp[mode == DFmode].addsub;
10398
10399 return true;
10400 }
10401 /* Simple FABS is analogous to FNEG. */
10402 if (speed)
10403 *cost += extra_cost->fp[mode == DFmode].neg;
10404 }
10405 else
10406 {
10407 /* Integer ABS will either be split to
10408 two arithmetic instructions, or will be an ABS
10409 (scalar), which we don't model. */
10410 *cost = COSTS_N_INSNS (2);
10411 if (speed)
10412 *cost += 2 * extra_cost->alu.arith;
10413 }
10414 return false;
10415
10416 case SMAX:
10417 case SMIN:
10418 if (speed)
10419 {
10420 if (VECTOR_MODE_P (mode))
10421 *cost += extra_cost->vect.alu;
10422 else
10423 {
10424 /* FMAXNM/FMINNM/FMAX/FMIN.
10425 TODO: This may not be accurate for all implementations, but
10426 we do not model this in the cost tables. */
10427 *cost += extra_cost->fp[mode == DFmode].addsub;
10428 }
10429 }
10430 return false;
10431
10432 case UNSPEC:
10433 /* The floating point round to integer frint* instructions. */
10434 if (aarch64_frint_unspec_p (XINT (x, 1)))
10435 {
10436 if (speed)
10437 *cost += extra_cost->fp[mode == DFmode].roundint;
10438
10439 return false;
10440 }
10441
10442 if (XINT (x, 1) == UNSPEC_RBIT)
10443 {
10444 if (speed)
10445 *cost += extra_cost->alu.rev;
10446
10447 return false;
10448 }
10449 break;
10450
10451 case TRUNCATE:
10452
10453 /* Decompose <su>muldi3_highpart. */
10454 if (/* (truncate:DI */
10455 mode == DImode
10456 /* (lshiftrt:TI */
10457 && GET_MODE (XEXP (x, 0)) == TImode
10458 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10459 /* (mult:TI */
10460 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10461 /* (ANY_EXTEND:TI (reg:DI))
10462 (ANY_EXTEND:TI (reg:DI))) */
10463 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10464 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10465 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10466 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10467 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10468 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10469 /* (const_int 64) */
10470 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10471 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10472 {
10473 /* UMULH/SMULH. */
10474 if (speed)
10475 *cost += extra_cost->mult[mode == DImode].extend;
10476 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10477 mode, MULT, 0, speed);
10478 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10479 mode, MULT, 1, speed);
10480 return true;
10481 }
10482
10483 /* Fall through. */
10484 default:
10485 break;
10486 }
10487
10488 if (dump_file
10489 && flag_aarch64_verbose_cost)
10490 fprintf (dump_file,
10491 "\nFailed to cost RTX. Assuming default cost.\n");
10492
10493 return true;
10494 }
10495
10496 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10497 calculated for X. This cost is stored in *COST. Returns true
10498 if the total cost of X was calculated. */
10499 static bool
10500 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10501 int param, int *cost, bool speed)
10502 {
10503 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10504
10505 if (dump_file
10506 && flag_aarch64_verbose_cost)
10507 {
10508 print_rtl_single (dump_file, x);
10509 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10510 speed ? "Hot" : "Cold",
10511 *cost, result ? "final" : "partial");
10512 }
10513
10514 return result;
10515 }
10516
10517 static int
10518 aarch64_register_move_cost (machine_mode mode,
10519 reg_class_t from_i, reg_class_t to_i)
10520 {
10521 enum reg_class from = (enum reg_class) from_i;
10522 enum reg_class to = (enum reg_class) to_i;
10523 const struct cpu_regmove_cost *regmove_cost
10524 = aarch64_tune_params.regmove_cost;
10525
10526 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10527 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10528 to = GENERAL_REGS;
10529
10530 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10531 from = GENERAL_REGS;
10532
10533 /* Moving between GPR and stack cost is the same as GP2GP. */
10534 if ((from == GENERAL_REGS && to == STACK_REG)
10535 || (to == GENERAL_REGS && from == STACK_REG))
10536 return regmove_cost->GP2GP;
10537
10538 /* To/From the stack register, we move via the gprs. */
10539 if (to == STACK_REG || from == STACK_REG)
10540 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10541 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10542
10543 if (known_eq (GET_MODE_SIZE (mode), 16))
10544 {
10545 /* 128-bit operations on general registers require 2 instructions. */
10546 if (from == GENERAL_REGS && to == GENERAL_REGS)
10547 return regmove_cost->GP2GP * 2;
10548 else if (from == GENERAL_REGS)
10549 return regmove_cost->GP2FP * 2;
10550 else if (to == GENERAL_REGS)
10551 return regmove_cost->FP2GP * 2;
10552
10553 /* When AdvSIMD instructions are disabled it is not possible to move
10554 a 128-bit value directly between Q registers. This is handled in
10555 secondary reload. A general register is used as a scratch to move
10556 the upper DI value and the lower DI value is moved directly,
10557 hence the cost is the sum of three moves. */
10558 if (! TARGET_SIMD)
10559 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10560
10561 return regmove_cost->FP2FP;
10562 }
10563
10564 if (from == GENERAL_REGS && to == GENERAL_REGS)
10565 return regmove_cost->GP2GP;
10566 else if (from == GENERAL_REGS)
10567 return regmove_cost->GP2FP;
10568 else if (to == GENERAL_REGS)
10569 return regmove_cost->FP2GP;
10570
10571 return regmove_cost->FP2FP;
10572 }
10573
10574 static int
10575 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10576 reg_class_t rclass ATTRIBUTE_UNUSED,
10577 bool in ATTRIBUTE_UNUSED)
10578 {
10579 return aarch64_tune_params.memmov_cost;
10580 }
10581
10582 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10583 to optimize 1.0/sqrt. */
10584
10585 static bool
10586 use_rsqrt_p (machine_mode mode)
10587 {
10588 return (!flag_trapping_math
10589 && flag_unsafe_math_optimizations
10590 && ((aarch64_tune_params.approx_modes->recip_sqrt
10591 & AARCH64_APPROX_MODE (mode))
10592 || flag_mrecip_low_precision_sqrt));
10593 }
10594
10595 /* Function to decide when to use the approximate reciprocal square root
10596 builtin. */
10597
10598 static tree
10599 aarch64_builtin_reciprocal (tree fndecl)
10600 {
10601 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10602
10603 if (!use_rsqrt_p (mode))
10604 return NULL_TREE;
10605 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10606 }
10607
10608 /* Emit instruction sequence to compute either the approximate square root
10609 or its approximate reciprocal, depending on the flag RECP, and return
10610 whether the sequence was emitted or not. */
10611
10612 bool
10613 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10614 {
10615 machine_mode mode = GET_MODE (dst);
10616
10617 if (GET_MODE_INNER (mode) == HFmode)
10618 {
10619 gcc_assert (!recp);
10620 return false;
10621 }
10622
10623 if (!recp)
10624 {
10625 if (!(flag_mlow_precision_sqrt
10626 || (aarch64_tune_params.approx_modes->sqrt
10627 & AARCH64_APPROX_MODE (mode))))
10628 return false;
10629
10630 if (flag_finite_math_only
10631 || flag_trapping_math
10632 || !flag_unsafe_math_optimizations
10633 || optimize_function_for_size_p (cfun))
10634 return false;
10635 }
10636 else
10637 /* Caller assumes we cannot fail. */
10638 gcc_assert (use_rsqrt_p (mode));
10639
10640 machine_mode mmsk = mode_for_int_vector (mode).require ();
10641 rtx xmsk = gen_reg_rtx (mmsk);
10642 if (!recp)
10643 /* When calculating the approximate square root, compare the
10644 argument with 0.0 and create a mask. */
10645 emit_insn (gen_rtx_SET (xmsk,
10646 gen_rtx_NEG (mmsk,
10647 gen_rtx_EQ (mmsk, src,
10648 CONST0_RTX (mode)))));
10649
10650 /* Estimate the approximate reciprocal square root. */
10651 rtx xdst = gen_reg_rtx (mode);
10652 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10653
10654 /* Iterate over the series twice for SF and thrice for DF. */
10655 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10656
10657 /* Optionally iterate over the series once less for faster performance
10658 while sacrificing the accuracy. */
10659 if ((recp && flag_mrecip_low_precision_sqrt)
10660 || (!recp && flag_mlow_precision_sqrt))
10661 iterations--;
10662
10663 /* Iterate over the series to calculate the approximate reciprocal square
10664 root. */
10665 rtx x1 = gen_reg_rtx (mode);
10666 while (iterations--)
10667 {
10668 rtx x2 = gen_reg_rtx (mode);
10669 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10670
10671 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10672
10673 if (iterations > 0)
10674 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10675 }
10676
10677 if (!recp)
10678 {
10679 /* Qualify the approximate reciprocal square root when the argument is
10680 0.0 by squashing the intermediary result to 0.0. */
10681 rtx xtmp = gen_reg_rtx (mmsk);
10682 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10683 gen_rtx_SUBREG (mmsk, xdst, 0)));
10684 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10685
10686 /* Calculate the approximate square root. */
10687 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10688 }
10689
10690 /* Finalize the approximation. */
10691 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10692
10693 return true;
10694 }
10695
10696 /* Emit the instruction sequence to compute the approximation for the division
10697 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10698
10699 bool
10700 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10701 {
10702 machine_mode mode = GET_MODE (quo);
10703
10704 if (GET_MODE_INNER (mode) == HFmode)
10705 return false;
10706
10707 bool use_approx_division_p = (flag_mlow_precision_div
10708 || (aarch64_tune_params.approx_modes->division
10709 & AARCH64_APPROX_MODE (mode)));
10710
10711 if (!flag_finite_math_only
10712 || flag_trapping_math
10713 || !flag_unsafe_math_optimizations
10714 || optimize_function_for_size_p (cfun)
10715 || !use_approx_division_p)
10716 return false;
10717
10718 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10719 return false;
10720
10721 /* Estimate the approximate reciprocal. */
10722 rtx xrcp = gen_reg_rtx (mode);
10723 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10724
10725 /* Iterate over the series twice for SF and thrice for DF. */
10726 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10727
10728 /* Optionally iterate over the series once less for faster performance,
10729 while sacrificing the accuracy. */
10730 if (flag_mlow_precision_div)
10731 iterations--;
10732
10733 /* Iterate over the series to calculate the approximate reciprocal. */
10734 rtx xtmp = gen_reg_rtx (mode);
10735 while (iterations--)
10736 {
10737 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10738
10739 if (iterations > 0)
10740 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10741 }
10742
10743 if (num != CONST1_RTX (mode))
10744 {
10745 /* As the approximate reciprocal of DEN is already calculated, only
10746 calculate the approximate division when NUM is not 1.0. */
10747 rtx xnum = force_reg (mode, num);
10748 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10749 }
10750
10751 /* Finalize the approximation. */
10752 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10753 return true;
10754 }
10755
10756 /* Return the number of instructions that can be issued per cycle. */
10757 static int
10758 aarch64_sched_issue_rate (void)
10759 {
10760 return aarch64_tune_params.issue_rate;
10761 }
10762
10763 static int
10764 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10765 {
10766 int issue_rate = aarch64_sched_issue_rate ();
10767
10768 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10769 }
10770
10771
10772 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10773 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10774 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10775
10776 static int
10777 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10778 int ready_index)
10779 {
10780 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10781 }
10782
10783
10784 /* Vectorizer cost model target hooks. */
10785
10786 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10787 static int
10788 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10789 tree vectype,
10790 int misalign ATTRIBUTE_UNUSED)
10791 {
10792 unsigned elements;
10793 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10794 bool fp = false;
10795
10796 if (vectype != NULL)
10797 fp = FLOAT_TYPE_P (vectype);
10798
10799 switch (type_of_cost)
10800 {
10801 case scalar_stmt:
10802 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10803
10804 case scalar_load:
10805 return costs->scalar_load_cost;
10806
10807 case scalar_store:
10808 return costs->scalar_store_cost;
10809
10810 case vector_stmt:
10811 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10812
10813 case vector_load:
10814 return costs->vec_align_load_cost;
10815
10816 case vector_store:
10817 return costs->vec_store_cost;
10818
10819 case vec_to_scalar:
10820 return costs->vec_to_scalar_cost;
10821
10822 case scalar_to_vec:
10823 return costs->scalar_to_vec_cost;
10824
10825 case unaligned_load:
10826 case vector_gather_load:
10827 return costs->vec_unalign_load_cost;
10828
10829 case unaligned_store:
10830 case vector_scatter_store:
10831 return costs->vec_unalign_store_cost;
10832
10833 case cond_branch_taken:
10834 return costs->cond_taken_branch_cost;
10835
10836 case cond_branch_not_taken:
10837 return costs->cond_not_taken_branch_cost;
10838
10839 case vec_perm:
10840 return costs->vec_permute_cost;
10841
10842 case vec_promote_demote:
10843 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10844
10845 case vec_construct:
10846 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10847 return elements / 2 + 1;
10848
10849 default:
10850 gcc_unreachable ();
10851 }
10852 }
10853
10854 /* Implement targetm.vectorize.add_stmt_cost. */
10855 static unsigned
10856 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10857 struct _stmt_vec_info *stmt_info, int misalign,
10858 enum vect_cost_model_location where)
10859 {
10860 unsigned *cost = (unsigned *) data;
10861 unsigned retval = 0;
10862
10863 if (flag_vect_cost_model)
10864 {
10865 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10866 int stmt_cost =
10867 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10868
10869 /* Statements in an inner loop relative to the loop being
10870 vectorized are weighted more heavily. The value here is
10871 arbitrary and could potentially be improved with analysis. */
10872 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10873 count *= 50; /* FIXME */
10874
10875 retval = (unsigned) (count * stmt_cost);
10876 cost[where] += retval;
10877 }
10878
10879 return retval;
10880 }
10881
10882 static void initialize_aarch64_code_model (struct gcc_options *);
10883
10884 /* Parse the TO_PARSE string and put the architecture struct that it
10885 selects into RES and the architectural features into ISA_FLAGS.
10886 Return an aarch64_parse_opt_result describing the parse result.
10887 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
10888 When the TO_PARSE string contains an invalid extension,
10889 a copy of the string is created and stored to INVALID_EXTENSION. */
10890
10891 static enum aarch64_parse_opt_result
10892 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10893 unsigned long *isa_flags, std::string *invalid_extension)
10894 {
10895 const char *ext;
10896 const struct processor *arch;
10897 size_t len;
10898
10899 ext = strchr (to_parse, '+');
10900
10901 if (ext != NULL)
10902 len = ext - to_parse;
10903 else
10904 len = strlen (to_parse);
10905
10906 if (len == 0)
10907 return AARCH64_PARSE_MISSING_ARG;
10908
10909
10910 /* Loop through the list of supported ARCHes to find a match. */
10911 for (arch = all_architectures; arch->name != NULL; arch++)
10912 {
10913 if (strlen (arch->name) == len
10914 && strncmp (arch->name, to_parse, len) == 0)
10915 {
10916 unsigned long isa_temp = arch->flags;
10917
10918 if (ext != NULL)
10919 {
10920 /* TO_PARSE string contains at least one extension. */
10921 enum aarch64_parse_opt_result ext_res
10922 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
10923
10924 if (ext_res != AARCH64_PARSE_OK)
10925 return ext_res;
10926 }
10927 /* Extension parsing was successful. Confirm the result
10928 arch and ISA flags. */
10929 *res = arch;
10930 *isa_flags = isa_temp;
10931 return AARCH64_PARSE_OK;
10932 }
10933 }
10934
10935 /* ARCH name not found in list. */
10936 return AARCH64_PARSE_INVALID_ARG;
10937 }
10938
10939 /* Parse the TO_PARSE string and put the result tuning in RES and the
10940 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10941 describing the parse result. If there is an error parsing, RES and
10942 ISA_FLAGS are left unchanged.
10943 When the TO_PARSE string contains an invalid extension,
10944 a copy of the string is created and stored to INVALID_EXTENSION. */
10945
10946 static enum aarch64_parse_opt_result
10947 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10948 unsigned long *isa_flags, std::string *invalid_extension)
10949 {
10950 const char *ext;
10951 const struct processor *cpu;
10952 size_t len;
10953
10954 ext = strchr (to_parse, '+');
10955
10956 if (ext != NULL)
10957 len = ext - to_parse;
10958 else
10959 len = strlen (to_parse);
10960
10961 if (len == 0)
10962 return AARCH64_PARSE_MISSING_ARG;
10963
10964
10965 /* Loop through the list of supported CPUs to find a match. */
10966 for (cpu = all_cores; cpu->name != NULL; cpu++)
10967 {
10968 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
10969 {
10970 unsigned long isa_temp = cpu->flags;
10971
10972
10973 if (ext != NULL)
10974 {
10975 /* TO_PARSE string contains at least one extension. */
10976 enum aarch64_parse_opt_result ext_res
10977 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
10978
10979 if (ext_res != AARCH64_PARSE_OK)
10980 return ext_res;
10981 }
10982 /* Extension parsing was successfull. Confirm the result
10983 cpu and ISA flags. */
10984 *res = cpu;
10985 *isa_flags = isa_temp;
10986 return AARCH64_PARSE_OK;
10987 }
10988 }
10989
10990 /* CPU name not found in list. */
10991 return AARCH64_PARSE_INVALID_ARG;
10992 }
10993
10994 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10995 Return an aarch64_parse_opt_result describing the parse result.
10996 If the parsing fails the RES does not change. */
10997
10998 static enum aarch64_parse_opt_result
10999 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11000 {
11001 const struct processor *cpu;
11002
11003 /* Loop through the list of supported CPUs to find a match. */
11004 for (cpu = all_cores; cpu->name != NULL; cpu++)
11005 {
11006 if (strcmp (cpu->name, to_parse) == 0)
11007 {
11008 *res = cpu;
11009 return AARCH64_PARSE_OK;
11010 }
11011 }
11012
11013 /* CPU name not found in list. */
11014 return AARCH64_PARSE_INVALID_ARG;
11015 }
11016
11017 /* Parse TOKEN, which has length LENGTH to see if it is an option
11018 described in FLAG. If it is, return the index bit for that fusion type.
11019 If not, error (printing OPTION_NAME) and return zero. */
11020
11021 static unsigned int
11022 aarch64_parse_one_option_token (const char *token,
11023 size_t length,
11024 const struct aarch64_flag_desc *flag,
11025 const char *option_name)
11026 {
11027 for (; flag->name != NULL; flag++)
11028 {
11029 if (length == strlen (flag->name)
11030 && !strncmp (flag->name, token, length))
11031 return flag->flag;
11032 }
11033
11034 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
11035 return 0;
11036 }
11037
11038 /* Parse OPTION which is a comma-separated list of flags to enable.
11039 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11040 default state we inherit from the CPU tuning structures. OPTION_NAME
11041 gives the top-level option we are parsing in the -moverride string,
11042 for use in error messages. */
11043
11044 static unsigned int
11045 aarch64_parse_boolean_options (const char *option,
11046 const struct aarch64_flag_desc *flags,
11047 unsigned int initial_state,
11048 const char *option_name)
11049 {
11050 const char separator = '.';
11051 const char* specs = option;
11052 const char* ntoken = option;
11053 unsigned int found_flags = initial_state;
11054
11055 while ((ntoken = strchr (specs, separator)))
11056 {
11057 size_t token_length = ntoken - specs;
11058 unsigned token_ops = aarch64_parse_one_option_token (specs,
11059 token_length,
11060 flags,
11061 option_name);
11062 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11063 in the token stream, reset the supported operations. So:
11064
11065 adrp+add.cmp+branch.none.adrp+add
11066
11067 would have the result of turning on only adrp+add fusion. */
11068 if (!token_ops)
11069 found_flags = 0;
11070
11071 found_flags |= token_ops;
11072 specs = ++ntoken;
11073 }
11074
11075 /* We ended with a comma, print something. */
11076 if (!(*specs))
11077 {
11078 error ("%s string ill-formed\n", option_name);
11079 return 0;
11080 }
11081
11082 /* We still have one more token to parse. */
11083 size_t token_length = strlen (specs);
11084 unsigned token_ops = aarch64_parse_one_option_token (specs,
11085 token_length,
11086 flags,
11087 option_name);
11088 if (!token_ops)
11089 found_flags = 0;
11090
11091 found_flags |= token_ops;
11092 return found_flags;
11093 }
11094
11095 /* Support for overriding instruction fusion. */
11096
11097 static void
11098 aarch64_parse_fuse_string (const char *fuse_string,
11099 struct tune_params *tune)
11100 {
11101 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11102 aarch64_fusible_pairs,
11103 tune->fusible_ops,
11104 "fuse=");
11105 }
11106
11107 /* Support for overriding other tuning flags. */
11108
11109 static void
11110 aarch64_parse_tune_string (const char *tune_string,
11111 struct tune_params *tune)
11112 {
11113 tune->extra_tuning_flags
11114 = aarch64_parse_boolean_options (tune_string,
11115 aarch64_tuning_flags,
11116 tune->extra_tuning_flags,
11117 "tune=");
11118 }
11119
11120 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11121 Accept the valid SVE vector widths allowed by
11122 aarch64_sve_vector_bits_enum and use it to override sve_width
11123 in TUNE. */
11124
11125 static void
11126 aarch64_parse_sve_width_string (const char *tune_string,
11127 struct tune_params *tune)
11128 {
11129 int width = -1;
11130
11131 int n = sscanf (tune_string, "%d", &width);
11132 if (n == EOF)
11133 {
11134 error ("invalid format for sve_width");
11135 return;
11136 }
11137 switch (width)
11138 {
11139 case SVE_128:
11140 case SVE_256:
11141 case SVE_512:
11142 case SVE_1024:
11143 case SVE_2048:
11144 break;
11145 default:
11146 error ("invalid sve_width value: %d", width);
11147 }
11148 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11149 }
11150
11151 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
11152 we understand. If it is, extract the option string and handoff to
11153 the appropriate function. */
11154
11155 void
11156 aarch64_parse_one_override_token (const char* token,
11157 size_t length,
11158 struct tune_params *tune)
11159 {
11160 const struct aarch64_tuning_override_function *fn
11161 = aarch64_tuning_override_functions;
11162
11163 const char *option_part = strchr (token, '=');
11164 if (!option_part)
11165 {
11166 error ("tuning string missing in option (%s)", token);
11167 return;
11168 }
11169
11170 /* Get the length of the option name. */
11171 length = option_part - token;
11172 /* Skip the '=' to get to the option string. */
11173 option_part++;
11174
11175 for (; fn->name != NULL; fn++)
11176 {
11177 if (!strncmp (fn->name, token, length))
11178 {
11179 fn->parse_override (option_part, tune);
11180 return;
11181 }
11182 }
11183
11184 error ("unknown tuning option (%s)",token);
11185 return;
11186 }
11187
11188 /* A checking mechanism for the implementation of the tls size. */
11189
11190 static void
11191 initialize_aarch64_tls_size (struct gcc_options *opts)
11192 {
11193 if (aarch64_tls_size == 0)
11194 aarch64_tls_size = 24;
11195
11196 switch (opts->x_aarch64_cmodel_var)
11197 {
11198 case AARCH64_CMODEL_TINY:
11199 /* Both the default and maximum TLS size allowed under tiny is 1M which
11200 needs two instructions to address, so we clamp the size to 24. */
11201 if (aarch64_tls_size > 24)
11202 aarch64_tls_size = 24;
11203 break;
11204 case AARCH64_CMODEL_SMALL:
11205 /* The maximum TLS size allowed under small is 4G. */
11206 if (aarch64_tls_size > 32)
11207 aarch64_tls_size = 32;
11208 break;
11209 case AARCH64_CMODEL_LARGE:
11210 /* The maximum TLS size allowed under large is 16E.
11211 FIXME: 16E should be 64bit, we only support 48bit offset now. */
11212 if (aarch64_tls_size > 48)
11213 aarch64_tls_size = 48;
11214 break;
11215 default:
11216 gcc_unreachable ();
11217 }
11218
11219 return;
11220 }
11221
11222 /* Parse STRING looking for options in the format:
11223 string :: option:string
11224 option :: name=substring
11225 name :: {a-z}
11226 substring :: defined by option. */
11227
11228 static void
11229 aarch64_parse_override_string (const char* input_string,
11230 struct tune_params* tune)
11231 {
11232 const char separator = ':';
11233 size_t string_length = strlen (input_string) + 1;
11234 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11235 char *string = string_root;
11236 strncpy (string, input_string, string_length);
11237 string[string_length - 1] = '\0';
11238
11239 char* ntoken = string;
11240
11241 while ((ntoken = strchr (string, separator)))
11242 {
11243 size_t token_length = ntoken - string;
11244 /* Make this substring look like a string. */
11245 *ntoken = '\0';
11246 aarch64_parse_one_override_token (string, token_length, tune);
11247 string = ++ntoken;
11248 }
11249
11250 /* One last option to parse. */
11251 aarch64_parse_one_override_token (string, strlen (string), tune);
11252 free (string_root);
11253 }
11254
11255
11256 static void
11257 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11258 {
11259 if (accepted_branch_protection_string)
11260 {
11261 opts->x_aarch64_branch_protection_string
11262 = xstrdup (accepted_branch_protection_string);
11263 }
11264
11265 /* PR 70044: We have to be careful about being called multiple times for the
11266 same function. This means all changes should be repeatable. */
11267
11268 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11269 Disable the frame pointer flag so the mid-end will not use a frame
11270 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11271 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11272 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11273 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11274 if (opts->x_flag_omit_frame_pointer == 0)
11275 opts->x_flag_omit_frame_pointer = 2;
11276
11277 /* If not optimizing for size, set the default
11278 alignment to what the target wants. */
11279 if (!opts->x_optimize_size)
11280 {
11281 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11282 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11283 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11284 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11285 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11286 opts->x_str_align_functions = aarch64_tune_params.function_align;
11287 }
11288
11289 /* We default to no pc-relative literal loads. */
11290
11291 aarch64_pcrelative_literal_loads = false;
11292
11293 /* If -mpc-relative-literal-loads is set on the command line, this
11294 implies that the user asked for PC relative literal loads. */
11295 if (opts->x_pcrelative_literal_loads == 1)
11296 aarch64_pcrelative_literal_loads = true;
11297
11298 /* In the tiny memory model it makes no sense to disallow PC relative
11299 literal pool loads. */
11300 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11301 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11302 aarch64_pcrelative_literal_loads = true;
11303
11304 /* When enabling the lower precision Newton series for the square root, also
11305 enable it for the reciprocal square root, since the latter is an
11306 intermediary step for the former. */
11307 if (flag_mlow_precision_sqrt)
11308 flag_mrecip_low_precision_sqrt = true;
11309 }
11310
11311 /* 'Unpack' up the internal tuning structs and update the options
11312 in OPTS. The caller must have set up selected_tune and selected_arch
11313 as all the other target-specific codegen decisions are
11314 derived from them. */
11315
11316 void
11317 aarch64_override_options_internal (struct gcc_options *opts)
11318 {
11319 aarch64_tune_flags = selected_tune->flags;
11320 aarch64_tune = selected_tune->sched_core;
11321 /* Make a copy of the tuning parameters attached to the core, which
11322 we may later overwrite. */
11323 aarch64_tune_params = *(selected_tune->tune);
11324 aarch64_architecture_version = selected_arch->architecture_version;
11325
11326 if (opts->x_aarch64_override_tune_string)
11327 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11328 &aarch64_tune_params);
11329
11330 /* This target defaults to strict volatile bitfields. */
11331 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11332 opts->x_flag_strict_volatile_bitfields = 1;
11333
11334 initialize_aarch64_code_model (opts);
11335 initialize_aarch64_tls_size (opts);
11336
11337 int queue_depth = 0;
11338 switch (aarch64_tune_params.autoprefetcher_model)
11339 {
11340 case tune_params::AUTOPREFETCHER_OFF:
11341 queue_depth = -1;
11342 break;
11343 case tune_params::AUTOPREFETCHER_WEAK:
11344 queue_depth = 0;
11345 break;
11346 case tune_params::AUTOPREFETCHER_STRONG:
11347 queue_depth = max_insn_queue_index + 1;
11348 break;
11349 default:
11350 gcc_unreachable ();
11351 }
11352
11353 /* We don't mind passing in global_options_set here as we don't use
11354 the *options_set structs anyway. */
11355 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11356 queue_depth,
11357 opts->x_param_values,
11358 global_options_set.x_param_values);
11359
11360 /* Set up parameters to be used in prefetching algorithm. Do not
11361 override the defaults unless we are tuning for a core we have
11362 researched values for. */
11363 if (aarch64_tune_params.prefetch->num_slots > 0)
11364 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11365 aarch64_tune_params.prefetch->num_slots,
11366 opts->x_param_values,
11367 global_options_set.x_param_values);
11368 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11369 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11370 aarch64_tune_params.prefetch->l1_cache_size,
11371 opts->x_param_values,
11372 global_options_set.x_param_values);
11373 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11374 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11375 aarch64_tune_params.prefetch->l1_cache_line_size,
11376 opts->x_param_values,
11377 global_options_set.x_param_values);
11378 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11379 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11380 aarch64_tune_params.prefetch->l2_cache_size,
11381 opts->x_param_values,
11382 global_options_set.x_param_values);
11383 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11384 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11385 0,
11386 opts->x_param_values,
11387 global_options_set.x_param_values);
11388 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11389 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11390 aarch64_tune_params.prefetch->minimum_stride,
11391 opts->x_param_values,
11392 global_options_set.x_param_values);
11393
11394 /* Use the alternative scheduling-pressure algorithm by default. */
11395 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11396 opts->x_param_values,
11397 global_options_set.x_param_values);
11398
11399 /* If the user hasn't changed it via configure then set the default to 64 KB
11400 for the backend. */
11401 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11402 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11403 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11404 opts->x_param_values,
11405 global_options_set.x_param_values);
11406
11407 /* Validate the guard size. */
11408 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11409
11410 /* Enforce that interval is the same size as size so the mid-end does the
11411 right thing. */
11412 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11413 guard_size,
11414 opts->x_param_values,
11415 global_options_set.x_param_values);
11416
11417 /* The maybe_set calls won't update the value if the user has explicitly set
11418 one. Which means we need to validate that probing interval and guard size
11419 are equal. */
11420 int probe_interval
11421 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11422 if (guard_size != probe_interval)
11423 error ("stack clash guard size '%d' must be equal to probing interval "
11424 "'%d'", guard_size, probe_interval);
11425
11426 /* Enable sw prefetching at specified optimization level for
11427 CPUS that have prefetch. Lower optimization level threshold by 1
11428 when profiling is enabled. */
11429 if (opts->x_flag_prefetch_loop_arrays < 0
11430 && !opts->x_optimize_size
11431 && aarch64_tune_params.prefetch->default_opt_level >= 0
11432 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11433 opts->x_flag_prefetch_loop_arrays = 1;
11434
11435 if (opts->x_aarch64_arch_string == NULL)
11436 opts->x_aarch64_arch_string = selected_arch->name;
11437 if (opts->x_aarch64_cpu_string == NULL)
11438 opts->x_aarch64_cpu_string = selected_cpu->name;
11439 if (opts->x_aarch64_tune_string == NULL)
11440 opts->x_aarch64_tune_string = selected_tune->name;
11441
11442 aarch64_override_options_after_change_1 (opts);
11443 }
11444
11445 /* Print a hint with a suggestion for a core or architecture name that
11446 most closely resembles what the user passed in STR. ARCH is true if
11447 the user is asking for an architecture name. ARCH is false if the user
11448 is asking for a core name. */
11449
11450 static void
11451 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11452 {
11453 auto_vec<const char *> candidates;
11454 const struct processor *entry = arch ? all_architectures : all_cores;
11455 for (; entry->name != NULL; entry++)
11456 candidates.safe_push (entry->name);
11457
11458 #ifdef HAVE_LOCAL_CPU_DETECT
11459 /* Add also "native" as possible value. */
11460 if (arch)
11461 candidates.safe_push ("native");
11462 #endif
11463
11464 char *s;
11465 const char *hint = candidates_list_and_hint (str, s, candidates);
11466 if (hint)
11467 inform (input_location, "valid arguments are: %s;"
11468 " did you mean %qs?", s, hint);
11469 else
11470 inform (input_location, "valid arguments are: %s", s);
11471
11472 XDELETEVEC (s);
11473 }
11474
11475 /* Print a hint with a suggestion for a core name that most closely resembles
11476 what the user passed in STR. */
11477
11478 inline static void
11479 aarch64_print_hint_for_core (const char *str)
11480 {
11481 aarch64_print_hint_for_core_or_arch (str, false);
11482 }
11483
11484 /* Print a hint with a suggestion for an architecture name that most closely
11485 resembles what the user passed in STR. */
11486
11487 inline static void
11488 aarch64_print_hint_for_arch (const char *str)
11489 {
11490 aarch64_print_hint_for_core_or_arch (str, true);
11491 }
11492
11493
11494 /* Print a hint with a suggestion for an extension name
11495 that most closely resembles what the user passed in STR. */
11496
11497 void
11498 aarch64_print_hint_for_extensions (const std::string &str)
11499 {
11500 auto_vec<const char *> candidates;
11501 aarch64_get_all_extension_candidates (&candidates);
11502 char *s;
11503 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11504 if (hint)
11505 inform (input_location, "valid arguments are: %s;"
11506 " did you mean %qs?", s, hint);
11507 else
11508 inform (input_location, "valid arguments are: %s;", s);
11509
11510 XDELETEVEC (s);
11511 }
11512
11513 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11514 specified in STR and throw errors if appropriate. Put the results if
11515 they are valid in RES and ISA_FLAGS. Return whether the option is
11516 valid. */
11517
11518 static bool
11519 aarch64_validate_mcpu (const char *str, const struct processor **res,
11520 unsigned long *isa_flags)
11521 {
11522 std::string invalid_extension;
11523 enum aarch64_parse_opt_result parse_res
11524 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11525
11526 if (parse_res == AARCH64_PARSE_OK)
11527 return true;
11528
11529 switch (parse_res)
11530 {
11531 case AARCH64_PARSE_MISSING_ARG:
11532 error ("missing cpu name in %<-mcpu=%s%>", str);
11533 break;
11534 case AARCH64_PARSE_INVALID_ARG:
11535 error ("unknown value %qs for -mcpu", str);
11536 aarch64_print_hint_for_core (str);
11537 break;
11538 case AARCH64_PARSE_INVALID_FEATURE:
11539 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11540 invalid_extension.c_str (), str);
11541 aarch64_print_hint_for_extensions (invalid_extension);
11542 break;
11543 default:
11544 gcc_unreachable ();
11545 }
11546
11547 return false;
11548 }
11549
11550 /* Parses CONST_STR for branch protection features specified in
11551 aarch64_branch_protect_types, and set any global variables required. Returns
11552 the parsing result and assigns LAST_STR to the last processed token from
11553 CONST_STR so that it can be used for error reporting. */
11554
11555 static enum
11556 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
11557 char** last_str)
11558 {
11559 char *str_root = xstrdup (const_str);
11560 char* token_save = NULL;
11561 char *str = strtok_r (str_root, "+", &token_save);
11562 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11563 if (!str)
11564 res = AARCH64_PARSE_MISSING_ARG;
11565 else
11566 {
11567 char *next_str = strtok_r (NULL, "+", &token_save);
11568 /* Reset the branch protection features to their defaults. */
11569 aarch64_handle_no_branch_protection (NULL, NULL);
11570
11571 while (str && res == AARCH64_PARSE_OK)
11572 {
11573 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11574 bool found = false;
11575 /* Search for this type. */
11576 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11577 {
11578 if (strcmp (str, type->name) == 0)
11579 {
11580 found = true;
11581 res = type->handler (str, next_str);
11582 str = next_str;
11583 next_str = strtok_r (NULL, "+", &token_save);
11584 }
11585 else
11586 type++;
11587 }
11588 if (found && res == AARCH64_PARSE_OK)
11589 {
11590 bool found_subtype = true;
11591 /* Loop through each token until we find one that isn't a
11592 subtype. */
11593 while (found_subtype)
11594 {
11595 found_subtype = false;
11596 const aarch64_branch_protect_type *subtype = type->subtypes;
11597 /* Search for the subtype. */
11598 while (str && subtype && subtype->name && !found_subtype
11599 && res == AARCH64_PARSE_OK)
11600 {
11601 if (strcmp (str, subtype->name) == 0)
11602 {
11603 found_subtype = true;
11604 res = subtype->handler (str, next_str);
11605 str = next_str;
11606 next_str = strtok_r (NULL, "+", &token_save);
11607 }
11608 else
11609 subtype++;
11610 }
11611 }
11612 }
11613 else if (!found)
11614 res = AARCH64_PARSE_INVALID_ARG;
11615 }
11616 }
11617 /* Copy the last processed token into the argument to pass it back.
11618 Used by option and attribute validation to print the offending token. */
11619 if (last_str)
11620 {
11621 if (str) strcpy (*last_str, str);
11622 else *last_str = NULL;
11623 }
11624 if (res == AARCH64_PARSE_OK)
11625 {
11626 /* If needed, alloc the accepted string then copy in const_str.
11627 Used by override_option_after_change_1. */
11628 if (!accepted_branch_protection_string)
11629 accepted_branch_protection_string = (char *) xmalloc (
11630 BRANCH_PROTECT_STR_MAX
11631 + 1);
11632 strncpy (accepted_branch_protection_string, const_str,
11633 BRANCH_PROTECT_STR_MAX + 1);
11634 /* Forcibly null-terminate. */
11635 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11636 }
11637 return res;
11638 }
11639
11640 static bool
11641 aarch64_validate_mbranch_protection (const char *const_str)
11642 {
11643 char *str = (char *) xmalloc (strlen (const_str));
11644 enum aarch64_parse_opt_result res =
11645 aarch64_parse_branch_protection (const_str, &str);
11646 if (res == AARCH64_PARSE_INVALID_ARG)
11647 error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str);
11648 else if (res == AARCH64_PARSE_MISSING_ARG)
11649 error ("missing arg for %<-mbranch-protection=%>");
11650 free (str);
11651 return res == AARCH64_PARSE_OK;
11652 }
11653
11654 /* Validate a command-line -march option. Parse the arch and extensions
11655 (if any) specified in STR and throw errors if appropriate. Put the
11656 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11657 option is valid. */
11658
11659 static bool
11660 aarch64_validate_march (const char *str, const struct processor **res,
11661 unsigned long *isa_flags)
11662 {
11663 std::string invalid_extension;
11664 enum aarch64_parse_opt_result parse_res
11665 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11666
11667 if (parse_res == AARCH64_PARSE_OK)
11668 return true;
11669
11670 switch (parse_res)
11671 {
11672 case AARCH64_PARSE_MISSING_ARG:
11673 error ("missing arch name in %<-march=%s%>", str);
11674 break;
11675 case AARCH64_PARSE_INVALID_ARG:
11676 error ("unknown value %qs for -march", str);
11677 aarch64_print_hint_for_arch (str);
11678 break;
11679 case AARCH64_PARSE_INVALID_FEATURE:
11680 error ("invalid feature modifier %qs in %<-march=%s%>",
11681 invalid_extension.c_str (), str);
11682 aarch64_print_hint_for_extensions (invalid_extension);
11683 break;
11684 default:
11685 gcc_unreachable ();
11686 }
11687
11688 return false;
11689 }
11690
11691 /* Validate a command-line -mtune option. Parse the cpu
11692 specified in STR and throw errors if appropriate. Put the
11693 result, if it is valid, in RES. Return whether the option is
11694 valid. */
11695
11696 static bool
11697 aarch64_validate_mtune (const char *str, const struct processor **res)
11698 {
11699 enum aarch64_parse_opt_result parse_res
11700 = aarch64_parse_tune (str, res);
11701
11702 if (parse_res == AARCH64_PARSE_OK)
11703 return true;
11704
11705 switch (parse_res)
11706 {
11707 case AARCH64_PARSE_MISSING_ARG:
11708 error ("missing cpu name in %<-mtune=%s%>", str);
11709 break;
11710 case AARCH64_PARSE_INVALID_ARG:
11711 error ("unknown value %qs for -mtune", str);
11712 aarch64_print_hint_for_core (str);
11713 break;
11714 default:
11715 gcc_unreachable ();
11716 }
11717 return false;
11718 }
11719
11720 /* Return the CPU corresponding to the enum CPU.
11721 If it doesn't specify a cpu, return the default. */
11722
11723 static const struct processor *
11724 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11725 {
11726 if (cpu != aarch64_none)
11727 return &all_cores[cpu];
11728
11729 /* The & 0x3f is to extract the bottom 6 bits that encode the
11730 default cpu as selected by the --with-cpu GCC configure option
11731 in config.gcc.
11732 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11733 flags mechanism should be reworked to make it more sane. */
11734 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11735 }
11736
11737 /* Return the architecture corresponding to the enum ARCH.
11738 If it doesn't specify a valid architecture, return the default. */
11739
11740 static const struct processor *
11741 aarch64_get_arch (enum aarch64_arch arch)
11742 {
11743 if (arch != aarch64_no_arch)
11744 return &all_architectures[arch];
11745
11746 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11747
11748 return &all_architectures[cpu->arch];
11749 }
11750
11751 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
11752
11753 static poly_uint16
11754 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
11755 {
11756 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11757 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11758 deciding which .md file patterns to use and when deciding whether
11759 something is a legitimate address or constant. */
11760 if (value == SVE_SCALABLE || value == SVE_128)
11761 return poly_uint16 (2, 2);
11762 else
11763 return (int) value / 64;
11764 }
11765
11766 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
11767 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11768 tuning structs. In particular it must set selected_tune and
11769 aarch64_isa_flags that define the available ISA features and tuning
11770 decisions. It must also set selected_arch as this will be used to
11771 output the .arch asm tags for each function. */
11772
11773 static void
11774 aarch64_override_options (void)
11775 {
11776 unsigned long cpu_isa = 0;
11777 unsigned long arch_isa = 0;
11778 aarch64_isa_flags = 0;
11779
11780 bool valid_cpu = true;
11781 bool valid_tune = true;
11782 bool valid_arch = true;
11783
11784 selected_cpu = NULL;
11785 selected_arch = NULL;
11786 selected_tune = NULL;
11787
11788 if (aarch64_branch_protection_string)
11789 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
11790
11791 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
11792 If either of -march or -mtune is given, they override their
11793 respective component of -mcpu. */
11794 if (aarch64_cpu_string)
11795 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
11796 &cpu_isa);
11797
11798 if (aarch64_arch_string)
11799 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
11800 &arch_isa);
11801
11802 if (aarch64_tune_string)
11803 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
11804
11805 #ifdef SUBTARGET_OVERRIDE_OPTIONS
11806 SUBTARGET_OVERRIDE_OPTIONS;
11807 #endif
11808
11809 /* If the user did not specify a processor, choose the default
11810 one for them. This will be the CPU set during configuration using
11811 --with-cpu, otherwise it is "generic". */
11812 if (!selected_cpu)
11813 {
11814 if (selected_arch)
11815 {
11816 selected_cpu = &all_cores[selected_arch->ident];
11817 aarch64_isa_flags = arch_isa;
11818 explicit_arch = selected_arch->arch;
11819 }
11820 else
11821 {
11822 /* Get default configure-time CPU. */
11823 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
11824 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
11825 }
11826
11827 if (selected_tune)
11828 explicit_tune_core = selected_tune->ident;
11829 }
11830 /* If both -mcpu and -march are specified check that they are architecturally
11831 compatible, warn if they're not and prefer the -march ISA flags. */
11832 else if (selected_arch)
11833 {
11834 if (selected_arch->arch != selected_cpu->arch)
11835 {
11836 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
11837 all_architectures[selected_cpu->arch].name,
11838 selected_arch->name);
11839 }
11840 aarch64_isa_flags = arch_isa;
11841 explicit_arch = selected_arch->arch;
11842 explicit_tune_core = selected_tune ? selected_tune->ident
11843 : selected_cpu->ident;
11844 }
11845 else
11846 {
11847 /* -mcpu but no -march. */
11848 aarch64_isa_flags = cpu_isa;
11849 explicit_tune_core = selected_tune ? selected_tune->ident
11850 : selected_cpu->ident;
11851 gcc_assert (selected_cpu);
11852 selected_arch = &all_architectures[selected_cpu->arch];
11853 explicit_arch = selected_arch->arch;
11854 }
11855
11856 /* Set the arch as well as we will need it when outputing
11857 the .arch directive in assembly. */
11858 if (!selected_arch)
11859 {
11860 gcc_assert (selected_cpu);
11861 selected_arch = &all_architectures[selected_cpu->arch];
11862 }
11863
11864 if (!selected_tune)
11865 selected_tune = selected_cpu;
11866
11867 if (aarch64_enable_bti == 2)
11868 {
11869 #ifdef TARGET_ENABLE_BTI
11870 aarch64_enable_bti = 1;
11871 #else
11872 aarch64_enable_bti = 0;
11873 #endif
11874 }
11875
11876 /* Return address signing is currently not supported for ILP32 targets. For
11877 LP64 targets use the configured option in the absence of a command-line
11878 option for -mbranch-protection. */
11879 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
11880 {
11881 #ifdef TARGET_ENABLE_PAC_RET
11882 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
11883 aarch64_ra_sign_key = AARCH64_KEY_A;
11884 #else
11885 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
11886 #endif
11887 }
11888
11889 #ifndef HAVE_AS_MABI_OPTION
11890 /* The compiler may have been configured with 2.23.* binutils, which does
11891 not have support for ILP32. */
11892 if (TARGET_ILP32)
11893 error ("assembler does not support -mabi=ilp32");
11894 #endif
11895
11896 /* Convert -msve-vector-bits to a VG count. */
11897 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
11898
11899 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
11900 sorry ("return address signing is only supported for -mabi=lp64");
11901
11902 /* Make sure we properly set up the explicit options. */
11903 if ((aarch64_cpu_string && valid_cpu)
11904 || (aarch64_tune_string && valid_tune))
11905 gcc_assert (explicit_tune_core != aarch64_none);
11906
11907 if ((aarch64_cpu_string && valid_cpu)
11908 || (aarch64_arch_string && valid_arch))
11909 gcc_assert (explicit_arch != aarch64_no_arch);
11910
11911 /* The pass to insert speculation tracking runs before
11912 shrink-wrapping and the latter does not know how to update the
11913 tracking status. So disable it in this case. */
11914 if (aarch64_track_speculation)
11915 flag_shrink_wrap = 0;
11916
11917 aarch64_override_options_internal (&global_options);
11918
11919 /* Save these options as the default ones in case we push and pop them later
11920 while processing functions with potential target attributes. */
11921 target_option_default_node = target_option_current_node
11922 = build_target_option_node (&global_options);
11923 }
11924
11925 /* Implement targetm.override_options_after_change. */
11926
11927 static void
11928 aarch64_override_options_after_change (void)
11929 {
11930 aarch64_override_options_after_change_1 (&global_options);
11931 }
11932
11933 static struct machine_function *
11934 aarch64_init_machine_status (void)
11935 {
11936 struct machine_function *machine;
11937 machine = ggc_cleared_alloc<machine_function> ();
11938 return machine;
11939 }
11940
11941 void
11942 aarch64_init_expanders (void)
11943 {
11944 init_machine_status = aarch64_init_machine_status;
11945 }
11946
11947 /* A checking mechanism for the implementation of the various code models. */
11948 static void
11949 initialize_aarch64_code_model (struct gcc_options *opts)
11950 {
11951 if (opts->x_flag_pic)
11952 {
11953 switch (opts->x_aarch64_cmodel_var)
11954 {
11955 case AARCH64_CMODEL_TINY:
11956 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11957 break;
11958 case AARCH64_CMODEL_SMALL:
11959 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11960 aarch64_cmodel = (flag_pic == 2
11961 ? AARCH64_CMODEL_SMALL_PIC
11962 : AARCH64_CMODEL_SMALL_SPIC);
11963 #else
11964 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11965 #endif
11966 break;
11967 case AARCH64_CMODEL_LARGE:
11968 sorry ("code model %qs with -f%s", "large",
11969 opts->x_flag_pic > 1 ? "PIC" : "pic");
11970 break;
11971 default:
11972 gcc_unreachable ();
11973 }
11974 }
11975 else
11976 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11977 }
11978
11979 /* Implement TARGET_OPTION_SAVE. */
11980
11981 static void
11982 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11983 {
11984 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11985 ptr->x_aarch64_branch_protection_string
11986 = opts->x_aarch64_branch_protection_string;
11987 }
11988
11989 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11990 using the information saved in PTR. */
11991
11992 static void
11993 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11994 {
11995 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11996 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11997 opts->x_explicit_arch = ptr->x_explicit_arch;
11998 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11999 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12000 opts->x_aarch64_branch_protection_string
12001 = ptr->x_aarch64_branch_protection_string;
12002 if (opts->x_aarch64_branch_protection_string)
12003 {
12004 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12005 NULL);
12006 }
12007
12008 aarch64_override_options_internal (opts);
12009 }
12010
12011 /* Implement TARGET_OPTION_PRINT. */
12012
12013 static void
12014 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12015 {
12016 const struct processor *cpu
12017 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12018 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
12019 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12020 std::string extension
12021 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12022
12023 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12024 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12025 arch->name, extension.c_str ());
12026 }
12027
12028 static GTY(()) tree aarch64_previous_fndecl;
12029
12030 void
12031 aarch64_reset_previous_fndecl (void)
12032 {
12033 aarch64_previous_fndecl = NULL;
12034 }
12035
12036 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12037 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12038 make sure optab availability predicates are recomputed when necessary. */
12039
12040 void
12041 aarch64_save_restore_target_globals (tree new_tree)
12042 {
12043 if (TREE_TARGET_GLOBALS (new_tree))
12044 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12045 else if (new_tree == target_option_default_node)
12046 restore_target_globals (&default_target_globals);
12047 else
12048 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12049 }
12050
12051 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12052 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12053 of the function, if such exists. This function may be called multiple
12054 times on a single function so use aarch64_previous_fndecl to avoid
12055 setting up identical state. */
12056
12057 static void
12058 aarch64_set_current_function (tree fndecl)
12059 {
12060 if (!fndecl || fndecl == aarch64_previous_fndecl)
12061 return;
12062
12063 tree old_tree = (aarch64_previous_fndecl
12064 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12065 : NULL_TREE);
12066
12067 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12068
12069 /* If current function has no attributes but the previous one did,
12070 use the default node. */
12071 if (!new_tree && old_tree)
12072 new_tree = target_option_default_node;
12073
12074 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12075 the default have been handled by aarch64_save_restore_target_globals from
12076 aarch64_pragma_target_parse. */
12077 if (old_tree == new_tree)
12078 return;
12079
12080 aarch64_previous_fndecl = fndecl;
12081
12082 /* First set the target options. */
12083 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12084
12085 aarch64_save_restore_target_globals (new_tree);
12086 }
12087
12088 /* Enum describing the various ways we can handle attributes.
12089 In many cases we can reuse the generic option handling machinery. */
12090
12091 enum aarch64_attr_opt_type
12092 {
12093 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12094 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12095 aarch64_attr_enum, /* Attribute sets an enum variable. */
12096 aarch64_attr_custom /* Attribute requires a custom handling function. */
12097 };
12098
12099 /* All the information needed to handle a target attribute.
12100 NAME is the name of the attribute.
12101 ATTR_TYPE specifies the type of behavior of the attribute as described
12102 in the definition of enum aarch64_attr_opt_type.
12103 ALLOW_NEG is true if the attribute supports a "no-" form.
12104 HANDLER is the function that takes the attribute string as an argument
12105 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12106 OPT_NUM is the enum specifying the option that the attribute modifies.
12107 This is needed for attributes that mirror the behavior of a command-line
12108 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12109 aarch64_attr_enum. */
12110
12111 struct aarch64_attribute_info
12112 {
12113 const char *name;
12114 enum aarch64_attr_opt_type attr_type;
12115 bool allow_neg;
12116 bool (*handler) (const char *);
12117 enum opt_code opt_num;
12118 };
12119
12120 /* Handle the ARCH_STR argument to the arch= target attribute. */
12121
12122 static bool
12123 aarch64_handle_attr_arch (const char *str)
12124 {
12125 const struct processor *tmp_arch = NULL;
12126 std::string invalid_extension;
12127 enum aarch64_parse_opt_result parse_res
12128 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12129
12130 if (parse_res == AARCH64_PARSE_OK)
12131 {
12132 gcc_assert (tmp_arch);
12133 selected_arch = tmp_arch;
12134 explicit_arch = selected_arch->arch;
12135 return true;
12136 }
12137
12138 switch (parse_res)
12139 {
12140 case AARCH64_PARSE_MISSING_ARG:
12141 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12142 break;
12143 case AARCH64_PARSE_INVALID_ARG:
12144 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12145 aarch64_print_hint_for_arch (str);
12146 break;
12147 case AARCH64_PARSE_INVALID_FEATURE:
12148 error ("invalid feature modifier %s of value (\"%s\") in "
12149 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12150 aarch64_print_hint_for_extensions (invalid_extension);
12151 break;
12152 default:
12153 gcc_unreachable ();
12154 }
12155
12156 return false;
12157 }
12158
12159 /* Handle the argument CPU_STR to the cpu= target attribute. */
12160
12161 static bool
12162 aarch64_handle_attr_cpu (const char *str)
12163 {
12164 const struct processor *tmp_cpu = NULL;
12165 std::string invalid_extension;
12166 enum aarch64_parse_opt_result parse_res
12167 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12168
12169 if (parse_res == AARCH64_PARSE_OK)
12170 {
12171 gcc_assert (tmp_cpu);
12172 selected_tune = tmp_cpu;
12173 explicit_tune_core = selected_tune->ident;
12174
12175 selected_arch = &all_architectures[tmp_cpu->arch];
12176 explicit_arch = selected_arch->arch;
12177 return true;
12178 }
12179
12180 switch (parse_res)
12181 {
12182 case AARCH64_PARSE_MISSING_ARG:
12183 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12184 break;
12185 case AARCH64_PARSE_INVALID_ARG:
12186 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12187 aarch64_print_hint_for_core (str);
12188 break;
12189 case AARCH64_PARSE_INVALID_FEATURE:
12190 error ("invalid feature modifier %s of value (\"%s\") in "
12191 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12192 aarch64_print_hint_for_extensions (invalid_extension);
12193 break;
12194 default:
12195 gcc_unreachable ();
12196 }
12197
12198 return false;
12199 }
12200
12201 /* Handle the argument STR to the branch-protection= attribute. */
12202
12203 static bool
12204 aarch64_handle_attr_branch_protection (const char* str)
12205 {
12206 char *err_str = (char *) xmalloc (strlen (str));
12207 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12208 &err_str);
12209 bool success = false;
12210 switch (res)
12211 {
12212 case AARCH64_PARSE_MISSING_ARG:
12213 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12214 " attribute");
12215 break;
12216 case AARCH64_PARSE_INVALID_ARG:
12217 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12218 "=\")%> pragma or attribute", err_str);
12219 break;
12220 case AARCH64_PARSE_OK:
12221 success = true;
12222 /* Fall through. */
12223 case AARCH64_PARSE_INVALID_FEATURE:
12224 break;
12225 default:
12226 gcc_unreachable ();
12227 }
12228 free (err_str);
12229 return success;
12230 }
12231
12232 /* Handle the argument STR to the tune= target attribute. */
12233
12234 static bool
12235 aarch64_handle_attr_tune (const char *str)
12236 {
12237 const struct processor *tmp_tune = NULL;
12238 enum aarch64_parse_opt_result parse_res
12239 = aarch64_parse_tune (str, &tmp_tune);
12240
12241 if (parse_res == AARCH64_PARSE_OK)
12242 {
12243 gcc_assert (tmp_tune);
12244 selected_tune = tmp_tune;
12245 explicit_tune_core = selected_tune->ident;
12246 return true;
12247 }
12248
12249 switch (parse_res)
12250 {
12251 case AARCH64_PARSE_INVALID_ARG:
12252 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12253 aarch64_print_hint_for_core (str);
12254 break;
12255 default:
12256 gcc_unreachable ();
12257 }
12258
12259 return false;
12260 }
12261
12262 /* Parse an architecture extensions target attribute string specified in STR.
12263 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12264 if successful. Update aarch64_isa_flags to reflect the ISA features
12265 modified. */
12266
12267 static bool
12268 aarch64_handle_attr_isa_flags (char *str)
12269 {
12270 enum aarch64_parse_opt_result parse_res;
12271 unsigned long isa_flags = aarch64_isa_flags;
12272
12273 /* We allow "+nothing" in the beginning to clear out all architectural
12274 features if the user wants to handpick specific features. */
12275 if (strncmp ("+nothing", str, 8) == 0)
12276 {
12277 isa_flags = 0;
12278 str += 8;
12279 }
12280
12281 std::string invalid_extension;
12282 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12283
12284 if (parse_res == AARCH64_PARSE_OK)
12285 {
12286 aarch64_isa_flags = isa_flags;
12287 return true;
12288 }
12289
12290 switch (parse_res)
12291 {
12292 case AARCH64_PARSE_MISSING_ARG:
12293 error ("missing value in %<target()%> pragma or attribute");
12294 break;
12295
12296 case AARCH64_PARSE_INVALID_FEATURE:
12297 error ("invalid feature modifier %s of value (\"%s\") in "
12298 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12299 break;
12300
12301 default:
12302 gcc_unreachable ();
12303 }
12304
12305 return false;
12306 }
12307
12308 /* The target attributes that we support. On top of these we also support just
12309 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12310 handled explicitly in aarch64_process_one_target_attr. */
12311
12312 static const struct aarch64_attribute_info aarch64_attributes[] =
12313 {
12314 { "general-regs-only", aarch64_attr_mask, false, NULL,
12315 OPT_mgeneral_regs_only },
12316 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12317 OPT_mfix_cortex_a53_835769 },
12318 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12319 OPT_mfix_cortex_a53_843419 },
12320 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12321 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12322 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12323 OPT_momit_leaf_frame_pointer },
12324 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12325 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12326 OPT_march_ },
12327 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12328 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12329 OPT_mtune_ },
12330 { "branch-protection", aarch64_attr_custom, false,
12331 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12332 { "sign-return-address", aarch64_attr_enum, false, NULL,
12333 OPT_msign_return_address_ },
12334 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12335 };
12336
12337 /* Parse ARG_STR which contains the definition of one target attribute.
12338 Show appropriate errors if any or return true if the attribute is valid. */
12339
12340 static bool
12341 aarch64_process_one_target_attr (char *arg_str)
12342 {
12343 bool invert = false;
12344
12345 size_t len = strlen (arg_str);
12346
12347 if (len == 0)
12348 {
12349 error ("malformed %<target()%> pragma or attribute");
12350 return false;
12351 }
12352
12353 char *str_to_check = (char *) alloca (len + 1);
12354 strcpy (str_to_check, arg_str);
12355
12356 /* Skip leading whitespace. */
12357 while (*str_to_check == ' ' || *str_to_check == '\t')
12358 str_to_check++;
12359
12360 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12361 It is easier to detect and handle it explicitly here rather than going
12362 through the machinery for the rest of the target attributes in this
12363 function. */
12364 if (*str_to_check == '+')
12365 return aarch64_handle_attr_isa_flags (str_to_check);
12366
12367 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12368 {
12369 invert = true;
12370 str_to_check += 3;
12371 }
12372 char *arg = strchr (str_to_check, '=');
12373
12374 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12375 and point ARG to "foo". */
12376 if (arg)
12377 {
12378 *arg = '\0';
12379 arg++;
12380 }
12381 const struct aarch64_attribute_info *p_attr;
12382 bool found = false;
12383 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12384 {
12385 /* If the names don't match up, or the user has given an argument
12386 to an attribute that doesn't accept one, or didn't give an argument
12387 to an attribute that expects one, fail to match. */
12388 if (strcmp (str_to_check, p_attr->name) != 0)
12389 continue;
12390
12391 found = true;
12392 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12393 || p_attr->attr_type == aarch64_attr_enum;
12394
12395 if (attr_need_arg_p ^ (arg != NULL))
12396 {
12397 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12398 return false;
12399 }
12400
12401 /* If the name matches but the attribute does not allow "no-" versions
12402 then we can't match. */
12403 if (invert && !p_attr->allow_neg)
12404 {
12405 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12406 return false;
12407 }
12408
12409 switch (p_attr->attr_type)
12410 {
12411 /* Has a custom handler registered.
12412 For example, cpu=, arch=, tune=. */
12413 case aarch64_attr_custom:
12414 gcc_assert (p_attr->handler);
12415 if (!p_attr->handler (arg))
12416 return false;
12417 break;
12418
12419 /* Either set or unset a boolean option. */
12420 case aarch64_attr_bool:
12421 {
12422 struct cl_decoded_option decoded;
12423
12424 generate_option (p_attr->opt_num, NULL, !invert,
12425 CL_TARGET, &decoded);
12426 aarch64_handle_option (&global_options, &global_options_set,
12427 &decoded, input_location);
12428 break;
12429 }
12430 /* Set or unset a bit in the target_flags. aarch64_handle_option
12431 should know what mask to apply given the option number. */
12432 case aarch64_attr_mask:
12433 {
12434 struct cl_decoded_option decoded;
12435 /* We only need to specify the option number.
12436 aarch64_handle_option will know which mask to apply. */
12437 decoded.opt_index = p_attr->opt_num;
12438 decoded.value = !invert;
12439 aarch64_handle_option (&global_options, &global_options_set,
12440 &decoded, input_location);
12441 break;
12442 }
12443 /* Use the option setting machinery to set an option to an enum. */
12444 case aarch64_attr_enum:
12445 {
12446 gcc_assert (arg);
12447 bool valid;
12448 int value;
12449 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12450 &value, CL_TARGET);
12451 if (valid)
12452 {
12453 set_option (&global_options, NULL, p_attr->opt_num, value,
12454 NULL, DK_UNSPECIFIED, input_location,
12455 global_dc);
12456 }
12457 else
12458 {
12459 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12460 }
12461 break;
12462 }
12463 default:
12464 gcc_unreachable ();
12465 }
12466 }
12467
12468 /* If we reached here we either have found an attribute and validated
12469 it or didn't match any. If we matched an attribute but its arguments
12470 were malformed we will have returned false already. */
12471 return found;
12472 }
12473
12474 /* Count how many times the character C appears in
12475 NULL-terminated string STR. */
12476
12477 static unsigned int
12478 num_occurences_in_str (char c, char *str)
12479 {
12480 unsigned int res = 0;
12481 while (*str != '\0')
12482 {
12483 if (*str == c)
12484 res++;
12485
12486 str++;
12487 }
12488
12489 return res;
12490 }
12491
12492 /* Parse the tree in ARGS that contains the target attribute information
12493 and update the global target options space. */
12494
12495 bool
12496 aarch64_process_target_attr (tree args)
12497 {
12498 if (TREE_CODE (args) == TREE_LIST)
12499 {
12500 do
12501 {
12502 tree head = TREE_VALUE (args);
12503 if (head)
12504 {
12505 if (!aarch64_process_target_attr (head))
12506 return false;
12507 }
12508 args = TREE_CHAIN (args);
12509 } while (args);
12510
12511 return true;
12512 }
12513
12514 if (TREE_CODE (args) != STRING_CST)
12515 {
12516 error ("attribute %<target%> argument not a string");
12517 return false;
12518 }
12519
12520 size_t len = strlen (TREE_STRING_POINTER (args));
12521 char *str_to_check = (char *) alloca (len + 1);
12522 strcpy (str_to_check, TREE_STRING_POINTER (args));
12523
12524 if (len == 0)
12525 {
12526 error ("malformed %<target()%> pragma or attribute");
12527 return false;
12528 }
12529
12530 /* Used to catch empty spaces between commas i.e.
12531 attribute ((target ("attr1,,attr2"))). */
12532 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12533
12534 /* Handle multiple target attributes separated by ','. */
12535 char *token = strtok_r (str_to_check, ",", &str_to_check);
12536
12537 unsigned int num_attrs = 0;
12538 while (token)
12539 {
12540 num_attrs++;
12541 if (!aarch64_process_one_target_attr (token))
12542 {
12543 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12544 return false;
12545 }
12546
12547 token = strtok_r (NULL, ",", &str_to_check);
12548 }
12549
12550 if (num_attrs != num_commas + 1)
12551 {
12552 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12553 return false;
12554 }
12555
12556 return true;
12557 }
12558
12559 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12560 process attribute ((target ("..."))). */
12561
12562 static bool
12563 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12564 {
12565 struct cl_target_option cur_target;
12566 bool ret;
12567 tree old_optimize;
12568 tree new_target, new_optimize;
12569 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12570
12571 /* If what we're processing is the current pragma string then the
12572 target option node is already stored in target_option_current_node
12573 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12574 having to re-parse the string. This is especially useful to keep
12575 arm_neon.h compile times down since that header contains a lot
12576 of intrinsics enclosed in pragmas. */
12577 if (!existing_target && args == current_target_pragma)
12578 {
12579 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12580 return true;
12581 }
12582 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12583
12584 old_optimize = build_optimization_node (&global_options);
12585 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12586
12587 /* If the function changed the optimization levels as well as setting
12588 target options, start with the optimizations specified. */
12589 if (func_optimize && func_optimize != old_optimize)
12590 cl_optimization_restore (&global_options,
12591 TREE_OPTIMIZATION (func_optimize));
12592
12593 /* Save the current target options to restore at the end. */
12594 cl_target_option_save (&cur_target, &global_options);
12595
12596 /* If fndecl already has some target attributes applied to it, unpack
12597 them so that we add this attribute on top of them, rather than
12598 overwriting them. */
12599 if (existing_target)
12600 {
12601 struct cl_target_option *existing_options
12602 = TREE_TARGET_OPTION (existing_target);
12603
12604 if (existing_options)
12605 cl_target_option_restore (&global_options, existing_options);
12606 }
12607 else
12608 cl_target_option_restore (&global_options,
12609 TREE_TARGET_OPTION (target_option_current_node));
12610
12611 ret = aarch64_process_target_attr (args);
12612
12613 /* Set up any additional state. */
12614 if (ret)
12615 {
12616 aarch64_override_options_internal (&global_options);
12617 /* Initialize SIMD builtins if we haven't already.
12618 Set current_target_pragma to NULL for the duration so that
12619 the builtin initialization code doesn't try to tag the functions
12620 being built with the attributes specified by any current pragma, thus
12621 going into an infinite recursion. */
12622 if (TARGET_SIMD)
12623 {
12624 tree saved_current_target_pragma = current_target_pragma;
12625 current_target_pragma = NULL;
12626 aarch64_init_simd_builtins ();
12627 current_target_pragma = saved_current_target_pragma;
12628 }
12629 new_target = build_target_option_node (&global_options);
12630 }
12631 else
12632 new_target = NULL;
12633
12634 new_optimize = build_optimization_node (&global_options);
12635
12636 if (fndecl && ret)
12637 {
12638 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12639
12640 if (old_optimize != new_optimize)
12641 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12642 }
12643
12644 cl_target_option_restore (&global_options, &cur_target);
12645
12646 if (old_optimize != new_optimize)
12647 cl_optimization_restore (&global_options,
12648 TREE_OPTIMIZATION (old_optimize));
12649 return ret;
12650 }
12651
12652 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
12653 tri-bool options (yes, no, don't care) and the default value is
12654 DEF, determine whether to reject inlining. */
12655
12656 static bool
12657 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12658 int dont_care, int def)
12659 {
12660 /* If the callee doesn't care, always allow inlining. */
12661 if (callee == dont_care)
12662 return true;
12663
12664 /* If the caller doesn't care, always allow inlining. */
12665 if (caller == dont_care)
12666 return true;
12667
12668 /* Otherwise, allow inlining if either the callee and caller values
12669 agree, or if the callee is using the default value. */
12670 return (callee == caller || callee == def);
12671 }
12672
12673 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12674 to inline CALLEE into CALLER based on target-specific info.
12675 Make sure that the caller and callee have compatible architectural
12676 features. Then go through the other possible target attributes
12677 and see if they can block inlining. Try not to reject always_inline
12678 callees unless they are incompatible architecturally. */
12679
12680 static bool
12681 aarch64_can_inline_p (tree caller, tree callee)
12682 {
12683 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12684 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12685
12686 struct cl_target_option *caller_opts
12687 = TREE_TARGET_OPTION (caller_tree ? caller_tree
12688 : target_option_default_node);
12689
12690 struct cl_target_option *callee_opts
12691 = TREE_TARGET_OPTION (callee_tree ? callee_tree
12692 : target_option_default_node);
12693
12694 /* Callee's ISA flags should be a subset of the caller's. */
12695 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12696 != callee_opts->x_aarch64_isa_flags)
12697 return false;
12698
12699 /* Allow non-strict aligned functions inlining into strict
12700 aligned ones. */
12701 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12702 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12703 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12704 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12705 return false;
12706
12707 bool always_inline = lookup_attribute ("always_inline",
12708 DECL_ATTRIBUTES (callee));
12709
12710 /* If the architectural features match up and the callee is always_inline
12711 then the other attributes don't matter. */
12712 if (always_inline)
12713 return true;
12714
12715 if (caller_opts->x_aarch64_cmodel_var
12716 != callee_opts->x_aarch64_cmodel_var)
12717 return false;
12718
12719 if (caller_opts->x_aarch64_tls_dialect
12720 != callee_opts->x_aarch64_tls_dialect)
12721 return false;
12722
12723 /* Honour explicit requests to workaround errata. */
12724 if (!aarch64_tribools_ok_for_inlining_p (
12725 caller_opts->x_aarch64_fix_a53_err835769,
12726 callee_opts->x_aarch64_fix_a53_err835769,
12727 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12728 return false;
12729
12730 if (!aarch64_tribools_ok_for_inlining_p (
12731 caller_opts->x_aarch64_fix_a53_err843419,
12732 callee_opts->x_aarch64_fix_a53_err843419,
12733 2, TARGET_FIX_ERR_A53_843419))
12734 return false;
12735
12736 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12737 caller and calle and they don't match up, reject inlining. */
12738 if (!aarch64_tribools_ok_for_inlining_p (
12739 caller_opts->x_flag_omit_leaf_frame_pointer,
12740 callee_opts->x_flag_omit_leaf_frame_pointer,
12741 2, 1))
12742 return false;
12743
12744 /* If the callee has specific tuning overrides, respect them. */
12745 if (callee_opts->x_aarch64_override_tune_string != NULL
12746 && caller_opts->x_aarch64_override_tune_string == NULL)
12747 return false;
12748
12749 /* If the user specified tuning override strings for the
12750 caller and callee and they don't match up, reject inlining.
12751 We just do a string compare here, we don't analyze the meaning
12752 of the string, as it would be too costly for little gain. */
12753 if (callee_opts->x_aarch64_override_tune_string
12754 && caller_opts->x_aarch64_override_tune_string
12755 && (strcmp (callee_opts->x_aarch64_override_tune_string,
12756 caller_opts->x_aarch64_override_tune_string) != 0))
12757 return false;
12758
12759 return true;
12760 }
12761
12762 /* Return true if SYMBOL_REF X binds locally. */
12763
12764 static bool
12765 aarch64_symbol_binds_local_p (const_rtx x)
12766 {
12767 return (SYMBOL_REF_DECL (x)
12768 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
12769 : SYMBOL_REF_LOCAL_P (x));
12770 }
12771
12772 /* Return true if SYMBOL_REF X is thread local */
12773 static bool
12774 aarch64_tls_symbol_p (rtx x)
12775 {
12776 if (! TARGET_HAVE_TLS)
12777 return false;
12778
12779 if (GET_CODE (x) != SYMBOL_REF)
12780 return false;
12781
12782 return SYMBOL_REF_TLS_MODEL (x) != 0;
12783 }
12784
12785 /* Classify a TLS symbol into one of the TLS kinds. */
12786 enum aarch64_symbol_type
12787 aarch64_classify_tls_symbol (rtx x)
12788 {
12789 enum tls_model tls_kind = tls_symbolic_operand_type (x);
12790
12791 switch (tls_kind)
12792 {
12793 case TLS_MODEL_GLOBAL_DYNAMIC:
12794 case TLS_MODEL_LOCAL_DYNAMIC:
12795 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
12796
12797 case TLS_MODEL_INITIAL_EXEC:
12798 switch (aarch64_cmodel)
12799 {
12800 case AARCH64_CMODEL_TINY:
12801 case AARCH64_CMODEL_TINY_PIC:
12802 return SYMBOL_TINY_TLSIE;
12803 default:
12804 return SYMBOL_SMALL_TLSIE;
12805 }
12806
12807 case TLS_MODEL_LOCAL_EXEC:
12808 if (aarch64_tls_size == 12)
12809 return SYMBOL_TLSLE12;
12810 else if (aarch64_tls_size == 24)
12811 return SYMBOL_TLSLE24;
12812 else if (aarch64_tls_size == 32)
12813 return SYMBOL_TLSLE32;
12814 else if (aarch64_tls_size == 48)
12815 return SYMBOL_TLSLE48;
12816 else
12817 gcc_unreachable ();
12818
12819 case TLS_MODEL_EMULATED:
12820 case TLS_MODEL_NONE:
12821 return SYMBOL_FORCE_TO_MEM;
12822
12823 default:
12824 gcc_unreachable ();
12825 }
12826 }
12827
12828 /* Return the correct method for accessing X + OFFSET, where X is either
12829 a SYMBOL_REF or LABEL_REF. */
12830
12831 enum aarch64_symbol_type
12832 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
12833 {
12834 if (GET_CODE (x) == LABEL_REF)
12835 {
12836 switch (aarch64_cmodel)
12837 {
12838 case AARCH64_CMODEL_LARGE:
12839 return SYMBOL_FORCE_TO_MEM;
12840
12841 case AARCH64_CMODEL_TINY_PIC:
12842 case AARCH64_CMODEL_TINY:
12843 return SYMBOL_TINY_ABSOLUTE;
12844
12845 case AARCH64_CMODEL_SMALL_SPIC:
12846 case AARCH64_CMODEL_SMALL_PIC:
12847 case AARCH64_CMODEL_SMALL:
12848 return SYMBOL_SMALL_ABSOLUTE;
12849
12850 default:
12851 gcc_unreachable ();
12852 }
12853 }
12854
12855 if (GET_CODE (x) == SYMBOL_REF)
12856 {
12857 if (aarch64_tls_symbol_p (x))
12858 return aarch64_classify_tls_symbol (x);
12859
12860 switch (aarch64_cmodel)
12861 {
12862 case AARCH64_CMODEL_TINY:
12863 /* When we retrieve symbol + offset address, we have to make sure
12864 the offset does not cause overflow of the final address. But
12865 we have no way of knowing the address of symbol at compile time
12866 so we can't accurately say if the distance between the PC and
12867 symbol + offset is outside the addressible range of +/-1M in the
12868 TINY code model. So we rely on images not being greater than
12869 1M and cap the offset at 1M and anything beyond 1M will have to
12870 be loaded using an alternative mechanism. Furthermore if the
12871 symbol is a weak reference to something that isn't known to
12872 resolve to a symbol in this module, then force to memory. */
12873 if ((SYMBOL_REF_WEAK (x)
12874 && !aarch64_symbol_binds_local_p (x))
12875 || !IN_RANGE (offset, -1048575, 1048575))
12876 return SYMBOL_FORCE_TO_MEM;
12877 return SYMBOL_TINY_ABSOLUTE;
12878
12879 case AARCH64_CMODEL_SMALL:
12880 /* Same reasoning as the tiny code model, but the offset cap here is
12881 4G. */
12882 if ((SYMBOL_REF_WEAK (x)
12883 && !aarch64_symbol_binds_local_p (x))
12884 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
12885 HOST_WIDE_INT_C (4294967264)))
12886 return SYMBOL_FORCE_TO_MEM;
12887 return SYMBOL_SMALL_ABSOLUTE;
12888
12889 case AARCH64_CMODEL_TINY_PIC:
12890 if (!aarch64_symbol_binds_local_p (x))
12891 return SYMBOL_TINY_GOT;
12892 return SYMBOL_TINY_ABSOLUTE;
12893
12894 case AARCH64_CMODEL_SMALL_SPIC:
12895 case AARCH64_CMODEL_SMALL_PIC:
12896 if (!aarch64_symbol_binds_local_p (x))
12897 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
12898 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
12899 return SYMBOL_SMALL_ABSOLUTE;
12900
12901 case AARCH64_CMODEL_LARGE:
12902 /* This is alright even in PIC code as the constant
12903 pool reference is always PC relative and within
12904 the same translation unit. */
12905 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
12906 return SYMBOL_SMALL_ABSOLUTE;
12907 else
12908 return SYMBOL_FORCE_TO_MEM;
12909
12910 default:
12911 gcc_unreachable ();
12912 }
12913 }
12914
12915 /* By default push everything into the constant pool. */
12916 return SYMBOL_FORCE_TO_MEM;
12917 }
12918
12919 bool
12920 aarch64_constant_address_p (rtx x)
12921 {
12922 return (CONSTANT_P (x) && memory_address_p (DImode, x));
12923 }
12924
12925 bool
12926 aarch64_legitimate_pic_operand_p (rtx x)
12927 {
12928 if (GET_CODE (x) == SYMBOL_REF
12929 || (GET_CODE (x) == CONST
12930 && GET_CODE (XEXP (x, 0)) == PLUS
12931 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
12932 return false;
12933
12934 return true;
12935 }
12936
12937 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
12938 that should be rematerialized rather than spilled. */
12939
12940 static bool
12941 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
12942 {
12943 /* Support CSE and rematerialization of common constants. */
12944 if (CONST_INT_P (x)
12945 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
12946 || GET_CODE (x) == CONST_VECTOR)
12947 return true;
12948
12949 /* Do not allow vector struct mode constants for Advanced SIMD.
12950 We could support 0 and -1 easily, but they need support in
12951 aarch64-simd.md. */
12952 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12953 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12954 return false;
12955
12956 /* Only accept variable-length vector constants if they can be
12957 handled directly.
12958
12959 ??? It would be possible to handle rematerialization of other
12960 constants via secondary reloads. */
12961 if (vec_flags & VEC_ANY_SVE)
12962 return aarch64_simd_valid_immediate (x, NULL);
12963
12964 if (GET_CODE (x) == HIGH)
12965 x = XEXP (x, 0);
12966
12967 /* Accept polynomial constants that can be calculated by using the
12968 destination of a move as the sole temporary. Constants that
12969 require a second temporary cannot be rematerialized (they can't be
12970 forced to memory and also aren't legitimate constants). */
12971 poly_int64 offset;
12972 if (poly_int_rtx_p (x, &offset))
12973 return aarch64_offset_temporaries (false, offset) <= 1;
12974
12975 /* If an offset is being added to something else, we need to allow the
12976 base to be moved into the destination register, meaning that there
12977 are no free temporaries for the offset. */
12978 x = strip_offset (x, &offset);
12979 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12980 return false;
12981
12982 /* Do not allow const (plus (anchor_symbol, const_int)). */
12983 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12984 return false;
12985
12986 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12987 so spilling them is better than rematerialization. */
12988 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12989 return true;
12990
12991 /* Label references are always constant. */
12992 if (GET_CODE (x) == LABEL_REF)
12993 return true;
12994
12995 return false;
12996 }
12997
12998 rtx
12999 aarch64_load_tp (rtx target)
13000 {
13001 if (!target
13002 || GET_MODE (target) != Pmode
13003 || !register_operand (target, Pmode))
13004 target = gen_reg_rtx (Pmode);
13005
13006 /* Can return in any reg. */
13007 emit_insn (gen_aarch64_load_tp_hard (target));
13008 return target;
13009 }
13010
13011 /* On AAPCS systems, this is the "struct __va_list". */
13012 static GTY(()) tree va_list_type;
13013
13014 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13015 Return the type to use as __builtin_va_list.
13016
13017 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13018
13019 struct __va_list
13020 {
13021 void *__stack;
13022 void *__gr_top;
13023 void *__vr_top;
13024 int __gr_offs;
13025 int __vr_offs;
13026 }; */
13027
13028 static tree
13029 aarch64_build_builtin_va_list (void)
13030 {
13031 tree va_list_name;
13032 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13033
13034 /* Create the type. */
13035 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13036 /* Give it the required name. */
13037 va_list_name = build_decl (BUILTINS_LOCATION,
13038 TYPE_DECL,
13039 get_identifier ("__va_list"),
13040 va_list_type);
13041 DECL_ARTIFICIAL (va_list_name) = 1;
13042 TYPE_NAME (va_list_type) = va_list_name;
13043 TYPE_STUB_DECL (va_list_type) = va_list_name;
13044
13045 /* Create the fields. */
13046 f_stack = build_decl (BUILTINS_LOCATION,
13047 FIELD_DECL, get_identifier ("__stack"),
13048 ptr_type_node);
13049 f_grtop = build_decl (BUILTINS_LOCATION,
13050 FIELD_DECL, get_identifier ("__gr_top"),
13051 ptr_type_node);
13052 f_vrtop = build_decl (BUILTINS_LOCATION,
13053 FIELD_DECL, get_identifier ("__vr_top"),
13054 ptr_type_node);
13055 f_groff = build_decl (BUILTINS_LOCATION,
13056 FIELD_DECL, get_identifier ("__gr_offs"),
13057 integer_type_node);
13058 f_vroff = build_decl (BUILTINS_LOCATION,
13059 FIELD_DECL, get_identifier ("__vr_offs"),
13060 integer_type_node);
13061
13062 /* Tell tree-stdarg pass about our internal offset fields.
13063 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
13064 purpose to identify whether the code is updating va_list internal
13065 offset fields through irregular way. */
13066 va_list_gpr_counter_field = f_groff;
13067 va_list_fpr_counter_field = f_vroff;
13068
13069 DECL_ARTIFICIAL (f_stack) = 1;
13070 DECL_ARTIFICIAL (f_grtop) = 1;
13071 DECL_ARTIFICIAL (f_vrtop) = 1;
13072 DECL_ARTIFICIAL (f_groff) = 1;
13073 DECL_ARTIFICIAL (f_vroff) = 1;
13074
13075 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13076 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13077 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13078 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13079 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13080
13081 TYPE_FIELDS (va_list_type) = f_stack;
13082 DECL_CHAIN (f_stack) = f_grtop;
13083 DECL_CHAIN (f_grtop) = f_vrtop;
13084 DECL_CHAIN (f_vrtop) = f_groff;
13085 DECL_CHAIN (f_groff) = f_vroff;
13086
13087 /* Compute its layout. */
13088 layout_type (va_list_type);
13089
13090 return va_list_type;
13091 }
13092
13093 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13094 static void
13095 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13096 {
13097 const CUMULATIVE_ARGS *cum;
13098 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13099 tree stack, grtop, vrtop, groff, vroff;
13100 tree t;
13101 int gr_save_area_size = cfun->va_list_gpr_size;
13102 int vr_save_area_size = cfun->va_list_fpr_size;
13103 int vr_offset;
13104
13105 cum = &crtl->args.info;
13106 if (cfun->va_list_gpr_size)
13107 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13108 cfun->va_list_gpr_size);
13109 if (cfun->va_list_fpr_size)
13110 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13111 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13112
13113 if (!TARGET_FLOAT)
13114 {
13115 gcc_assert (cum->aapcs_nvrn == 0);
13116 vr_save_area_size = 0;
13117 }
13118
13119 f_stack = TYPE_FIELDS (va_list_type_node);
13120 f_grtop = DECL_CHAIN (f_stack);
13121 f_vrtop = DECL_CHAIN (f_grtop);
13122 f_groff = DECL_CHAIN (f_vrtop);
13123 f_vroff = DECL_CHAIN (f_groff);
13124
13125 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13126 NULL_TREE);
13127 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13128 NULL_TREE);
13129 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13130 NULL_TREE);
13131 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13132 NULL_TREE);
13133 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13134 NULL_TREE);
13135
13136 /* Emit code to initialize STACK, which points to the next varargs stack
13137 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13138 by named arguments. STACK is 8-byte aligned. */
13139 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13140 if (cum->aapcs_stack_size > 0)
13141 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13142 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13143 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13144
13145 /* Emit code to initialize GRTOP, the top of the GR save area.
13146 virtual_incoming_args_rtx should have been 16 byte aligned. */
13147 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13148 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13149 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13150
13151 /* Emit code to initialize VRTOP, the top of the VR save area.
13152 This address is gr_save_area_bytes below GRTOP, rounded
13153 down to the next 16-byte boundary. */
13154 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13155 vr_offset = ROUND_UP (gr_save_area_size,
13156 STACK_BOUNDARY / BITS_PER_UNIT);
13157
13158 if (vr_offset)
13159 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13160 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13161 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13162
13163 /* Emit code to initialize GROFF, the offset from GRTOP of the
13164 next GPR argument. */
13165 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13166 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13167 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13168
13169 /* Likewise emit code to initialize VROFF, the offset from FTOP
13170 of the next VR argument. */
13171 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13172 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13173 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13174 }
13175
13176 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13177
13178 static tree
13179 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13180 gimple_seq *post_p ATTRIBUTE_UNUSED)
13181 {
13182 tree addr;
13183 bool indirect_p;
13184 bool is_ha; /* is HFA or HVA. */
13185 bool dw_align; /* double-word align. */
13186 machine_mode ag_mode = VOIDmode;
13187 int nregs;
13188 machine_mode mode;
13189
13190 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13191 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13192 HOST_WIDE_INT size, rsize, adjust, align;
13193 tree t, u, cond1, cond2;
13194
13195 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13196 if (indirect_p)
13197 type = build_pointer_type (type);
13198
13199 mode = TYPE_MODE (type);
13200
13201 f_stack = TYPE_FIELDS (va_list_type_node);
13202 f_grtop = DECL_CHAIN (f_stack);
13203 f_vrtop = DECL_CHAIN (f_grtop);
13204 f_groff = DECL_CHAIN (f_vrtop);
13205 f_vroff = DECL_CHAIN (f_groff);
13206
13207 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13208 f_stack, NULL_TREE);
13209 size = int_size_in_bytes (type);
13210 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
13211
13212 dw_align = false;
13213 adjust = 0;
13214 if (aarch64_vfp_is_call_or_return_candidate (mode,
13215 type,
13216 &ag_mode,
13217 &nregs,
13218 &is_ha))
13219 {
13220 /* No frontends can create types with variable-sized modes, so we
13221 shouldn't be asked to pass or return them. */
13222 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13223
13224 /* TYPE passed in fp/simd registers. */
13225 if (!TARGET_FLOAT)
13226 aarch64_err_no_fpadvsimd (mode);
13227
13228 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13229 unshare_expr (valist), f_vrtop, NULL_TREE);
13230 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13231 unshare_expr (valist), f_vroff, NULL_TREE);
13232
13233 rsize = nregs * UNITS_PER_VREG;
13234
13235 if (is_ha)
13236 {
13237 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13238 adjust = UNITS_PER_VREG - ag_size;
13239 }
13240 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13241 && size < UNITS_PER_VREG)
13242 {
13243 adjust = UNITS_PER_VREG - size;
13244 }
13245 }
13246 else
13247 {
13248 /* TYPE passed in general registers. */
13249 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13250 unshare_expr (valist), f_grtop, NULL_TREE);
13251 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13252 unshare_expr (valist), f_groff, NULL_TREE);
13253 rsize = ROUND_UP (size, UNITS_PER_WORD);
13254 nregs = rsize / UNITS_PER_WORD;
13255
13256 if (align > 8)
13257 dw_align = true;
13258
13259 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13260 && size < UNITS_PER_WORD)
13261 {
13262 adjust = UNITS_PER_WORD - size;
13263 }
13264 }
13265
13266 /* Get a local temporary for the field value. */
13267 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13268
13269 /* Emit code to branch if off >= 0. */
13270 t = build2 (GE_EXPR, boolean_type_node, off,
13271 build_int_cst (TREE_TYPE (off), 0));
13272 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13273
13274 if (dw_align)
13275 {
13276 /* Emit: offs = (offs + 15) & -16. */
13277 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13278 build_int_cst (TREE_TYPE (off), 15));
13279 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13280 build_int_cst (TREE_TYPE (off), -16));
13281 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13282 }
13283 else
13284 roundup = NULL;
13285
13286 /* Update ap.__[g|v]r_offs */
13287 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13288 build_int_cst (TREE_TYPE (off), rsize));
13289 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13290
13291 /* String up. */
13292 if (roundup)
13293 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13294
13295 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13296 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13297 build_int_cst (TREE_TYPE (f_off), 0));
13298 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13299
13300 /* String up: make sure the assignment happens before the use. */
13301 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13302 COND_EXPR_ELSE (cond1) = t;
13303
13304 /* Prepare the trees handling the argument that is passed on the stack;
13305 the top level node will store in ON_STACK. */
13306 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13307 if (align > 8)
13308 {
13309 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13310 t = fold_build_pointer_plus_hwi (arg, 15);
13311 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13312 build_int_cst (TREE_TYPE (t), -16));
13313 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13314 }
13315 else
13316 roundup = NULL;
13317 /* Advance ap.__stack */
13318 t = fold_build_pointer_plus_hwi (arg, size + 7);
13319 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13320 build_int_cst (TREE_TYPE (t), -8));
13321 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13322 /* String up roundup and advance. */
13323 if (roundup)
13324 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13325 /* String up with arg */
13326 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13327 /* Big-endianness related address adjustment. */
13328 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13329 && size < UNITS_PER_WORD)
13330 {
13331 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13332 size_int (UNITS_PER_WORD - size));
13333 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13334 }
13335
13336 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13337 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13338
13339 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13340 t = off;
13341 if (adjust)
13342 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13343 build_int_cst (TREE_TYPE (off), adjust));
13344
13345 t = fold_convert (sizetype, t);
13346 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13347
13348 if (is_ha)
13349 {
13350 /* type ha; // treat as "struct {ftype field[n];}"
13351 ... [computing offs]
13352 for (i = 0; i <nregs; ++i, offs += 16)
13353 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13354 return ha; */
13355 int i;
13356 tree tmp_ha, field_t, field_ptr_t;
13357
13358 /* Declare a local variable. */
13359 tmp_ha = create_tmp_var_raw (type, "ha");
13360 gimple_add_tmp_var (tmp_ha);
13361
13362 /* Establish the base type. */
13363 switch (ag_mode)
13364 {
13365 case E_SFmode:
13366 field_t = float_type_node;
13367 field_ptr_t = float_ptr_type_node;
13368 break;
13369 case E_DFmode:
13370 field_t = double_type_node;
13371 field_ptr_t = double_ptr_type_node;
13372 break;
13373 case E_TFmode:
13374 field_t = long_double_type_node;
13375 field_ptr_t = long_double_ptr_type_node;
13376 break;
13377 case E_HFmode:
13378 field_t = aarch64_fp16_type_node;
13379 field_ptr_t = aarch64_fp16_ptr_type_node;
13380 break;
13381 case E_V2SImode:
13382 case E_V4SImode:
13383 {
13384 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13385 field_t = build_vector_type_for_mode (innertype, ag_mode);
13386 field_ptr_t = build_pointer_type (field_t);
13387 }
13388 break;
13389 default:
13390 gcc_assert (0);
13391 }
13392
13393 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
13394 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13395 addr = t;
13396 t = fold_convert (field_ptr_t, addr);
13397 t = build2 (MODIFY_EXPR, field_t,
13398 build1 (INDIRECT_REF, field_t, tmp_ha),
13399 build1 (INDIRECT_REF, field_t, t));
13400
13401 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13402 for (i = 1; i < nregs; ++i)
13403 {
13404 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13405 u = fold_convert (field_ptr_t, addr);
13406 u = build2 (MODIFY_EXPR, field_t,
13407 build2 (MEM_REF, field_t, tmp_ha,
13408 build_int_cst (field_ptr_t,
13409 (i *
13410 int_size_in_bytes (field_t)))),
13411 build1 (INDIRECT_REF, field_t, u));
13412 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13413 }
13414
13415 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13416 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13417 }
13418
13419 COND_EXPR_ELSE (cond2) = t;
13420 addr = fold_convert (build_pointer_type (type), cond1);
13421 addr = build_va_arg_indirect_ref (addr);
13422
13423 if (indirect_p)
13424 addr = build_va_arg_indirect_ref (addr);
13425
13426 return addr;
13427 }
13428
13429 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13430
13431 static void
13432 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13433 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13434 int no_rtl)
13435 {
13436 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13437 CUMULATIVE_ARGS local_cum;
13438 int gr_saved = cfun->va_list_gpr_size;
13439 int vr_saved = cfun->va_list_fpr_size;
13440
13441 /* The caller has advanced CUM up to, but not beyond, the last named
13442 argument. Advance a local copy of CUM past the last "real" named
13443 argument, to find out how many registers are left over. */
13444 local_cum = *cum;
13445 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
13446
13447 /* Found out how many registers we need to save.
13448 Honor tree-stdvar analysis results. */
13449 if (cfun->va_list_gpr_size)
13450 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13451 cfun->va_list_gpr_size / UNITS_PER_WORD);
13452 if (cfun->va_list_fpr_size)
13453 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13454 cfun->va_list_fpr_size / UNITS_PER_VREG);
13455
13456 if (!TARGET_FLOAT)
13457 {
13458 gcc_assert (local_cum.aapcs_nvrn == 0);
13459 vr_saved = 0;
13460 }
13461
13462 if (!no_rtl)
13463 {
13464 if (gr_saved > 0)
13465 {
13466 rtx ptr, mem;
13467
13468 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13469 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13470 - gr_saved * UNITS_PER_WORD);
13471 mem = gen_frame_mem (BLKmode, ptr);
13472 set_mem_alias_set (mem, get_varargs_alias_set ());
13473
13474 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13475 mem, gr_saved);
13476 }
13477 if (vr_saved > 0)
13478 {
13479 /* We can't use move_block_from_reg, because it will use
13480 the wrong mode, storing D regs only. */
13481 machine_mode mode = TImode;
13482 int off, i, vr_start;
13483
13484 /* Set OFF to the offset from virtual_incoming_args_rtx of
13485 the first vector register. The VR save area lies below
13486 the GR one, and is aligned to 16 bytes. */
13487 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13488 STACK_BOUNDARY / BITS_PER_UNIT);
13489 off -= vr_saved * UNITS_PER_VREG;
13490
13491 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13492 for (i = 0; i < vr_saved; ++i)
13493 {
13494 rtx ptr, mem;
13495
13496 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13497 mem = gen_frame_mem (mode, ptr);
13498 set_mem_alias_set (mem, get_varargs_alias_set ());
13499 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13500 off += UNITS_PER_VREG;
13501 }
13502 }
13503 }
13504
13505 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13506 any complication of having crtl->args.pretend_args_size changed. */
13507 cfun->machine->frame.saved_varargs_size
13508 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13509 STACK_BOUNDARY / BITS_PER_UNIT)
13510 + vr_saved * UNITS_PER_VREG);
13511 }
13512
13513 static void
13514 aarch64_conditional_register_usage (void)
13515 {
13516 int i;
13517 if (!TARGET_FLOAT)
13518 {
13519 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13520 {
13521 fixed_regs[i] = 1;
13522 call_used_regs[i] = 1;
13523 }
13524 }
13525 if (!TARGET_SVE)
13526 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13527 {
13528 fixed_regs[i] = 1;
13529 call_used_regs[i] = 1;
13530 }
13531
13532 /* When tracking speculation, we need a couple of call-clobbered registers
13533 to track the speculation state. It would be nice to just use
13534 IP0 and IP1, but currently there are numerous places that just
13535 assume these registers are free for other uses (eg pointer
13536 authentication). */
13537 if (aarch64_track_speculation)
13538 {
13539 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13540 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13541 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13542 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13543 }
13544 }
13545
13546 /* Walk down the type tree of TYPE counting consecutive base elements.
13547 If *MODEP is VOIDmode, then set it to the first valid floating point
13548 type. If a non-floating point type is found, or if a floating point
13549 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13550 otherwise return the count in the sub-tree. */
13551 static int
13552 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13553 {
13554 machine_mode mode;
13555 HOST_WIDE_INT size;
13556
13557 switch (TREE_CODE (type))
13558 {
13559 case REAL_TYPE:
13560 mode = TYPE_MODE (type);
13561 if (mode != DFmode && mode != SFmode
13562 && mode != TFmode && mode != HFmode)
13563 return -1;
13564
13565 if (*modep == VOIDmode)
13566 *modep = mode;
13567
13568 if (*modep == mode)
13569 return 1;
13570
13571 break;
13572
13573 case COMPLEX_TYPE:
13574 mode = TYPE_MODE (TREE_TYPE (type));
13575 if (mode != DFmode && mode != SFmode
13576 && mode != TFmode && mode != HFmode)
13577 return -1;
13578
13579 if (*modep == VOIDmode)
13580 *modep = mode;
13581
13582 if (*modep == mode)
13583 return 2;
13584
13585 break;
13586
13587 case VECTOR_TYPE:
13588 /* Use V2SImode and V4SImode as representatives of all 64-bit
13589 and 128-bit vector types. */
13590 size = int_size_in_bytes (type);
13591 switch (size)
13592 {
13593 case 8:
13594 mode = V2SImode;
13595 break;
13596 case 16:
13597 mode = V4SImode;
13598 break;
13599 default:
13600 return -1;
13601 }
13602
13603 if (*modep == VOIDmode)
13604 *modep = mode;
13605
13606 /* Vector modes are considered to be opaque: two vectors are
13607 equivalent for the purposes of being homogeneous aggregates
13608 if they are the same size. */
13609 if (*modep == mode)
13610 return 1;
13611
13612 break;
13613
13614 case ARRAY_TYPE:
13615 {
13616 int count;
13617 tree index = TYPE_DOMAIN (type);
13618
13619 /* Can't handle incomplete types nor sizes that are not
13620 fixed. */
13621 if (!COMPLETE_TYPE_P (type)
13622 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13623 return -1;
13624
13625 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13626 if (count == -1
13627 || !index
13628 || !TYPE_MAX_VALUE (index)
13629 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13630 || !TYPE_MIN_VALUE (index)
13631 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13632 || count < 0)
13633 return -1;
13634
13635 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13636 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13637
13638 /* There must be no padding. */
13639 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13640 count * GET_MODE_BITSIZE (*modep)))
13641 return -1;
13642
13643 return count;
13644 }
13645
13646 case RECORD_TYPE:
13647 {
13648 int count = 0;
13649 int sub_count;
13650 tree field;
13651
13652 /* Can't handle incomplete types nor sizes that are not
13653 fixed. */
13654 if (!COMPLETE_TYPE_P (type)
13655 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13656 return -1;
13657
13658 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13659 {
13660 if (TREE_CODE (field) != FIELD_DECL)
13661 continue;
13662
13663 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13664 if (sub_count < 0)
13665 return -1;
13666 count += sub_count;
13667 }
13668
13669 /* There must be no padding. */
13670 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13671 count * GET_MODE_BITSIZE (*modep)))
13672 return -1;
13673
13674 return count;
13675 }
13676
13677 case UNION_TYPE:
13678 case QUAL_UNION_TYPE:
13679 {
13680 /* These aren't very interesting except in a degenerate case. */
13681 int count = 0;
13682 int sub_count;
13683 tree field;
13684
13685 /* Can't handle incomplete types nor sizes that are not
13686 fixed. */
13687 if (!COMPLETE_TYPE_P (type)
13688 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13689 return -1;
13690
13691 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13692 {
13693 if (TREE_CODE (field) != FIELD_DECL)
13694 continue;
13695
13696 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13697 if (sub_count < 0)
13698 return -1;
13699 count = count > sub_count ? count : sub_count;
13700 }
13701
13702 /* There must be no padding. */
13703 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13704 count * GET_MODE_BITSIZE (*modep)))
13705 return -1;
13706
13707 return count;
13708 }
13709
13710 default:
13711 break;
13712 }
13713
13714 return -1;
13715 }
13716
13717 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13718 type as described in AAPCS64 \S 4.1.2.
13719
13720 See the comment above aarch64_composite_type_p for the notes on MODE. */
13721
13722 static bool
13723 aarch64_short_vector_p (const_tree type,
13724 machine_mode mode)
13725 {
13726 poly_int64 size = -1;
13727
13728 if (type && TREE_CODE (type) == VECTOR_TYPE)
13729 size = int_size_in_bytes (type);
13730 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13731 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13732 size = GET_MODE_SIZE (mode);
13733
13734 return known_eq (size, 8) || known_eq (size, 16);
13735 }
13736
13737 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13738 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
13739 array types. The C99 floating-point complex types are also considered
13740 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
13741 types, which are GCC extensions and out of the scope of AAPCS64, are
13742 treated as composite types here as well.
13743
13744 Note that MODE itself is not sufficient in determining whether a type
13745 is such a composite type or not. This is because
13746 stor-layout.c:compute_record_mode may have already changed the MODE
13747 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
13748 structure with only one field may have its MODE set to the mode of the
13749 field. Also an integer mode whose size matches the size of the
13750 RECORD_TYPE type may be used to substitute the original mode
13751 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
13752 solely relied on. */
13753
13754 static bool
13755 aarch64_composite_type_p (const_tree type,
13756 machine_mode mode)
13757 {
13758 if (aarch64_short_vector_p (type, mode))
13759 return false;
13760
13761 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
13762 return true;
13763
13764 if (mode == BLKmode
13765 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
13766 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
13767 return true;
13768
13769 return false;
13770 }
13771
13772 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13773 shall be passed or returned in simd/fp register(s) (providing these
13774 parameter passing registers are available).
13775
13776 Upon successful return, *COUNT returns the number of needed registers,
13777 *BASE_MODE returns the mode of the individual register and when IS_HAF
13778 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
13779 floating-point aggregate or a homogeneous short-vector aggregate. */
13780
13781 static bool
13782 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
13783 const_tree type,
13784 machine_mode *base_mode,
13785 int *count,
13786 bool *is_ha)
13787 {
13788 machine_mode new_mode = VOIDmode;
13789 bool composite_p = aarch64_composite_type_p (type, mode);
13790
13791 if (is_ha != NULL) *is_ha = false;
13792
13793 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
13794 || aarch64_short_vector_p (type, mode))
13795 {
13796 *count = 1;
13797 new_mode = mode;
13798 }
13799 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
13800 {
13801 if (is_ha != NULL) *is_ha = true;
13802 *count = 2;
13803 new_mode = GET_MODE_INNER (mode);
13804 }
13805 else if (type && composite_p)
13806 {
13807 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
13808
13809 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
13810 {
13811 if (is_ha != NULL) *is_ha = true;
13812 *count = ag_count;
13813 }
13814 else
13815 return false;
13816 }
13817 else
13818 return false;
13819
13820 *base_mode = new_mode;
13821 return true;
13822 }
13823
13824 /* Implement TARGET_STRUCT_VALUE_RTX. */
13825
13826 static rtx
13827 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
13828 int incoming ATTRIBUTE_UNUSED)
13829 {
13830 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
13831 }
13832
13833 /* Implements target hook vector_mode_supported_p. */
13834 static bool
13835 aarch64_vector_mode_supported_p (machine_mode mode)
13836 {
13837 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13838 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
13839 }
13840
13841 /* Return appropriate SIMD container
13842 for MODE within a vector of WIDTH bits. */
13843 static machine_mode
13844 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
13845 {
13846 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
13847 switch (mode)
13848 {
13849 case E_DFmode:
13850 return VNx2DFmode;
13851 case E_SFmode:
13852 return VNx4SFmode;
13853 case E_HFmode:
13854 return VNx8HFmode;
13855 case E_DImode:
13856 return VNx2DImode;
13857 case E_SImode:
13858 return VNx4SImode;
13859 case E_HImode:
13860 return VNx8HImode;
13861 case E_QImode:
13862 return VNx16QImode;
13863 default:
13864 return word_mode;
13865 }
13866
13867 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
13868 if (TARGET_SIMD)
13869 {
13870 if (known_eq (width, 128))
13871 switch (mode)
13872 {
13873 case E_DFmode:
13874 return V2DFmode;
13875 case E_SFmode:
13876 return V4SFmode;
13877 case E_HFmode:
13878 return V8HFmode;
13879 case E_SImode:
13880 return V4SImode;
13881 case E_HImode:
13882 return V8HImode;
13883 case E_QImode:
13884 return V16QImode;
13885 case E_DImode:
13886 return V2DImode;
13887 default:
13888 break;
13889 }
13890 else
13891 switch (mode)
13892 {
13893 case E_SFmode:
13894 return V2SFmode;
13895 case E_HFmode:
13896 return V4HFmode;
13897 case E_SImode:
13898 return V2SImode;
13899 case E_HImode:
13900 return V4HImode;
13901 case E_QImode:
13902 return V8QImode;
13903 default:
13904 break;
13905 }
13906 }
13907 return word_mode;
13908 }
13909
13910 /* Return 128-bit container as the preferred SIMD mode for MODE. */
13911 static machine_mode
13912 aarch64_preferred_simd_mode (scalar_mode mode)
13913 {
13914 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
13915 return aarch64_simd_container_mode (mode, bits);
13916 }
13917
13918 /* Return a list of possible vector sizes for the vectorizer
13919 to iterate over. */
13920 static void
13921 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
13922 {
13923 if (TARGET_SVE)
13924 sizes->safe_push (BYTES_PER_SVE_VECTOR);
13925 sizes->safe_push (16);
13926 sizes->safe_push (8);
13927 }
13928
13929 /* Implement TARGET_MANGLE_TYPE. */
13930
13931 static const char *
13932 aarch64_mangle_type (const_tree type)
13933 {
13934 /* The AArch64 ABI documents say that "__va_list" has to be
13935 mangled as if it is in the "std" namespace. */
13936 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
13937 return "St9__va_list";
13938
13939 /* Half-precision float. */
13940 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
13941 return "Dh";
13942
13943 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
13944 builtin types. */
13945 if (TYPE_NAME (type) != NULL)
13946 return aarch64_mangle_builtin_type (type);
13947
13948 /* Use the default mangling. */
13949 return NULL;
13950 }
13951
13952 /* Find the first rtx_insn before insn that will generate an assembly
13953 instruction. */
13954
13955 static rtx_insn *
13956 aarch64_prev_real_insn (rtx_insn *insn)
13957 {
13958 if (!insn)
13959 return NULL;
13960
13961 do
13962 {
13963 insn = prev_real_insn (insn);
13964 }
13965 while (insn && recog_memoized (insn) < 0);
13966
13967 return insn;
13968 }
13969
13970 static bool
13971 is_madd_op (enum attr_type t1)
13972 {
13973 unsigned int i;
13974 /* A number of these may be AArch32 only. */
13975 enum attr_type mlatypes[] = {
13976 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
13977 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
13978 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
13979 };
13980
13981 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13982 {
13983 if (t1 == mlatypes[i])
13984 return true;
13985 }
13986
13987 return false;
13988 }
13989
13990 /* Check if there is a register dependency between a load and the insn
13991 for which we hold recog_data. */
13992
13993 static bool
13994 dep_between_memop_and_curr (rtx memop)
13995 {
13996 rtx load_reg;
13997 int opno;
13998
13999 gcc_assert (GET_CODE (memop) == SET);
14000
14001 if (!REG_P (SET_DEST (memop)))
14002 return false;
14003
14004 load_reg = SET_DEST (memop);
14005 for (opno = 1; opno < recog_data.n_operands; opno++)
14006 {
14007 rtx operand = recog_data.operand[opno];
14008 if (REG_P (operand)
14009 && reg_overlap_mentioned_p (load_reg, operand))
14010 return true;
14011
14012 }
14013 return false;
14014 }
14015
14016
14017 /* When working around the Cortex-A53 erratum 835769,
14018 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14019 instruction and has a preceding memory instruction such that a NOP
14020 should be inserted between them. */
14021
14022 bool
14023 aarch64_madd_needs_nop (rtx_insn* insn)
14024 {
14025 enum attr_type attr_type;
14026 rtx_insn *prev;
14027 rtx body;
14028
14029 if (!TARGET_FIX_ERR_A53_835769)
14030 return false;
14031
14032 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14033 return false;
14034
14035 attr_type = get_attr_type (insn);
14036 if (!is_madd_op (attr_type))
14037 return false;
14038
14039 prev = aarch64_prev_real_insn (insn);
14040 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14041 Restore recog state to INSN to avoid state corruption. */
14042 extract_constrain_insn_cached (insn);
14043
14044 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14045 return false;
14046
14047 body = single_set (prev);
14048
14049 /* If the previous insn is a memory op and there is no dependency between
14050 it and the DImode madd, emit a NOP between them. If body is NULL then we
14051 have a complex memory operation, probably a load/store pair.
14052 Be conservative for now and emit a NOP. */
14053 if (GET_MODE (recog_data.operand[0]) == DImode
14054 && (!body || !dep_between_memop_and_curr (body)))
14055 return true;
14056
14057 return false;
14058
14059 }
14060
14061
14062 /* Implement FINAL_PRESCAN_INSN. */
14063
14064 void
14065 aarch64_final_prescan_insn (rtx_insn *insn)
14066 {
14067 if (aarch64_madd_needs_nop (insn))
14068 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14069 }
14070
14071
14072 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14073 instruction. */
14074
14075 bool
14076 aarch64_sve_index_immediate_p (rtx base_or_step)
14077 {
14078 return (CONST_INT_P (base_or_step)
14079 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14080 }
14081
14082 /* Return true if X is a valid immediate for the SVE ADD and SUB
14083 instructions. Negate X first if NEGATE_P is true. */
14084
14085 bool
14086 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14087 {
14088 rtx elt;
14089
14090 if (!const_vec_duplicate_p (x, &elt)
14091 || !CONST_INT_P (elt))
14092 return false;
14093
14094 HOST_WIDE_INT val = INTVAL (elt);
14095 if (negate_p)
14096 val = -val;
14097 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14098
14099 if (val & 0xff)
14100 return IN_RANGE (val, 0, 0xff);
14101 return IN_RANGE (val, 0, 0xff00);
14102 }
14103
14104 /* Return true if X is a valid immediate operand for an SVE logical
14105 instruction such as AND. */
14106
14107 bool
14108 aarch64_sve_bitmask_immediate_p (rtx x)
14109 {
14110 rtx elt;
14111
14112 return (const_vec_duplicate_p (x, &elt)
14113 && CONST_INT_P (elt)
14114 && aarch64_bitmask_imm (INTVAL (elt),
14115 GET_MODE_INNER (GET_MODE (x))));
14116 }
14117
14118 /* Return true if X is a valid immediate for the SVE DUP and CPY
14119 instructions. */
14120
14121 bool
14122 aarch64_sve_dup_immediate_p (rtx x)
14123 {
14124 rtx elt;
14125
14126 if (!const_vec_duplicate_p (x, &elt)
14127 || !CONST_INT_P (elt))
14128 return false;
14129
14130 HOST_WIDE_INT val = INTVAL (elt);
14131 if (val & 0xff)
14132 return IN_RANGE (val, -0x80, 0x7f);
14133 return IN_RANGE (val, -0x8000, 0x7f00);
14134 }
14135
14136 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14137 SIGNED_P says whether the operand is signed rather than unsigned. */
14138
14139 bool
14140 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14141 {
14142 rtx elt;
14143
14144 return (const_vec_duplicate_p (x, &elt)
14145 && CONST_INT_P (elt)
14146 && (signed_p
14147 ? IN_RANGE (INTVAL (elt), -16, 15)
14148 : IN_RANGE (INTVAL (elt), 0, 127)));
14149 }
14150
14151 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14152 instruction. Negate X first if NEGATE_P is true. */
14153
14154 bool
14155 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14156 {
14157 rtx elt;
14158 REAL_VALUE_TYPE r;
14159
14160 if (!const_vec_duplicate_p (x, &elt)
14161 || GET_CODE (elt) != CONST_DOUBLE)
14162 return false;
14163
14164 r = *CONST_DOUBLE_REAL_VALUE (elt);
14165
14166 if (negate_p)
14167 r = real_value_negate (&r);
14168
14169 if (real_equal (&r, &dconst1))
14170 return true;
14171 if (real_equal (&r, &dconsthalf))
14172 return true;
14173 return false;
14174 }
14175
14176 /* Return true if X is a valid immediate operand for an SVE FMUL
14177 instruction. */
14178
14179 bool
14180 aarch64_sve_float_mul_immediate_p (rtx x)
14181 {
14182 rtx elt;
14183
14184 /* GCC will never generate a multiply with an immediate of 2, so there is no
14185 point testing for it (even though it is a valid constant). */
14186 return (const_vec_duplicate_p (x, &elt)
14187 && GET_CODE (elt) == CONST_DOUBLE
14188 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14189 }
14190
14191 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14192 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14193 is nonnull, use it to describe valid immediates. */
14194 static bool
14195 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14196 simd_immediate_info *info,
14197 enum simd_immediate_check which,
14198 simd_immediate_info::insn_type insn)
14199 {
14200 /* Try a 4-byte immediate with LSL. */
14201 for (unsigned int shift = 0; shift < 32; shift += 8)
14202 if ((val32 & (0xff << shift)) == val32)
14203 {
14204 if (info)
14205 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14206 simd_immediate_info::LSL, shift);
14207 return true;
14208 }
14209
14210 /* Try a 2-byte immediate with LSL. */
14211 unsigned int imm16 = val32 & 0xffff;
14212 if (imm16 == (val32 >> 16))
14213 for (unsigned int shift = 0; shift < 16; shift += 8)
14214 if ((imm16 & (0xff << shift)) == imm16)
14215 {
14216 if (info)
14217 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14218 simd_immediate_info::LSL, shift);
14219 return true;
14220 }
14221
14222 /* Try a 4-byte immediate with MSL, except for cases that MVN
14223 can handle. */
14224 if (which == AARCH64_CHECK_MOV)
14225 for (unsigned int shift = 8; shift < 24; shift += 8)
14226 {
14227 unsigned int low = (1 << shift) - 1;
14228 if (((val32 & (0xff << shift)) | low) == val32)
14229 {
14230 if (info)
14231 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14232 simd_immediate_info::MSL, shift);
14233 return true;
14234 }
14235 }
14236
14237 return false;
14238 }
14239
14240 /* Return true if replicating VAL64 is a valid immediate for the
14241 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14242 use it to describe valid immediates. */
14243 static bool
14244 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14245 simd_immediate_info *info,
14246 enum simd_immediate_check which)
14247 {
14248 unsigned int val32 = val64 & 0xffffffff;
14249 unsigned int val16 = val64 & 0xffff;
14250 unsigned int val8 = val64 & 0xff;
14251
14252 if (val32 == (val64 >> 32))
14253 {
14254 if ((which & AARCH64_CHECK_ORR) != 0
14255 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14256 simd_immediate_info::MOV))
14257 return true;
14258
14259 if ((which & AARCH64_CHECK_BIC) != 0
14260 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14261 simd_immediate_info::MVN))
14262 return true;
14263
14264 /* Try using a replicated byte. */
14265 if (which == AARCH64_CHECK_MOV
14266 && val16 == (val32 >> 16)
14267 && val8 == (val16 >> 8))
14268 {
14269 if (info)
14270 *info = simd_immediate_info (QImode, val8);
14271 return true;
14272 }
14273 }
14274
14275 /* Try using a bit-to-bytemask. */
14276 if (which == AARCH64_CHECK_MOV)
14277 {
14278 unsigned int i;
14279 for (i = 0; i < 64; i += 8)
14280 {
14281 unsigned char byte = (val64 >> i) & 0xff;
14282 if (byte != 0 && byte != 0xff)
14283 break;
14284 }
14285 if (i == 64)
14286 {
14287 if (info)
14288 *info = simd_immediate_info (DImode, val64);
14289 return true;
14290 }
14291 }
14292 return false;
14293 }
14294
14295 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14296 instruction. If INFO is nonnull, use it to describe valid immediates. */
14297
14298 static bool
14299 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14300 simd_immediate_info *info)
14301 {
14302 scalar_int_mode mode = DImode;
14303 unsigned int val32 = val64 & 0xffffffff;
14304 if (val32 == (val64 >> 32))
14305 {
14306 mode = SImode;
14307 unsigned int val16 = val32 & 0xffff;
14308 if (val16 == (val32 >> 16))
14309 {
14310 mode = HImode;
14311 unsigned int val8 = val16 & 0xff;
14312 if (val8 == (val16 >> 8))
14313 mode = QImode;
14314 }
14315 }
14316 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14317 if (IN_RANGE (val, -0x80, 0x7f))
14318 {
14319 /* DUP with no shift. */
14320 if (info)
14321 *info = simd_immediate_info (mode, val);
14322 return true;
14323 }
14324 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14325 {
14326 /* DUP with LSL #8. */
14327 if (info)
14328 *info = simd_immediate_info (mode, val);
14329 return true;
14330 }
14331 if (aarch64_bitmask_imm (val64, mode))
14332 {
14333 /* DUPM. */
14334 if (info)
14335 *info = simd_immediate_info (mode, val);
14336 return true;
14337 }
14338 return false;
14339 }
14340
14341 /* Return true if OP is a valid SIMD immediate for the operation
14342 described by WHICH. If INFO is nonnull, use it to describe valid
14343 immediates. */
14344 bool
14345 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14346 enum simd_immediate_check which)
14347 {
14348 machine_mode mode = GET_MODE (op);
14349 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14350 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14351 return false;
14352
14353 scalar_mode elt_mode = GET_MODE_INNER (mode);
14354 rtx base, step;
14355 unsigned int n_elts;
14356 if (GET_CODE (op) == CONST_VECTOR
14357 && CONST_VECTOR_DUPLICATE_P (op))
14358 n_elts = CONST_VECTOR_NPATTERNS (op);
14359 else if ((vec_flags & VEC_SVE_DATA)
14360 && const_vec_series_p (op, &base, &step))
14361 {
14362 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14363 if (!aarch64_sve_index_immediate_p (base)
14364 || !aarch64_sve_index_immediate_p (step))
14365 return false;
14366
14367 if (info)
14368 *info = simd_immediate_info (elt_mode, base, step);
14369 return true;
14370 }
14371 else if (GET_CODE (op) == CONST_VECTOR
14372 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14373 /* N_ELTS set above. */;
14374 else
14375 return false;
14376
14377 /* Handle PFALSE and PTRUE. */
14378 if (vec_flags & VEC_SVE_PRED)
14379 return (op == CONST0_RTX (mode)
14380 || op == CONSTM1_RTX (mode));
14381
14382 scalar_float_mode elt_float_mode;
14383 if (n_elts == 1
14384 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14385 {
14386 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14387 if (aarch64_float_const_zero_rtx_p (elt)
14388 || aarch64_float_const_representable_p (elt))
14389 {
14390 if (info)
14391 *info = simd_immediate_info (elt_float_mode, elt);
14392 return true;
14393 }
14394 }
14395
14396 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14397 if (elt_size > 8)
14398 return false;
14399
14400 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14401
14402 /* Expand the vector constant out into a byte vector, with the least
14403 significant byte of the register first. */
14404 auto_vec<unsigned char, 16> bytes;
14405 bytes.reserve (n_elts * elt_size);
14406 for (unsigned int i = 0; i < n_elts; i++)
14407 {
14408 /* The vector is provided in gcc endian-neutral fashion.
14409 For aarch64_be Advanced SIMD, it must be laid out in the vector
14410 register in reverse order. */
14411 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14412 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14413
14414 if (elt_mode != elt_int_mode)
14415 elt = gen_lowpart (elt_int_mode, elt);
14416
14417 if (!CONST_INT_P (elt))
14418 return false;
14419
14420 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14421 for (unsigned int byte = 0; byte < elt_size; byte++)
14422 {
14423 bytes.quick_push (elt_val & 0xff);
14424 elt_val >>= BITS_PER_UNIT;
14425 }
14426 }
14427
14428 /* The immediate must repeat every eight bytes. */
14429 unsigned int nbytes = bytes.length ();
14430 for (unsigned i = 8; i < nbytes; ++i)
14431 if (bytes[i] != bytes[i - 8])
14432 return false;
14433
14434 /* Get the repeating 8-byte value as an integer. No endian correction
14435 is needed here because bytes is already in lsb-first order. */
14436 unsigned HOST_WIDE_INT val64 = 0;
14437 for (unsigned int i = 0; i < 8; i++)
14438 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14439 << (i * BITS_PER_UNIT));
14440
14441 if (vec_flags & VEC_SVE_DATA)
14442 return aarch64_sve_valid_immediate (val64, info);
14443 else
14444 return aarch64_advsimd_valid_immediate (val64, info, which);
14445 }
14446
14447 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14448 has a step in the range of INDEX. Return the index expression if so,
14449 otherwise return null. */
14450 rtx
14451 aarch64_check_zero_based_sve_index_immediate (rtx x)
14452 {
14453 rtx base, step;
14454 if (const_vec_series_p (x, &base, &step)
14455 && base == const0_rtx
14456 && aarch64_sve_index_immediate_p (step))
14457 return step;
14458 return NULL_RTX;
14459 }
14460
14461 /* Check of immediate shift constants are within range. */
14462 bool
14463 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14464 {
14465 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14466 if (left)
14467 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14468 else
14469 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14470 }
14471
14472 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14473 operation of width WIDTH at bit position POS. */
14474
14475 rtx
14476 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14477 {
14478 gcc_assert (CONST_INT_P (width));
14479 gcc_assert (CONST_INT_P (pos));
14480
14481 unsigned HOST_WIDE_INT mask
14482 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14483 return GEN_INT (mask << UINTVAL (pos));
14484 }
14485
14486 bool
14487 aarch64_mov_operand_p (rtx x, machine_mode mode)
14488 {
14489 if (GET_CODE (x) == HIGH
14490 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14491 return true;
14492
14493 if (CONST_INT_P (x))
14494 return true;
14495
14496 if (VECTOR_MODE_P (GET_MODE (x)))
14497 return aarch64_simd_valid_immediate (x, NULL);
14498
14499 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14500 return true;
14501
14502 if (aarch64_sve_cnt_immediate_p (x))
14503 return true;
14504
14505 return aarch64_classify_symbolic_expression (x)
14506 == SYMBOL_TINY_ABSOLUTE;
14507 }
14508
14509 /* Return a const_int vector of VAL. */
14510 rtx
14511 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14512 {
14513 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14514 return gen_const_vec_duplicate (mode, c);
14515 }
14516
14517 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14518
14519 bool
14520 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14521 {
14522 machine_mode vmode;
14523
14524 vmode = aarch64_simd_container_mode (mode, 64);
14525 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14526 return aarch64_simd_valid_immediate (op_v, NULL);
14527 }
14528
14529 /* Construct and return a PARALLEL RTX vector with elements numbering the
14530 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14531 the vector - from the perspective of the architecture. This does not
14532 line up with GCC's perspective on lane numbers, so we end up with
14533 different masks depending on our target endian-ness. The diagram
14534 below may help. We must draw the distinction when building masks
14535 which select one half of the vector. An instruction selecting
14536 architectural low-lanes for a big-endian target, must be described using
14537 a mask selecting GCC high-lanes.
14538
14539 Big-Endian Little-Endian
14540
14541 GCC 0 1 2 3 3 2 1 0
14542 | x | x | x | x | | x | x | x | x |
14543 Architecture 3 2 1 0 3 2 1 0
14544
14545 Low Mask: { 2, 3 } { 0, 1 }
14546 High Mask: { 0, 1 } { 2, 3 }
14547
14548 MODE Is the mode of the vector and NUNITS is the number of units in it. */
14549
14550 rtx
14551 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14552 {
14553 rtvec v = rtvec_alloc (nunits / 2);
14554 int high_base = nunits / 2;
14555 int low_base = 0;
14556 int base;
14557 rtx t1;
14558 int i;
14559
14560 if (BYTES_BIG_ENDIAN)
14561 base = high ? low_base : high_base;
14562 else
14563 base = high ? high_base : low_base;
14564
14565 for (i = 0; i < nunits / 2; i++)
14566 RTVEC_ELT (v, i) = GEN_INT (base + i);
14567
14568 t1 = gen_rtx_PARALLEL (mode, v);
14569 return t1;
14570 }
14571
14572 /* Check OP for validity as a PARALLEL RTX vector with elements
14573 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14574 from the perspective of the architecture. See the diagram above
14575 aarch64_simd_vect_par_cnst_half for more details. */
14576
14577 bool
14578 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14579 bool high)
14580 {
14581 int nelts;
14582 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14583 return false;
14584
14585 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14586 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14587 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14588 int i = 0;
14589
14590 if (count_op != count_ideal)
14591 return false;
14592
14593 for (i = 0; i < count_ideal; i++)
14594 {
14595 rtx elt_op = XVECEXP (op, 0, i);
14596 rtx elt_ideal = XVECEXP (ideal, 0, i);
14597
14598 if (!CONST_INT_P (elt_op)
14599 || INTVAL (elt_ideal) != INTVAL (elt_op))
14600 return false;
14601 }
14602 return true;
14603 }
14604
14605 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14606 HIGH (exclusive). */
14607 void
14608 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14609 const_tree exp)
14610 {
14611 HOST_WIDE_INT lane;
14612 gcc_assert (CONST_INT_P (operand));
14613 lane = INTVAL (operand);
14614
14615 if (lane < low || lane >= high)
14616 {
14617 if (exp)
14618 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14619 else
14620 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14621 }
14622 }
14623
14624 /* Peform endian correction on lane number N, which indexes a vector
14625 of mode MODE, and return the result as an SImode rtx. */
14626
14627 rtx
14628 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14629 {
14630 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14631 }
14632
14633 /* Return TRUE if OP is a valid vector addressing mode. */
14634
14635 bool
14636 aarch64_simd_mem_operand_p (rtx op)
14637 {
14638 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14639 || REG_P (XEXP (op, 0)));
14640 }
14641
14642 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14643
14644 bool
14645 aarch64_sve_ld1r_operand_p (rtx op)
14646 {
14647 struct aarch64_address_info addr;
14648 scalar_mode mode;
14649
14650 return (MEM_P (op)
14651 && is_a <scalar_mode> (GET_MODE (op), &mode)
14652 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14653 && addr.type == ADDRESS_REG_IMM
14654 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14655 }
14656
14657 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14658 The conditions for STR are the same. */
14659 bool
14660 aarch64_sve_ldr_operand_p (rtx op)
14661 {
14662 struct aarch64_address_info addr;
14663
14664 return (MEM_P (op)
14665 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14666 false, ADDR_QUERY_ANY)
14667 && addr.type == ADDRESS_REG_IMM);
14668 }
14669
14670 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14671 We need to be able to access the individual pieces, so the range
14672 is different from LD[234] and ST[234]. */
14673 bool
14674 aarch64_sve_struct_memory_operand_p (rtx op)
14675 {
14676 if (!MEM_P (op))
14677 return false;
14678
14679 machine_mode mode = GET_MODE (op);
14680 struct aarch64_address_info addr;
14681 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14682 ADDR_QUERY_ANY)
14683 || addr.type != ADDRESS_REG_IMM)
14684 return false;
14685
14686 poly_int64 first = addr.const_offset;
14687 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14688 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14689 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14690 }
14691
14692 /* Emit a register copy from operand to operand, taking care not to
14693 early-clobber source registers in the process.
14694
14695 COUNT is the number of components into which the copy needs to be
14696 decomposed. */
14697 void
14698 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14699 unsigned int count)
14700 {
14701 unsigned int i;
14702 int rdest = REGNO (operands[0]);
14703 int rsrc = REGNO (operands[1]);
14704
14705 if (!reg_overlap_mentioned_p (operands[0], operands[1])
14706 || rdest < rsrc)
14707 for (i = 0; i < count; i++)
14708 emit_move_insn (gen_rtx_REG (mode, rdest + i),
14709 gen_rtx_REG (mode, rsrc + i));
14710 else
14711 for (i = 0; i < count; i++)
14712 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14713 gen_rtx_REG (mode, rsrc + count - i - 1));
14714 }
14715
14716 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14717 one of VSTRUCT modes: OI, CI, or XI. */
14718 int
14719 aarch64_simd_attr_length_rglist (machine_mode mode)
14720 {
14721 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14722 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14723 }
14724
14725 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14726 alignment of a vector to 128 bits. SVE predicates have an alignment of
14727 16 bits. */
14728 static HOST_WIDE_INT
14729 aarch64_simd_vector_alignment (const_tree type)
14730 {
14731 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14732 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14733 be set for non-predicate vectors of booleans. Modes are the most
14734 direct way we have of identifying real SVE predicate types. */
14735 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
14736 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
14737 return MIN (align, 128);
14738 }
14739
14740 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
14741 static poly_uint64
14742 aarch64_vectorize_preferred_vector_alignment (const_tree type)
14743 {
14744 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
14745 {
14746 /* If the length of the vector is fixed, try to align to that length,
14747 otherwise don't try to align at all. */
14748 HOST_WIDE_INT result;
14749 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
14750 result = TYPE_ALIGN (TREE_TYPE (type));
14751 return result;
14752 }
14753 return TYPE_ALIGN (type);
14754 }
14755
14756 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
14757 static bool
14758 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
14759 {
14760 if (is_packed)
14761 return false;
14762
14763 /* For fixed-length vectors, check that the vectorizer will aim for
14764 full-vector alignment. This isn't true for generic GCC vectors
14765 that are wider than the ABI maximum of 128 bits. */
14766 poly_uint64 preferred_alignment =
14767 aarch64_vectorize_preferred_vector_alignment (type);
14768 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
14769 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
14770 preferred_alignment))
14771 return false;
14772
14773 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
14774 return true;
14775 }
14776
14777 /* Return true if the vector misalignment factor is supported by the
14778 target. */
14779 static bool
14780 aarch64_builtin_support_vector_misalignment (machine_mode mode,
14781 const_tree type, int misalignment,
14782 bool is_packed)
14783 {
14784 if (TARGET_SIMD && STRICT_ALIGNMENT)
14785 {
14786 /* Return if movmisalign pattern is not supported for this mode. */
14787 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
14788 return false;
14789
14790 /* Misalignment factor is unknown at compile time. */
14791 if (misalignment == -1)
14792 return false;
14793 }
14794 return default_builtin_support_vector_misalignment (mode, type, misalignment,
14795 is_packed);
14796 }
14797
14798 /* If VALS is a vector constant that can be loaded into a register
14799 using DUP, generate instructions to do so and return an RTX to
14800 assign to the register. Otherwise return NULL_RTX. */
14801 static rtx
14802 aarch64_simd_dup_constant (rtx vals)
14803 {
14804 machine_mode mode = GET_MODE (vals);
14805 machine_mode inner_mode = GET_MODE_INNER (mode);
14806 rtx x;
14807
14808 if (!const_vec_duplicate_p (vals, &x))
14809 return NULL_RTX;
14810
14811 /* We can load this constant by using DUP and a constant in a
14812 single ARM register. This will be cheaper than a vector
14813 load. */
14814 x = copy_to_mode_reg (inner_mode, x);
14815 return gen_vec_duplicate (mode, x);
14816 }
14817
14818
14819 /* Generate code to load VALS, which is a PARALLEL containing only
14820 constants (for vec_init) or CONST_VECTOR, efficiently into a
14821 register. Returns an RTX to copy into the register, or NULL_RTX
14822 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
14823 static rtx
14824 aarch64_simd_make_constant (rtx vals)
14825 {
14826 machine_mode mode = GET_MODE (vals);
14827 rtx const_dup;
14828 rtx const_vec = NULL_RTX;
14829 int n_const = 0;
14830 int i;
14831
14832 if (GET_CODE (vals) == CONST_VECTOR)
14833 const_vec = vals;
14834 else if (GET_CODE (vals) == PARALLEL)
14835 {
14836 /* A CONST_VECTOR must contain only CONST_INTs and
14837 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
14838 Only store valid constants in a CONST_VECTOR. */
14839 int n_elts = XVECLEN (vals, 0);
14840 for (i = 0; i < n_elts; ++i)
14841 {
14842 rtx x = XVECEXP (vals, 0, i);
14843 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14844 n_const++;
14845 }
14846 if (n_const == n_elts)
14847 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
14848 }
14849 else
14850 gcc_unreachable ();
14851
14852 if (const_vec != NULL_RTX
14853 && aarch64_simd_valid_immediate (const_vec, NULL))
14854 /* Load using MOVI/MVNI. */
14855 return const_vec;
14856 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
14857 /* Loaded using DUP. */
14858 return const_dup;
14859 else if (const_vec != NULL_RTX)
14860 /* Load from constant pool. We cannot take advantage of single-cycle
14861 LD1 because we need a PC-relative addressing mode. */
14862 return const_vec;
14863 else
14864 /* A PARALLEL containing something not valid inside CONST_VECTOR.
14865 We cannot construct an initializer. */
14866 return NULL_RTX;
14867 }
14868
14869 /* Expand a vector initialisation sequence, such that TARGET is
14870 initialised to contain VALS. */
14871
14872 void
14873 aarch64_expand_vector_init (rtx target, rtx vals)
14874 {
14875 machine_mode mode = GET_MODE (target);
14876 scalar_mode inner_mode = GET_MODE_INNER (mode);
14877 /* The number of vector elements. */
14878 int n_elts = XVECLEN (vals, 0);
14879 /* The number of vector elements which are not constant. */
14880 int n_var = 0;
14881 rtx any_const = NULL_RTX;
14882 /* The first element of vals. */
14883 rtx v0 = XVECEXP (vals, 0, 0);
14884 bool all_same = true;
14885
14886 /* Count the number of variable elements to initialise. */
14887 for (int i = 0; i < n_elts; ++i)
14888 {
14889 rtx x = XVECEXP (vals, 0, i);
14890 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
14891 ++n_var;
14892 else
14893 any_const = x;
14894
14895 all_same &= rtx_equal_p (x, v0);
14896 }
14897
14898 /* No variable elements, hand off to aarch64_simd_make_constant which knows
14899 how best to handle this. */
14900 if (n_var == 0)
14901 {
14902 rtx constant = aarch64_simd_make_constant (vals);
14903 if (constant != NULL_RTX)
14904 {
14905 emit_move_insn (target, constant);
14906 return;
14907 }
14908 }
14909
14910 /* Splat a single non-constant element if we can. */
14911 if (all_same)
14912 {
14913 rtx x = copy_to_mode_reg (inner_mode, v0);
14914 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14915 return;
14916 }
14917
14918 enum insn_code icode = optab_handler (vec_set_optab, mode);
14919 gcc_assert (icode != CODE_FOR_nothing);
14920
14921 /* If there are only variable elements, try to optimize
14922 the insertion using dup for the most common element
14923 followed by insertions. */
14924
14925 /* The algorithm will fill matches[*][0] with the earliest matching element,
14926 and matches[X][1] with the count of duplicate elements (if X is the
14927 earliest element which has duplicates). */
14928
14929 if (n_var == n_elts && n_elts <= 16)
14930 {
14931 int matches[16][2] = {0};
14932 for (int i = 0; i < n_elts; i++)
14933 {
14934 for (int j = 0; j <= i; j++)
14935 {
14936 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
14937 {
14938 matches[i][0] = j;
14939 matches[j][1]++;
14940 break;
14941 }
14942 }
14943 }
14944 int maxelement = 0;
14945 int maxv = 0;
14946 for (int i = 0; i < n_elts; i++)
14947 if (matches[i][1] > maxv)
14948 {
14949 maxelement = i;
14950 maxv = matches[i][1];
14951 }
14952
14953 /* Create a duplicate of the most common element, unless all elements
14954 are equally useless to us, in which case just immediately set the
14955 vector register using the first element. */
14956
14957 if (maxv == 1)
14958 {
14959 /* For vectors of two 64-bit elements, we can do even better. */
14960 if (n_elts == 2
14961 && (inner_mode == E_DImode
14962 || inner_mode == E_DFmode))
14963
14964 {
14965 rtx x0 = XVECEXP (vals, 0, 0);
14966 rtx x1 = XVECEXP (vals, 0, 1);
14967 /* Combine can pick up this case, but handling it directly
14968 here leaves clearer RTL.
14969
14970 This is load_pair_lanes<mode>, and also gives us a clean-up
14971 for store_pair_lanes<mode>. */
14972 if (memory_operand (x0, inner_mode)
14973 && memory_operand (x1, inner_mode)
14974 && !STRICT_ALIGNMENT
14975 && rtx_equal_p (XEXP (x1, 0),
14976 plus_constant (Pmode,
14977 XEXP (x0, 0),
14978 GET_MODE_SIZE (inner_mode))))
14979 {
14980 rtx t;
14981 if (inner_mode == DFmode)
14982 t = gen_load_pair_lanesdf (target, x0, x1);
14983 else
14984 t = gen_load_pair_lanesdi (target, x0, x1);
14985 emit_insn (t);
14986 return;
14987 }
14988 }
14989 /* The subreg-move sequence below will move into lane zero of the
14990 vector register. For big-endian we want that position to hold
14991 the last element of VALS. */
14992 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14993 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14994 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14995 }
14996 else
14997 {
14998 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14999 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15000 }
15001
15002 /* Insert the rest. */
15003 for (int i = 0; i < n_elts; i++)
15004 {
15005 rtx x = XVECEXP (vals, 0, i);
15006 if (matches[i][0] == maxelement)
15007 continue;
15008 x = copy_to_mode_reg (inner_mode, x);
15009 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15010 }
15011 return;
15012 }
15013
15014 /* Initialise a vector which is part-variable. We want to first try
15015 to build those lanes which are constant in the most efficient way we
15016 can. */
15017 if (n_var != n_elts)
15018 {
15019 rtx copy = copy_rtx (vals);
15020
15021 /* Load constant part of vector. We really don't care what goes into the
15022 parts we will overwrite, but we're more likely to be able to load the
15023 constant efficiently if it has fewer, larger, repeating parts
15024 (see aarch64_simd_valid_immediate). */
15025 for (int i = 0; i < n_elts; i++)
15026 {
15027 rtx x = XVECEXP (vals, 0, i);
15028 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15029 continue;
15030 rtx subst = any_const;
15031 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15032 {
15033 /* Look in the copied vector, as more elements are const. */
15034 rtx test = XVECEXP (copy, 0, i ^ bit);
15035 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15036 {
15037 subst = test;
15038 break;
15039 }
15040 }
15041 XVECEXP (copy, 0, i) = subst;
15042 }
15043 aarch64_expand_vector_init (target, copy);
15044 }
15045
15046 /* Insert the variable lanes directly. */
15047 for (int i = 0; i < n_elts; i++)
15048 {
15049 rtx x = XVECEXP (vals, 0, i);
15050 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15051 continue;
15052 x = copy_to_mode_reg (inner_mode, x);
15053 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15054 }
15055 }
15056
15057 static unsigned HOST_WIDE_INT
15058 aarch64_shift_truncation_mask (machine_mode mode)
15059 {
15060 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15061 return 0;
15062 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15063 }
15064
15065 /* Select a format to encode pointers in exception handling data. */
15066 int
15067 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15068 {
15069 int type;
15070 switch (aarch64_cmodel)
15071 {
15072 case AARCH64_CMODEL_TINY:
15073 case AARCH64_CMODEL_TINY_PIC:
15074 case AARCH64_CMODEL_SMALL:
15075 case AARCH64_CMODEL_SMALL_PIC:
15076 case AARCH64_CMODEL_SMALL_SPIC:
15077 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15078 for everything. */
15079 type = DW_EH_PE_sdata4;
15080 break;
15081 default:
15082 /* No assumptions here. 8-byte relocs required. */
15083 type = DW_EH_PE_sdata8;
15084 break;
15085 }
15086 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15087 }
15088
15089 /* The last .arch and .tune assembly strings that we printed. */
15090 static std::string aarch64_last_printed_arch_string;
15091 static std::string aarch64_last_printed_tune_string;
15092
15093 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15094 by the function fndecl. */
15095
15096 void
15097 aarch64_declare_function_name (FILE *stream, const char* name,
15098 tree fndecl)
15099 {
15100 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15101
15102 struct cl_target_option *targ_options;
15103 if (target_parts)
15104 targ_options = TREE_TARGET_OPTION (target_parts);
15105 else
15106 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15107 gcc_assert (targ_options);
15108
15109 const struct processor *this_arch
15110 = aarch64_get_arch (targ_options->x_explicit_arch);
15111
15112 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
15113 std::string extension
15114 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15115 this_arch->flags);
15116 /* Only update the assembler .arch string if it is distinct from the last
15117 such string we printed. */
15118 std::string to_print = this_arch->name + extension;
15119 if (to_print != aarch64_last_printed_arch_string)
15120 {
15121 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15122 aarch64_last_printed_arch_string = to_print;
15123 }
15124
15125 /* Print the cpu name we're tuning for in the comments, might be
15126 useful to readers of the generated asm. Do it only when it changes
15127 from function to function and verbose assembly is requested. */
15128 const struct processor *this_tune
15129 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15130
15131 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15132 {
15133 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15134 this_tune->name);
15135 aarch64_last_printed_tune_string = this_tune->name;
15136 }
15137
15138 /* Don't forget the type directive for ELF. */
15139 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15140 ASM_OUTPUT_LABEL (stream, name);
15141 }
15142
15143 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15144
15145 static void
15146 aarch64_start_file (void)
15147 {
15148 struct cl_target_option *default_options
15149 = TREE_TARGET_OPTION (target_option_default_node);
15150
15151 const struct processor *default_arch
15152 = aarch64_get_arch (default_options->x_explicit_arch);
15153 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
15154 std::string extension
15155 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15156 default_arch->flags);
15157
15158 aarch64_last_printed_arch_string = default_arch->name + extension;
15159 aarch64_last_printed_tune_string = "";
15160 asm_fprintf (asm_out_file, "\t.arch %s\n",
15161 aarch64_last_printed_arch_string.c_str ());
15162
15163 default_file_start ();
15164 }
15165
15166 /* Emit load exclusive. */
15167
15168 static void
15169 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15170 rtx mem, rtx model_rtx)
15171 {
15172 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15173 }
15174
15175 /* Emit store exclusive. */
15176
15177 static void
15178 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15179 rtx rval, rtx mem, rtx model_rtx)
15180 {
15181 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
15182 }
15183
15184 /* Mark the previous jump instruction as unlikely. */
15185
15186 static void
15187 aarch64_emit_unlikely_jump (rtx insn)
15188 {
15189 rtx_insn *jump = emit_jump_insn (insn);
15190 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15191 }
15192
15193 /* Expand a compare and swap pattern. */
15194
15195 void
15196 aarch64_expand_compare_and_swap (rtx operands[])
15197 {
15198 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15199 machine_mode mode, r_mode;
15200
15201 bval = operands[0];
15202 rval = operands[1];
15203 mem = operands[2];
15204 oldval = operands[3];
15205 newval = operands[4];
15206 is_weak = operands[5];
15207 mod_s = operands[6];
15208 mod_f = operands[7];
15209 mode = GET_MODE (mem);
15210
15211 /* Normally the succ memory model must be stronger than fail, but in the
15212 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15213 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15214 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15215 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15216 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15217
15218 r_mode = mode;
15219 if (mode == QImode || mode == HImode)
15220 {
15221 r_mode = SImode;
15222 rval = gen_reg_rtx (r_mode);
15223 }
15224
15225 if (TARGET_LSE)
15226 {
15227 /* The CAS insn requires oldval and rval overlap, but we need to
15228 have a copy of oldval saved across the operation to tell if
15229 the operation is successful. */
15230 if (reg_overlap_mentioned_p (rval, oldval))
15231 rval = copy_to_mode_reg (r_mode, oldval);
15232 else
15233 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15234
15235 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15236 newval, mod_s));
15237 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15238 }
15239 else
15240 {
15241 /* The oldval predicate varies by mode. Test it and force to reg. */
15242 insn_code code = code_for_aarch64_compare_and_swap (mode);
15243 if (!insn_data[code].operand[2].predicate (oldval, mode))
15244 oldval = force_reg (mode, oldval);
15245
15246 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15247 is_weak, mod_s, mod_f));
15248 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15249 }
15250
15251 if (r_mode != mode)
15252 rval = gen_lowpart (mode, rval);
15253 emit_move_insn (operands[1], rval);
15254
15255 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15256 emit_insn (gen_rtx_SET (bval, x));
15257 }
15258
15259 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15260 sequence implementing an atomic operation. */
15261
15262 static void
15263 aarch64_emit_post_barrier (enum memmodel model)
15264 {
15265 const enum memmodel base_model = memmodel_base (model);
15266
15267 if (is_mm_sync (model)
15268 && (base_model == MEMMODEL_ACQUIRE
15269 || base_model == MEMMODEL_ACQ_REL
15270 || base_model == MEMMODEL_SEQ_CST))
15271 {
15272 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15273 }
15274 }
15275
15276 /* Split a compare and swap pattern. */
15277
15278 void
15279 aarch64_split_compare_and_swap (rtx operands[])
15280 {
15281 rtx rval, mem, oldval, newval, scratch;
15282 machine_mode mode;
15283 bool is_weak;
15284 rtx_code_label *label1, *label2;
15285 rtx x, cond;
15286 enum memmodel model;
15287 rtx model_rtx;
15288
15289 rval = operands[0];
15290 mem = operands[1];
15291 oldval = operands[2];
15292 newval = operands[3];
15293 is_weak = (operands[4] != const0_rtx);
15294 model_rtx = operands[5];
15295 scratch = operands[7];
15296 mode = GET_MODE (mem);
15297 model = memmodel_from_int (INTVAL (model_rtx));
15298
15299 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15300 loop:
15301 .label1:
15302 LD[A]XR rval, [mem]
15303 CBNZ rval, .label2
15304 ST[L]XR scratch, newval, [mem]
15305 CBNZ scratch, .label1
15306 .label2:
15307 CMP rval, 0. */
15308 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15309
15310 label1 = NULL;
15311 if (!is_weak)
15312 {
15313 label1 = gen_label_rtx ();
15314 emit_label (label1);
15315 }
15316 label2 = gen_label_rtx ();
15317
15318 /* The initial load can be relaxed for a __sync operation since a final
15319 barrier will be emitted to stop code hoisting. */
15320 if (is_mm_sync (model))
15321 aarch64_emit_load_exclusive (mode, rval, mem,
15322 GEN_INT (MEMMODEL_RELAXED));
15323 else
15324 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15325
15326 if (strong_zero_p)
15327 {
15328 if (aarch64_track_speculation)
15329 {
15330 /* Emit an explicit compare instruction, so that we can correctly
15331 track the condition codes. */
15332 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15333 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15334 }
15335 else
15336 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15337
15338 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15339 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15340 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15341 }
15342 else
15343 {
15344 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15345 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15346 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15347 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15348 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15349 }
15350
15351 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15352
15353 if (!is_weak)
15354 {
15355 if (aarch64_track_speculation)
15356 {
15357 /* Emit an explicit compare instruction, so that we can correctly
15358 track the condition codes. */
15359 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15360 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15361 }
15362 else
15363 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15364
15365 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15366 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15367 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15368 }
15369 else
15370 {
15371 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15372 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15373 emit_insn (gen_rtx_SET (cond, x));
15374 }
15375
15376 emit_label (label2);
15377 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
15378 to set the condition flags. If this is not used it will be removed by
15379 later passes. */
15380 if (strong_zero_p)
15381 {
15382 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15383 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15384 emit_insn (gen_rtx_SET (cond, x));
15385 }
15386 /* Emit any final barrier needed for a __sync operation. */
15387 if (is_mm_sync (model))
15388 aarch64_emit_post_barrier (model);
15389 }
15390
15391 /* Split an atomic operation. */
15392
15393 void
15394 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
15395 rtx value, rtx model_rtx, rtx cond)
15396 {
15397 machine_mode mode = GET_MODE (mem);
15398 machine_mode wmode = (mode == DImode ? DImode : SImode);
15399 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
15400 const bool is_sync = is_mm_sync (model);
15401 rtx_code_label *label;
15402 rtx x;
15403
15404 /* Split the atomic operation into a sequence. */
15405 label = gen_label_rtx ();
15406 emit_label (label);
15407
15408 if (new_out)
15409 new_out = gen_lowpart (wmode, new_out);
15410 if (old_out)
15411 old_out = gen_lowpart (wmode, old_out);
15412 else
15413 old_out = new_out;
15414 value = simplify_gen_subreg (wmode, value, mode, 0);
15415
15416 /* The initial load can be relaxed for a __sync operation since a final
15417 barrier will be emitted to stop code hoisting. */
15418 if (is_sync)
15419 aarch64_emit_load_exclusive (mode, old_out, mem,
15420 GEN_INT (MEMMODEL_RELAXED));
15421 else
15422 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
15423
15424 switch (code)
15425 {
15426 case SET:
15427 new_out = value;
15428 break;
15429
15430 case NOT:
15431 x = gen_rtx_AND (wmode, old_out, value);
15432 emit_insn (gen_rtx_SET (new_out, x));
15433 x = gen_rtx_NOT (wmode, new_out);
15434 emit_insn (gen_rtx_SET (new_out, x));
15435 break;
15436
15437 case MINUS:
15438 if (CONST_INT_P (value))
15439 {
15440 value = GEN_INT (-INTVAL (value));
15441 code = PLUS;
15442 }
15443 /* Fall through. */
15444
15445 default:
15446 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
15447 emit_insn (gen_rtx_SET (new_out, x));
15448 break;
15449 }
15450
15451 aarch64_emit_store_exclusive (mode, cond, mem,
15452 gen_lowpart (mode, new_out), model_rtx);
15453
15454 if (aarch64_track_speculation)
15455 {
15456 /* Emit an explicit compare instruction, so that we can correctly
15457 track the condition codes. */
15458 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
15459 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15460 }
15461 else
15462 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15463
15464 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15465 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
15466 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15467
15468 /* Emit any final barrier needed for a __sync operation. */
15469 if (is_sync)
15470 aarch64_emit_post_barrier (model);
15471 }
15472
15473 static void
15474 aarch64_init_libfuncs (void)
15475 {
15476 /* Half-precision float operations. The compiler handles all operations
15477 with NULL libfuncs by converting to SFmode. */
15478
15479 /* Conversions. */
15480 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
15481 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
15482
15483 /* Arithmetic. */
15484 set_optab_libfunc (add_optab, HFmode, NULL);
15485 set_optab_libfunc (sdiv_optab, HFmode, NULL);
15486 set_optab_libfunc (smul_optab, HFmode, NULL);
15487 set_optab_libfunc (neg_optab, HFmode, NULL);
15488 set_optab_libfunc (sub_optab, HFmode, NULL);
15489
15490 /* Comparisons. */
15491 set_optab_libfunc (eq_optab, HFmode, NULL);
15492 set_optab_libfunc (ne_optab, HFmode, NULL);
15493 set_optab_libfunc (lt_optab, HFmode, NULL);
15494 set_optab_libfunc (le_optab, HFmode, NULL);
15495 set_optab_libfunc (ge_optab, HFmode, NULL);
15496 set_optab_libfunc (gt_optab, HFmode, NULL);
15497 set_optab_libfunc (unord_optab, HFmode, NULL);
15498 }
15499
15500 /* Target hook for c_mode_for_suffix. */
15501 static machine_mode
15502 aarch64_c_mode_for_suffix (char suffix)
15503 {
15504 if (suffix == 'q')
15505 return TFmode;
15506
15507 return VOIDmode;
15508 }
15509
15510 /* We can only represent floating point constants which will fit in
15511 "quarter-precision" values. These values are characterised by
15512 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
15513 by:
15514
15515 (-1)^s * (n/16) * 2^r
15516
15517 Where:
15518 's' is the sign bit.
15519 'n' is an integer in the range 16 <= n <= 31.
15520 'r' is an integer in the range -3 <= r <= 4. */
15521
15522 /* Return true iff X can be represented by a quarter-precision
15523 floating point immediate operand X. Note, we cannot represent 0.0. */
15524 bool
15525 aarch64_float_const_representable_p (rtx x)
15526 {
15527 /* This represents our current view of how many bits
15528 make up the mantissa. */
15529 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
15530 int exponent;
15531 unsigned HOST_WIDE_INT mantissa, mask;
15532 REAL_VALUE_TYPE r, m;
15533 bool fail;
15534
15535 if (!CONST_DOUBLE_P (x))
15536 return false;
15537
15538 if (GET_MODE (x) == VOIDmode
15539 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
15540 return false;
15541
15542 r = *CONST_DOUBLE_REAL_VALUE (x);
15543
15544 /* We cannot represent infinities, NaNs or +/-zero. We won't
15545 know if we have +zero until we analyse the mantissa, but we
15546 can reject the other invalid values. */
15547 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
15548 || REAL_VALUE_MINUS_ZERO (r))
15549 return false;
15550
15551 /* Extract exponent. */
15552 r = real_value_abs (&r);
15553 exponent = REAL_EXP (&r);
15554
15555 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
15556 highest (sign) bit, with a fixed binary point at bit point_pos.
15557 m1 holds the low part of the mantissa, m2 the high part.
15558 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
15559 bits for the mantissa, this can fail (low bits will be lost). */
15560 real_ldexp (&m, &r, point_pos - exponent);
15561 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
15562
15563 /* If the low part of the mantissa has bits set we cannot represent
15564 the value. */
15565 if (w.ulow () != 0)
15566 return false;
15567 /* We have rejected the lower HOST_WIDE_INT, so update our
15568 understanding of how many bits lie in the mantissa and
15569 look only at the high HOST_WIDE_INT. */
15570 mantissa = w.elt (1);
15571 point_pos -= HOST_BITS_PER_WIDE_INT;
15572
15573 /* We can only represent values with a mantissa of the form 1.xxxx. */
15574 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
15575 if ((mantissa & mask) != 0)
15576 return false;
15577
15578 /* Having filtered unrepresentable values, we may now remove all
15579 but the highest 5 bits. */
15580 mantissa >>= point_pos - 5;
15581
15582 /* We cannot represent the value 0.0, so reject it. This is handled
15583 elsewhere. */
15584 if (mantissa == 0)
15585 return false;
15586
15587 /* Then, as bit 4 is always set, we can mask it off, leaving
15588 the mantissa in the range [0, 15]. */
15589 mantissa &= ~(1 << 4);
15590 gcc_assert (mantissa <= 15);
15591
15592 /* GCC internally does not use IEEE754-like encoding (where normalized
15593 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
15594 Our mantissa values are shifted 4 places to the left relative to
15595 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
15596 by 5 places to correct for GCC's representation. */
15597 exponent = 5 - exponent;
15598
15599 return (exponent >= 0 && exponent <= 7);
15600 }
15601
15602 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
15603 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
15604 output MOVI/MVNI, ORR or BIC immediate. */
15605 char*
15606 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
15607 enum simd_immediate_check which)
15608 {
15609 bool is_valid;
15610 static char templ[40];
15611 const char *mnemonic;
15612 const char *shift_op;
15613 unsigned int lane_count = 0;
15614 char element_char;
15615
15616 struct simd_immediate_info info;
15617
15618 /* This will return true to show const_vector is legal for use as either
15619 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
15620 It will also update INFO to show how the immediate should be generated.
15621 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
15622 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
15623 gcc_assert (is_valid);
15624
15625 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15626 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
15627
15628 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15629 {
15630 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15631 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15632 move immediate path. */
15633 if (aarch64_float_const_zero_rtx_p (info.value))
15634 info.value = GEN_INT (0);
15635 else
15636 {
15637 const unsigned int buf_size = 20;
15638 char float_buf[buf_size] = {'\0'};
15639 real_to_decimal_for_mode (float_buf,
15640 CONST_DOUBLE_REAL_VALUE (info.value),
15641 buf_size, buf_size, 1, info.elt_mode);
15642
15643 if (lane_count == 1)
15644 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15645 else
15646 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15647 lane_count, element_char, float_buf);
15648 return templ;
15649 }
15650 }
15651
15652 gcc_assert (CONST_INT_P (info.value));
15653
15654 if (which == AARCH64_CHECK_MOV)
15655 {
15656 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15657 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15658 if (lane_count == 1)
15659 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15660 mnemonic, UINTVAL (info.value));
15661 else if (info.shift)
15662 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15663 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15664 element_char, UINTVAL (info.value), shift_op, info.shift);
15665 else
15666 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15667 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15668 element_char, UINTVAL (info.value));
15669 }
15670 else
15671 {
15672 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15673 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15674 if (info.shift)
15675 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15676 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15677 element_char, UINTVAL (info.value), "lsl", info.shift);
15678 else
15679 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15680 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15681 element_char, UINTVAL (info.value));
15682 }
15683 return templ;
15684 }
15685
15686 char*
15687 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15688 {
15689
15690 /* If a floating point number was passed and we desire to use it in an
15691 integer mode do the conversion to integer. */
15692 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15693 {
15694 unsigned HOST_WIDE_INT ival;
15695 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15696 gcc_unreachable ();
15697 immediate = gen_int_mode (ival, mode);
15698 }
15699
15700 machine_mode vmode;
15701 /* use a 64 bit mode for everything except for DI/DF mode, where we use
15702 a 128 bit vector mode. */
15703 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15704
15705 vmode = aarch64_simd_container_mode (mode, width);
15706 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15707 return aarch64_output_simd_mov_immediate (v_op, width);
15708 }
15709
15710 /* Return the output string to use for moving immediate CONST_VECTOR
15711 into an SVE register. */
15712
15713 char *
15714 aarch64_output_sve_mov_immediate (rtx const_vector)
15715 {
15716 static char templ[40];
15717 struct simd_immediate_info info;
15718 char element_char;
15719
15720 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15721 gcc_assert (is_valid);
15722
15723 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15724
15725 if (info.step)
15726 {
15727 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15728 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15729 element_char, INTVAL (info.value), INTVAL (info.step));
15730 return templ;
15731 }
15732
15733 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15734 {
15735 if (aarch64_float_const_zero_rtx_p (info.value))
15736 info.value = GEN_INT (0);
15737 else
15738 {
15739 const int buf_size = 20;
15740 char float_buf[buf_size] = {};
15741 real_to_decimal_for_mode (float_buf,
15742 CONST_DOUBLE_REAL_VALUE (info.value),
15743 buf_size, buf_size, 1, info.elt_mode);
15744
15745 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15746 element_char, float_buf);
15747 return templ;
15748 }
15749 }
15750
15751 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15752 element_char, INTVAL (info.value));
15753 return templ;
15754 }
15755
15756 /* Return the asm format for a PTRUE instruction whose destination has
15757 mode MODE. SUFFIX is the element size suffix. */
15758
15759 char *
15760 aarch64_output_ptrue (machine_mode mode, char suffix)
15761 {
15762 unsigned int nunits;
15763 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15764 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15765 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15766 else
15767 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15768 return buf;
15769 }
15770
15771 /* Split operands into moves from op[1] + op[2] into op[0]. */
15772
15773 void
15774 aarch64_split_combinev16qi (rtx operands[3])
15775 {
15776 unsigned int dest = REGNO (operands[0]);
15777 unsigned int src1 = REGNO (operands[1]);
15778 unsigned int src2 = REGNO (operands[2]);
15779 machine_mode halfmode = GET_MODE (operands[1]);
15780 unsigned int halfregs = REG_NREGS (operands[1]);
15781 rtx destlo, desthi;
15782
15783 gcc_assert (halfmode == V16QImode);
15784
15785 if (src1 == dest && src2 == dest + halfregs)
15786 {
15787 /* No-op move. Can't split to nothing; emit something. */
15788 emit_note (NOTE_INSN_DELETED);
15789 return;
15790 }
15791
15792 /* Preserve register attributes for variable tracking. */
15793 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15794 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15795 GET_MODE_SIZE (halfmode));
15796
15797 /* Special case of reversed high/low parts. */
15798 if (reg_overlap_mentioned_p (operands[2], destlo)
15799 && reg_overlap_mentioned_p (operands[1], desthi))
15800 {
15801 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15802 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15803 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15804 }
15805 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15806 {
15807 /* Try to avoid unnecessary moves if part of the result
15808 is in the right place already. */
15809 if (src1 != dest)
15810 emit_move_insn (destlo, operands[1]);
15811 if (src2 != dest + halfregs)
15812 emit_move_insn (desthi, operands[2]);
15813 }
15814 else
15815 {
15816 if (src2 != dest + halfregs)
15817 emit_move_insn (desthi, operands[2]);
15818 if (src1 != dest)
15819 emit_move_insn (destlo, operands[1]);
15820 }
15821 }
15822
15823 /* vec_perm support. */
15824
15825 struct expand_vec_perm_d
15826 {
15827 rtx target, op0, op1;
15828 vec_perm_indices perm;
15829 machine_mode vmode;
15830 unsigned int vec_flags;
15831 bool one_vector_p;
15832 bool testing_p;
15833 };
15834
15835 /* Generate a variable permutation. */
15836
15837 static void
15838 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15839 {
15840 machine_mode vmode = GET_MODE (target);
15841 bool one_vector_p = rtx_equal_p (op0, op1);
15842
15843 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15844 gcc_checking_assert (GET_MODE (op0) == vmode);
15845 gcc_checking_assert (GET_MODE (op1) == vmode);
15846 gcc_checking_assert (GET_MODE (sel) == vmode);
15847 gcc_checking_assert (TARGET_SIMD);
15848
15849 if (one_vector_p)
15850 {
15851 if (vmode == V8QImode)
15852 {
15853 /* Expand the argument to a V16QI mode by duplicating it. */
15854 rtx pair = gen_reg_rtx (V16QImode);
15855 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15856 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15857 }
15858 else
15859 {
15860 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15861 }
15862 }
15863 else
15864 {
15865 rtx pair;
15866
15867 if (vmode == V8QImode)
15868 {
15869 pair = gen_reg_rtx (V16QImode);
15870 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15871 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15872 }
15873 else
15874 {
15875 pair = gen_reg_rtx (OImode);
15876 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15877 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15878 }
15879 }
15880 }
15881
15882 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15883 NELT is the number of elements in the vector. */
15884
15885 void
15886 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15887 unsigned int nelt)
15888 {
15889 machine_mode vmode = GET_MODE (target);
15890 bool one_vector_p = rtx_equal_p (op0, op1);
15891 rtx mask;
15892
15893 /* The TBL instruction does not use a modulo index, so we must take care
15894 of that ourselves. */
15895 mask = aarch64_simd_gen_const_vector_dup (vmode,
15896 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15897 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15898
15899 /* For big-endian, we also need to reverse the index within the vector
15900 (but not which vector). */
15901 if (BYTES_BIG_ENDIAN)
15902 {
15903 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15904 if (!one_vector_p)
15905 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15906 sel = expand_simple_binop (vmode, XOR, sel, mask,
15907 NULL, 0, OPTAB_LIB_WIDEN);
15908 }
15909 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15910 }
15911
15912 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15913
15914 static void
15915 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15916 {
15917 emit_insn (gen_rtx_SET (target,
15918 gen_rtx_UNSPEC (GET_MODE (target),
15919 gen_rtvec (2, op0, op1), code)));
15920 }
15921
15922 /* Expand an SVE vec_perm with the given operands. */
15923
15924 void
15925 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15926 {
15927 machine_mode data_mode = GET_MODE (target);
15928 machine_mode sel_mode = GET_MODE (sel);
15929 /* Enforced by the pattern condition. */
15930 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15931
15932 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15933 size of the two value vectors, i.e. the upper bits of the indices
15934 are effectively ignored. SVE TBL instead produces 0 for any
15935 out-of-range indices, so we need to modulo all the vec_perm indices
15936 to ensure they are all in range. */
15937 rtx sel_reg = force_reg (sel_mode, sel);
15938
15939 /* Check if the sel only references the first values vector. */
15940 if (GET_CODE (sel) == CONST_VECTOR
15941 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15942 {
15943 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15944 return;
15945 }
15946
15947 /* Check if the two values vectors are the same. */
15948 if (rtx_equal_p (op0, op1))
15949 {
15950 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15951 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15952 NULL, 0, OPTAB_DIRECT);
15953 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15954 return;
15955 }
15956
15957 /* Run TBL on for each value vector and combine the results. */
15958
15959 rtx res0 = gen_reg_rtx (data_mode);
15960 rtx res1 = gen_reg_rtx (data_mode);
15961 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15962 if (GET_CODE (sel) != CONST_VECTOR
15963 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15964 {
15965 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15966 2 * nunits - 1);
15967 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15968 NULL, 0, OPTAB_DIRECT);
15969 }
15970 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15971 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15972 NULL, 0, OPTAB_DIRECT);
15973 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15974 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15975 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15976 else
15977 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15978 }
15979
15980 /* Recognize patterns suitable for the TRN instructions. */
15981 static bool
15982 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15983 {
15984 HOST_WIDE_INT odd;
15985 poly_uint64 nelt = d->perm.length ();
15986 rtx out, in0, in1, x;
15987 machine_mode vmode = d->vmode;
15988
15989 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15990 return false;
15991
15992 /* Note that these are little-endian tests.
15993 We correct for big-endian later. */
15994 if (!d->perm[0].is_constant (&odd)
15995 || (odd != 0 && odd != 1)
15996 || !d->perm.series_p (0, 2, odd, 2)
15997 || !d->perm.series_p (1, 2, nelt + odd, 2))
15998 return false;
15999
16000 /* Success! */
16001 if (d->testing_p)
16002 return true;
16003
16004 in0 = d->op0;
16005 in1 = d->op1;
16006 /* We don't need a big-endian lane correction for SVE; see the comment
16007 at the head of aarch64-sve.md for details. */
16008 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16009 {
16010 x = in0, in0 = in1, in1 = x;
16011 odd = !odd;
16012 }
16013 out = d->target;
16014
16015 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16016 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16017 return true;
16018 }
16019
16020 /* Recognize patterns suitable for the UZP instructions. */
16021 static bool
16022 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16023 {
16024 HOST_WIDE_INT odd;
16025 rtx out, in0, in1, x;
16026 machine_mode vmode = d->vmode;
16027
16028 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16029 return false;
16030
16031 /* Note that these are little-endian tests.
16032 We correct for big-endian later. */
16033 if (!d->perm[0].is_constant (&odd)
16034 || (odd != 0 && odd != 1)
16035 || !d->perm.series_p (0, 1, odd, 2))
16036 return false;
16037
16038 /* Success! */
16039 if (d->testing_p)
16040 return true;
16041
16042 in0 = d->op0;
16043 in1 = d->op1;
16044 /* We don't need a big-endian lane correction for SVE; see the comment
16045 at the head of aarch64-sve.md for details. */
16046 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16047 {
16048 x = in0, in0 = in1, in1 = x;
16049 odd = !odd;
16050 }
16051 out = d->target;
16052
16053 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16054 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16055 return true;
16056 }
16057
16058 /* Recognize patterns suitable for the ZIP instructions. */
16059 static bool
16060 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16061 {
16062 unsigned int high;
16063 poly_uint64 nelt = d->perm.length ();
16064 rtx out, in0, in1, x;
16065 machine_mode vmode = d->vmode;
16066
16067 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16068 return false;
16069
16070 /* Note that these are little-endian tests.
16071 We correct for big-endian later. */
16072 poly_uint64 first = d->perm[0];
16073 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16074 || !d->perm.series_p (0, 2, first, 1)
16075 || !d->perm.series_p (1, 2, first + nelt, 1))
16076 return false;
16077 high = maybe_ne (first, 0U);
16078
16079 /* Success! */
16080 if (d->testing_p)
16081 return true;
16082
16083 in0 = d->op0;
16084 in1 = d->op1;
16085 /* We don't need a big-endian lane correction for SVE; see the comment
16086 at the head of aarch64-sve.md for details. */
16087 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16088 {
16089 x = in0, in0 = in1, in1 = x;
16090 high = !high;
16091 }
16092 out = d->target;
16093
16094 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16095 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16096 return true;
16097 }
16098
16099 /* Recognize patterns for the EXT insn. */
16100
16101 static bool
16102 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16103 {
16104 HOST_WIDE_INT location;
16105 rtx offset;
16106
16107 /* The first element always refers to the first vector.
16108 Check if the extracted indices are increasing by one. */
16109 if (d->vec_flags == VEC_SVE_PRED
16110 || !d->perm[0].is_constant (&location)
16111 || !d->perm.series_p (0, 1, location, 1))
16112 return false;
16113
16114 /* Success! */
16115 if (d->testing_p)
16116 return true;
16117
16118 /* The case where (location == 0) is a no-op for both big- and little-endian,
16119 and is removed by the mid-end at optimization levels -O1 and higher.
16120
16121 We don't need a big-endian lane correction for SVE; see the comment
16122 at the head of aarch64-sve.md for details. */
16123 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16124 {
16125 /* After setup, we want the high elements of the first vector (stored
16126 at the LSB end of the register), and the low elements of the second
16127 vector (stored at the MSB end of the register). So swap. */
16128 std::swap (d->op0, d->op1);
16129 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16130 to_constant () is safe since this is restricted to Advanced SIMD
16131 vectors. */
16132 location = d->perm.length ().to_constant () - location;
16133 }
16134
16135 offset = GEN_INT (location);
16136 emit_set_insn (d->target,
16137 gen_rtx_UNSPEC (d->vmode,
16138 gen_rtvec (3, d->op0, d->op1, offset),
16139 UNSPEC_EXT));
16140 return true;
16141 }
16142
16143 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16144 within each 64-bit, 32-bit or 16-bit granule. */
16145
16146 static bool
16147 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16148 {
16149 HOST_WIDE_INT diff;
16150 unsigned int i, size, unspec;
16151 machine_mode pred_mode;
16152
16153 if (d->vec_flags == VEC_SVE_PRED
16154 || !d->one_vector_p
16155 || !d->perm[0].is_constant (&diff))
16156 return false;
16157
16158 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16159 if (size == 8)
16160 {
16161 unspec = UNSPEC_REV64;
16162 pred_mode = VNx2BImode;
16163 }
16164 else if (size == 4)
16165 {
16166 unspec = UNSPEC_REV32;
16167 pred_mode = VNx4BImode;
16168 }
16169 else if (size == 2)
16170 {
16171 unspec = UNSPEC_REV16;
16172 pred_mode = VNx8BImode;
16173 }
16174 else
16175 return false;
16176
16177 unsigned int step = diff + 1;
16178 for (i = 0; i < step; ++i)
16179 if (!d->perm.series_p (i, step, diff - i, step))
16180 return false;
16181
16182 /* Success! */
16183 if (d->testing_p)
16184 return true;
16185
16186 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16187 if (d->vec_flags == VEC_SVE_DATA)
16188 {
16189 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16190 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16191 UNSPEC_MERGE_PTRUE);
16192 }
16193 emit_set_insn (d->target, src);
16194 return true;
16195 }
16196
16197 /* Recognize patterns for the REV insn, which reverses elements within
16198 a full vector. */
16199
16200 static bool
16201 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16202 {
16203 poly_uint64 nelt = d->perm.length ();
16204
16205 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16206 return false;
16207
16208 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16209 return false;
16210
16211 /* Success! */
16212 if (d->testing_p)
16213 return true;
16214
16215 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16216 emit_set_insn (d->target, src);
16217 return true;
16218 }
16219
16220 static bool
16221 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16222 {
16223 rtx out = d->target;
16224 rtx in0;
16225 HOST_WIDE_INT elt;
16226 machine_mode vmode = d->vmode;
16227 rtx lane;
16228
16229 if (d->vec_flags == VEC_SVE_PRED
16230 || d->perm.encoding ().encoded_nelts () != 1
16231 || !d->perm[0].is_constant (&elt))
16232 return false;
16233
16234 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16235 return false;
16236
16237 /* Success! */
16238 if (d->testing_p)
16239 return true;
16240
16241 /* The generic preparation in aarch64_expand_vec_perm_const_1
16242 swaps the operand order and the permute indices if it finds
16243 d->perm[0] to be in the second operand. Thus, we can always
16244 use d->op0 and need not do any extra arithmetic to get the
16245 correct lane number. */
16246 in0 = d->op0;
16247 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16248
16249 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16250 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16251 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16252 return true;
16253 }
16254
16255 static bool
16256 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16257 {
16258 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16259 machine_mode vmode = d->vmode;
16260
16261 /* Make sure that the indices are constant. */
16262 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16263 for (unsigned int i = 0; i < encoded_nelts; ++i)
16264 if (!d->perm[i].is_constant ())
16265 return false;
16266
16267 if (d->testing_p)
16268 return true;
16269
16270 /* Generic code will try constant permutation twice. Once with the
16271 original mode and again with the elements lowered to QImode.
16272 So wait and don't do the selector expansion ourselves. */
16273 if (vmode != V8QImode && vmode != V16QImode)
16274 return false;
16275
16276 /* to_constant is safe since this routine is specific to Advanced SIMD
16277 vectors. */
16278 unsigned int nelt = d->perm.length ().to_constant ();
16279 for (unsigned int i = 0; i < nelt; ++i)
16280 /* If big-endian and two vectors we end up with a weird mixed-endian
16281 mode on NEON. Reverse the index within each word but not the word
16282 itself. to_constant is safe because we checked is_constant above. */
16283 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16284 ? d->perm[i].to_constant () ^ (nelt - 1)
16285 : d->perm[i].to_constant ());
16286
16287 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16288 sel = force_reg (vmode, sel);
16289
16290 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16291 return true;
16292 }
16293
16294 /* Try to implement D using an SVE TBL instruction. */
16295
16296 static bool
16297 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16298 {
16299 unsigned HOST_WIDE_INT nelt;
16300
16301 /* Permuting two variable-length vectors could overflow the
16302 index range. */
16303 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16304 return false;
16305
16306 if (d->testing_p)
16307 return true;
16308
16309 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16310 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16311 if (d->one_vector_p)
16312 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16313 else
16314 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16315 return true;
16316 }
16317
16318 static bool
16319 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16320 {
16321 /* The pattern matching functions above are written to look for a small
16322 number to begin the sequence (0, 1, N/2). If we begin with an index
16323 from the second operand, we can swap the operands. */
16324 poly_int64 nelt = d->perm.length ();
16325 if (known_ge (d->perm[0], nelt))
16326 {
16327 d->perm.rotate_inputs (1);
16328 std::swap (d->op0, d->op1);
16329 }
16330
16331 if ((d->vec_flags == VEC_ADVSIMD
16332 || d->vec_flags == VEC_SVE_DATA
16333 || d->vec_flags == VEC_SVE_PRED)
16334 && known_gt (nelt, 1))
16335 {
16336 if (aarch64_evpc_rev_local (d))
16337 return true;
16338 else if (aarch64_evpc_rev_global (d))
16339 return true;
16340 else if (aarch64_evpc_ext (d))
16341 return true;
16342 else if (aarch64_evpc_dup (d))
16343 return true;
16344 else if (aarch64_evpc_zip (d))
16345 return true;
16346 else if (aarch64_evpc_uzp (d))
16347 return true;
16348 else if (aarch64_evpc_trn (d))
16349 return true;
16350 if (d->vec_flags == VEC_SVE_DATA)
16351 return aarch64_evpc_sve_tbl (d);
16352 else if (d->vec_flags == VEC_ADVSIMD)
16353 return aarch64_evpc_tbl (d);
16354 }
16355 return false;
16356 }
16357
16358 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16359
16360 static bool
16361 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16362 rtx op1, const vec_perm_indices &sel)
16363 {
16364 struct expand_vec_perm_d d;
16365
16366 /* Check whether the mask can be applied to a single vector. */
16367 if (sel.ninputs () == 1
16368 || (op0 && rtx_equal_p (op0, op1)))
16369 d.one_vector_p = true;
16370 else if (sel.all_from_input_p (0))
16371 {
16372 d.one_vector_p = true;
16373 op1 = op0;
16374 }
16375 else if (sel.all_from_input_p (1))
16376 {
16377 d.one_vector_p = true;
16378 op0 = op1;
16379 }
16380 else
16381 d.one_vector_p = false;
16382
16383 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16384 sel.nelts_per_input ());
16385 d.vmode = vmode;
16386 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16387 d.target = target;
16388 d.op0 = op0;
16389 d.op1 = op1;
16390 d.testing_p = !target;
16391
16392 if (!d.testing_p)
16393 return aarch64_expand_vec_perm_const_1 (&d);
16394
16395 rtx_insn *last = get_last_insn ();
16396 bool ret = aarch64_expand_vec_perm_const_1 (&d);
16397 gcc_assert (last == get_last_insn ());
16398
16399 return ret;
16400 }
16401
16402 /* Generate a byte permute mask for a register of mode MODE,
16403 which has NUNITS units. */
16404
16405 rtx
16406 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
16407 {
16408 /* We have to reverse each vector because we dont have
16409 a permuted load that can reverse-load according to ABI rules. */
16410 rtx mask;
16411 rtvec v = rtvec_alloc (16);
16412 unsigned int i, j;
16413 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
16414
16415 gcc_assert (BYTES_BIG_ENDIAN);
16416 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
16417
16418 for (i = 0; i < nunits; i++)
16419 for (j = 0; j < usize; j++)
16420 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
16421 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
16422 return force_reg (V16QImode, mask);
16423 }
16424
16425 /* Return true if X is a valid second operand for the SVE instruction
16426 that implements integer comparison OP_CODE. */
16427
16428 static bool
16429 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
16430 {
16431 if (register_operand (x, VOIDmode))
16432 return true;
16433
16434 switch (op_code)
16435 {
16436 case LTU:
16437 case LEU:
16438 case GEU:
16439 case GTU:
16440 return aarch64_sve_cmp_immediate_p (x, false);
16441 case LT:
16442 case LE:
16443 case GE:
16444 case GT:
16445 case NE:
16446 case EQ:
16447 return aarch64_sve_cmp_immediate_p (x, true);
16448 default:
16449 gcc_unreachable ();
16450 }
16451 }
16452
16453 /* Use predicated SVE instructions to implement the equivalent of:
16454
16455 (set TARGET OP)
16456
16457 given that PTRUE is an all-true predicate of the appropriate mode. */
16458
16459 static void
16460 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
16461 {
16462 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16463 gen_rtvec (2, ptrue, op),
16464 UNSPEC_MERGE_PTRUE);
16465 rtx_insn *insn = emit_set_insn (target, unspec);
16466 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16467 }
16468
16469 /* Likewise, but also clobber the condition codes. */
16470
16471 static void
16472 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
16473 {
16474 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16475 gen_rtvec (2, ptrue, op),
16476 UNSPEC_MERGE_PTRUE);
16477 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
16478 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16479 }
16480
16481 /* Return the UNSPEC_COND_* code for comparison CODE. */
16482
16483 static unsigned int
16484 aarch64_unspec_cond_code (rtx_code code)
16485 {
16486 switch (code)
16487 {
16488 case NE:
16489 return UNSPEC_COND_NE;
16490 case EQ:
16491 return UNSPEC_COND_EQ;
16492 case LT:
16493 return UNSPEC_COND_LT;
16494 case GT:
16495 return UNSPEC_COND_GT;
16496 case LE:
16497 return UNSPEC_COND_LE;
16498 case GE:
16499 return UNSPEC_COND_GE;
16500 default:
16501 gcc_unreachable ();
16502 }
16503 }
16504
16505 /* Emit:
16506
16507 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
16508
16509 where <X> is the operation associated with comparison CODE. This form
16510 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
16511 semantics, such as when PRED might not be all-true and when comparing
16512 inactive lanes could have side effects. */
16513
16514 static void
16515 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
16516 rtx pred, rtx op0, rtx op1)
16517 {
16518 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
16519 gen_rtvec (3, pred, op0, op1),
16520 aarch64_unspec_cond_code (code));
16521 emit_set_insn (target, unspec);
16522 }
16523
16524 /* Expand an SVE integer comparison using the SVE equivalent of:
16525
16526 (set TARGET (CODE OP0 OP1)). */
16527
16528 void
16529 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
16530 {
16531 machine_mode pred_mode = GET_MODE (target);
16532 machine_mode data_mode = GET_MODE (op0);
16533
16534 if (!aarch64_sve_cmp_operand_p (code, op1))
16535 op1 = force_reg (data_mode, op1);
16536
16537 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16538 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16539 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
16540 }
16541
16542 /* Emit the SVE equivalent of:
16543
16544 (set TMP1 (CODE1 OP0 OP1))
16545 (set TMP2 (CODE2 OP0 OP1))
16546 (set TARGET (ior:PRED_MODE TMP1 TMP2))
16547
16548 PTRUE is an all-true predicate with the same mode as TARGET. */
16549
16550 static void
16551 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
16552 rtx ptrue, rtx op0, rtx op1)
16553 {
16554 machine_mode pred_mode = GET_MODE (ptrue);
16555 rtx tmp1 = gen_reg_rtx (pred_mode);
16556 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
16557 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
16558 rtx tmp2 = gen_reg_rtx (pred_mode);
16559 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
16560 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
16561 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
16562 }
16563
16564 /* Emit the SVE equivalent of:
16565
16566 (set TMP (CODE OP0 OP1))
16567 (set TARGET (not TMP))
16568
16569 PTRUE is an all-true predicate with the same mode as TARGET. */
16570
16571 static void
16572 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
16573 rtx op0, rtx op1)
16574 {
16575 machine_mode pred_mode = GET_MODE (ptrue);
16576 rtx tmp = gen_reg_rtx (pred_mode);
16577 aarch64_emit_sve_ptrue_op (tmp, ptrue,
16578 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
16579 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16580 }
16581
16582 /* Expand an SVE floating-point comparison using the SVE equivalent of:
16583
16584 (set TARGET (CODE OP0 OP1))
16585
16586 If CAN_INVERT_P is true, the caller can also handle inverted results;
16587 return true if the result is in fact inverted. */
16588
16589 bool
16590 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
16591 rtx op0, rtx op1, bool can_invert_p)
16592 {
16593 machine_mode pred_mode = GET_MODE (target);
16594 machine_mode data_mode = GET_MODE (op0);
16595
16596 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16597 switch (code)
16598 {
16599 case UNORDERED:
16600 /* UNORDERED has no immediate form. */
16601 op1 = force_reg (data_mode, op1);
16602 /* fall through */
16603 case LT:
16604 case LE:
16605 case GT:
16606 case GE:
16607 case EQ:
16608 case NE:
16609 {
16610 /* There is native support for the comparison. */
16611 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16612 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16613 return false;
16614 }
16615
16616 case LTGT:
16617 /* This is a trapping operation (LT or GT). */
16618 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
16619 return false;
16620
16621 case UNEQ:
16622 if (!flag_trapping_math)
16623 {
16624 /* This would trap for signaling NaNs. */
16625 op1 = force_reg (data_mode, op1);
16626 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
16627 return false;
16628 }
16629 /* fall through */
16630 case UNLT:
16631 case UNLE:
16632 case UNGT:
16633 case UNGE:
16634 if (flag_trapping_math)
16635 {
16636 /* Work out which elements are ordered. */
16637 rtx ordered = gen_reg_rtx (pred_mode);
16638 op1 = force_reg (data_mode, op1);
16639 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16640
16641 /* Test the opposite condition for the ordered elements,
16642 then invert the result. */
16643 if (code == UNEQ)
16644 code = NE;
16645 else
16646 code = reverse_condition_maybe_unordered (code);
16647 if (can_invert_p)
16648 {
16649 aarch64_emit_sve_predicated_cond (target, code,
16650 ordered, op0, op1);
16651 return true;
16652 }
16653 rtx tmp = gen_reg_rtx (pred_mode);
16654 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16655 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16656 return false;
16657 }
16658 break;
16659
16660 case ORDERED:
16661 /* ORDERED has no immediate form. */
16662 op1 = force_reg (data_mode, op1);
16663 break;
16664
16665 default:
16666 gcc_unreachable ();
16667 }
16668
16669 /* There is native support for the inverse comparison. */
16670 code = reverse_condition_maybe_unordered (code);
16671 if (can_invert_p)
16672 {
16673 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16674 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16675 return true;
16676 }
16677 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16678 return false;
16679 }
16680
16681 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16682 of the data being selected and CMP_MODE is the mode of the values being
16683 compared. */
16684
16685 void
16686 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16687 rtx *ops)
16688 {
16689 machine_mode pred_mode
16690 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16691 GET_MODE_SIZE (cmp_mode)).require ();
16692 rtx pred = gen_reg_rtx (pred_mode);
16693 if (FLOAT_MODE_P (cmp_mode))
16694 {
16695 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16696 ops[4], ops[5], true))
16697 std::swap (ops[1], ops[2]);
16698 }
16699 else
16700 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16701
16702 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16703 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16704 }
16705
16706 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16707 true. However due to issues with register allocation it is preferable
16708 to avoid tieing integer scalar and FP scalar modes. Executing integer
16709 operations in general registers is better than treating them as scalar
16710 vector operations. This reduces latency and avoids redundant int<->FP
16711 moves. So tie modes if they are either the same class, or vector modes
16712 with other vector modes, vector structs or any scalar mode. */
16713
16714 static bool
16715 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16716 {
16717 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16718 return true;
16719
16720 /* We specifically want to allow elements of "structure" modes to
16721 be tieable to the structure. This more general condition allows
16722 other rarer situations too. The reason we don't extend this to
16723 predicate modes is that there are no predicate structure modes
16724 nor any specific instructions for extracting part of a predicate
16725 register. */
16726 if (aarch64_vector_data_mode_p (mode1)
16727 && aarch64_vector_data_mode_p (mode2))
16728 return true;
16729
16730 /* Also allow any scalar modes with vectors. */
16731 if (aarch64_vector_mode_supported_p (mode1)
16732 || aarch64_vector_mode_supported_p (mode2))
16733 return true;
16734
16735 return false;
16736 }
16737
16738 /* Return a new RTX holding the result of moving POINTER forward by
16739 AMOUNT bytes. */
16740
16741 static rtx
16742 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16743 {
16744 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16745
16746 return adjust_automodify_address (pointer, GET_MODE (pointer),
16747 next, amount);
16748 }
16749
16750 /* Return a new RTX holding the result of moving POINTER forward by the
16751 size of the mode it points to. */
16752
16753 static rtx
16754 aarch64_progress_pointer (rtx pointer)
16755 {
16756 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16757 }
16758
16759 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16760 MODE bytes. */
16761
16762 static void
16763 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16764 machine_mode mode)
16765 {
16766 rtx reg = gen_reg_rtx (mode);
16767
16768 /* "Cast" the pointers to the correct mode. */
16769 *src = adjust_address (*src, mode, 0);
16770 *dst = adjust_address (*dst, mode, 0);
16771 /* Emit the memcpy. */
16772 emit_move_insn (reg, *src);
16773 emit_move_insn (*dst, reg);
16774 /* Move the pointers forward. */
16775 *src = aarch64_progress_pointer (*src);
16776 *dst = aarch64_progress_pointer (*dst);
16777 }
16778
16779 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16780 we succeed, otherwise return false. */
16781
16782 bool
16783 aarch64_expand_movmem (rtx *operands)
16784 {
16785 int n, mode_bits;
16786 rtx dst = operands[0];
16787 rtx src = operands[1];
16788 rtx base;
16789 machine_mode cur_mode = BLKmode, next_mode;
16790 bool speed_p = !optimize_function_for_size_p (cfun);
16791
16792 /* When optimizing for size, give a better estimate of the length of a
16793 memcpy call, but use the default otherwise. Moves larger than 8 bytes
16794 will always require an even number of instructions to do now. And each
16795 operation requires both a load+store, so devide the max number by 2. */
16796 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16797
16798 /* We can't do anything smart if the amount to copy is not constant. */
16799 if (!CONST_INT_P (operands[2]))
16800 return false;
16801
16802 n = INTVAL (operands[2]);
16803
16804 /* Try to keep the number of instructions low. For all cases we will do at
16805 most two moves for the residual amount, since we'll always overlap the
16806 remainder. */
16807 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16808 return false;
16809
16810 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16811 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16812
16813 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16814 src = adjust_automodify_address (src, VOIDmode, base, 0);
16815
16816 /* Convert n to bits to make the rest of the code simpler. */
16817 n = n * BITS_PER_UNIT;
16818
16819 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
16820 larger than TImode, but we should not use them for loads/stores here. */
16821 const int copy_limit = GET_MODE_BITSIZE (TImode);
16822
16823 while (n > 0)
16824 {
16825 /* Find the largest mode in which to do the copy in without over reading
16826 or writing. */
16827 opt_scalar_int_mode mode_iter;
16828 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16829 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
16830 cur_mode = mode_iter.require ();
16831
16832 gcc_assert (cur_mode != BLKmode);
16833
16834 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16835 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16836
16837 n -= mode_bits;
16838
16839 /* Do certain trailing copies as overlapping if it's going to be
16840 cheaper. i.e. less instructions to do so. For instance doing a 15
16841 byte copy it's more efficient to do two overlapping 8 byte copies than
16842 8 + 6 + 1. */
16843 if (n > 0 && n <= 8 * BITS_PER_UNIT)
16844 {
16845 next_mode = smallest_mode_for_size (n, MODE_INT);
16846 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16847 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16848 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16849 n = n_bits;
16850 }
16851 }
16852
16853 return true;
16854 }
16855
16856 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16857 SImode stores. Handle the case when the constant has identical
16858 bottom and top halves. This is beneficial when the two stores can be
16859 merged into an STP and we avoid synthesising potentially expensive
16860 immediates twice. Return true if such a split is possible. */
16861
16862 bool
16863 aarch64_split_dimode_const_store (rtx dst, rtx src)
16864 {
16865 rtx lo = gen_lowpart (SImode, src);
16866 rtx hi = gen_highpart_mode (SImode, DImode, src);
16867
16868 bool size_p = optimize_function_for_size_p (cfun);
16869
16870 if (!rtx_equal_p (lo, hi))
16871 return false;
16872
16873 unsigned int orig_cost
16874 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16875 unsigned int lo_cost
16876 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16877
16878 /* We want to transform:
16879 MOV x1, 49370
16880 MOVK x1, 0x140, lsl 16
16881 MOVK x1, 0xc0da, lsl 32
16882 MOVK x1, 0x140, lsl 48
16883 STR x1, [x0]
16884 into:
16885 MOV w1, 49370
16886 MOVK w1, 0x140, lsl 16
16887 STP w1, w1, [x0]
16888 So we want to perform this only when we save two instructions
16889 or more. When optimizing for size, however, accept any code size
16890 savings we can. */
16891 if (size_p && orig_cost <= lo_cost)
16892 return false;
16893
16894 if (!size_p
16895 && (orig_cost <= lo_cost + 1))
16896 return false;
16897
16898 rtx mem_lo = adjust_address (dst, SImode, 0);
16899 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16900 return false;
16901
16902 rtx tmp_reg = gen_reg_rtx (SImode);
16903 aarch64_expand_mov_immediate (tmp_reg, lo);
16904 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16905 /* Don't emit an explicit store pair as this may not be always profitable.
16906 Let the sched-fusion logic decide whether to merge them. */
16907 emit_move_insn (mem_lo, tmp_reg);
16908 emit_move_insn (mem_hi, tmp_reg);
16909
16910 return true;
16911 }
16912
16913 /* Generate RTL for a conditional branch with rtx comparison CODE in
16914 mode CC_MODE. The destination of the unlikely conditional branch
16915 is LABEL_REF. */
16916
16917 void
16918 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
16919 rtx label_ref)
16920 {
16921 rtx x;
16922 x = gen_rtx_fmt_ee (code, VOIDmode,
16923 gen_rtx_REG (cc_mode, CC_REGNUM),
16924 const0_rtx);
16925
16926 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16927 gen_rtx_LABEL_REF (VOIDmode, label_ref),
16928 pc_rtx);
16929 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16930 }
16931
16932 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16933
16934 OP1 represents the TImode destination operand 1
16935 OP2 represents the TImode destination operand 2
16936 LOW_DEST represents the low half (DImode) of TImode operand 0
16937 LOW_IN1 represents the low half (DImode) of TImode operand 1
16938 LOW_IN2 represents the low half (DImode) of TImode operand 2
16939 HIGH_DEST represents the high half (DImode) of TImode operand 0
16940 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16941 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16942
16943 void
16944 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16945 rtx *low_in1, rtx *low_in2,
16946 rtx *high_dest, rtx *high_in1,
16947 rtx *high_in2)
16948 {
16949 *low_dest = gen_reg_rtx (DImode);
16950 *low_in1 = gen_lowpart (DImode, op1);
16951 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16952 subreg_lowpart_offset (DImode, TImode));
16953 *high_dest = gen_reg_rtx (DImode);
16954 *high_in1 = gen_highpart (DImode, op1);
16955 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16956 subreg_highpart_offset (DImode, TImode));
16957 }
16958
16959 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
16960
16961 This function differs from 'arch64_addti_scratch_regs' in that
16962 OP1 can be an immediate constant (zero). We must call
16963 subreg_highpart_offset with DImode and TImode arguments, otherwise
16964 VOIDmode will be used for the const_int which generates an internal
16965 error from subreg_size_highpart_offset which does not expect a size of zero.
16966
16967 OP1 represents the TImode destination operand 1
16968 OP2 represents the TImode destination operand 2
16969 LOW_DEST represents the low half (DImode) of TImode operand 0
16970 LOW_IN1 represents the low half (DImode) of TImode operand 1
16971 LOW_IN2 represents the low half (DImode) of TImode operand 2
16972 HIGH_DEST represents the high half (DImode) of TImode operand 0
16973 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16974 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16975
16976
16977 void
16978 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16979 rtx *low_in1, rtx *low_in2,
16980 rtx *high_dest, rtx *high_in1,
16981 rtx *high_in2)
16982 {
16983 *low_dest = gen_reg_rtx (DImode);
16984 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
16985 subreg_lowpart_offset (DImode, TImode));
16986
16987 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16988 subreg_lowpart_offset (DImode, TImode));
16989 *high_dest = gen_reg_rtx (DImode);
16990
16991 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
16992 subreg_highpart_offset (DImode, TImode));
16993 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16994 subreg_highpart_offset (DImode, TImode));
16995 }
16996
16997 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16998
16999 OP0 represents the TImode destination operand 0
17000 LOW_DEST represents the low half (DImode) of TImode operand 0
17001 LOW_IN1 represents the low half (DImode) of TImode operand 1
17002 LOW_IN2 represents the low half (DImode) of TImode operand 2
17003 HIGH_DEST represents the high half (DImode) of TImode operand 0
17004 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17005 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17006 UNSIGNED_P is true if the operation is being performed on unsigned
17007 values. */
17008 void
17009 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17010 rtx low_in2, rtx high_dest, rtx high_in1,
17011 rtx high_in2, bool unsigned_p)
17012 {
17013 if (low_in2 == const0_rtx)
17014 {
17015 low_dest = low_in1;
17016 high_in2 = force_reg (DImode, high_in2);
17017 if (unsigned_p)
17018 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17019 else
17020 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17021 }
17022 else
17023 {
17024 if (CONST_INT_P (low_in2))
17025 {
17026 high_in2 = force_reg (DImode, high_in2);
17027 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17028 GEN_INT (-INTVAL (low_in2))));
17029 }
17030 else
17031 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17032
17033 if (unsigned_p)
17034 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17035 else
17036 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17037 }
17038
17039 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17040 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17041
17042 }
17043
17044 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17045
17046 static unsigned HOST_WIDE_INT
17047 aarch64_asan_shadow_offset (void)
17048 {
17049 return (HOST_WIDE_INT_1 << 36);
17050 }
17051
17052 static rtx
17053 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17054 int code, tree treeop0, tree treeop1)
17055 {
17056 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17057 rtx op0, op1;
17058 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17059 insn_code icode;
17060 struct expand_operand ops[4];
17061
17062 start_sequence ();
17063 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17064
17065 op_mode = GET_MODE (op0);
17066 if (op_mode == VOIDmode)
17067 op_mode = GET_MODE (op1);
17068
17069 switch (op_mode)
17070 {
17071 case E_QImode:
17072 case E_HImode:
17073 case E_SImode:
17074 cmp_mode = SImode;
17075 icode = CODE_FOR_cmpsi;
17076 break;
17077
17078 case E_DImode:
17079 cmp_mode = DImode;
17080 icode = CODE_FOR_cmpdi;
17081 break;
17082
17083 case E_SFmode:
17084 cmp_mode = SFmode;
17085 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17086 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17087 break;
17088
17089 case E_DFmode:
17090 cmp_mode = DFmode;
17091 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17092 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17093 break;
17094
17095 default:
17096 end_sequence ();
17097 return NULL_RTX;
17098 }
17099
17100 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17101 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17102 if (!op0 || !op1)
17103 {
17104 end_sequence ();
17105 return NULL_RTX;
17106 }
17107 *prep_seq = get_insns ();
17108 end_sequence ();
17109
17110 create_fixed_operand (&ops[0], op0);
17111 create_fixed_operand (&ops[1], op1);
17112
17113 start_sequence ();
17114 if (!maybe_expand_insn (icode, 2, ops))
17115 {
17116 end_sequence ();
17117 return NULL_RTX;
17118 }
17119 *gen_seq = get_insns ();
17120 end_sequence ();
17121
17122 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17123 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17124 }
17125
17126 static rtx
17127 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17128 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17129 {
17130 rtx op0, op1, target;
17131 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17132 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17133 insn_code icode;
17134 struct expand_operand ops[6];
17135 int aarch64_cond;
17136
17137 push_to_sequence (*prep_seq);
17138 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17139
17140 op_mode = GET_MODE (op0);
17141 if (op_mode == VOIDmode)
17142 op_mode = GET_MODE (op1);
17143
17144 switch (op_mode)
17145 {
17146 case E_QImode:
17147 case E_HImode:
17148 case E_SImode:
17149 cmp_mode = SImode;
17150 icode = CODE_FOR_ccmpsi;
17151 break;
17152
17153 case E_DImode:
17154 cmp_mode = DImode;
17155 icode = CODE_FOR_ccmpdi;
17156 break;
17157
17158 case E_SFmode:
17159 cmp_mode = SFmode;
17160 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17161 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17162 break;
17163
17164 case E_DFmode:
17165 cmp_mode = DFmode;
17166 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17167 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17168 break;
17169
17170 default:
17171 end_sequence ();
17172 return NULL_RTX;
17173 }
17174
17175 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17176 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17177 if (!op0 || !op1)
17178 {
17179 end_sequence ();
17180 return NULL_RTX;
17181 }
17182 *prep_seq = get_insns ();
17183 end_sequence ();
17184
17185 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17186 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17187
17188 if (bit_code != AND)
17189 {
17190 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17191 GET_MODE (XEXP (prev, 0))),
17192 VOIDmode, XEXP (prev, 0), const0_rtx);
17193 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17194 }
17195
17196 create_fixed_operand (&ops[0], XEXP (prev, 0));
17197 create_fixed_operand (&ops[1], target);
17198 create_fixed_operand (&ops[2], op0);
17199 create_fixed_operand (&ops[3], op1);
17200 create_fixed_operand (&ops[4], prev);
17201 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17202
17203 push_to_sequence (*gen_seq);
17204 if (!maybe_expand_insn (icode, 6, ops))
17205 {
17206 end_sequence ();
17207 return NULL_RTX;
17208 }
17209
17210 *gen_seq = get_insns ();
17211 end_sequence ();
17212
17213 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17214 }
17215
17216 #undef TARGET_GEN_CCMP_FIRST
17217 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17218
17219 #undef TARGET_GEN_CCMP_NEXT
17220 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17221
17222 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17223 instruction fusion of some sort. */
17224
17225 static bool
17226 aarch64_macro_fusion_p (void)
17227 {
17228 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17229 }
17230
17231
17232 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17233 should be kept together during scheduling. */
17234
17235 static bool
17236 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17237 {
17238 rtx set_dest;
17239 rtx prev_set = single_set (prev);
17240 rtx curr_set = single_set (curr);
17241 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
17242 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17243
17244 if (!aarch64_macro_fusion_p ())
17245 return false;
17246
17247 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17248 {
17249 /* We are trying to match:
17250 prev (mov) == (set (reg r0) (const_int imm16))
17251 curr (movk) == (set (zero_extract (reg r0)
17252 (const_int 16)
17253 (const_int 16))
17254 (const_int imm16_1)) */
17255
17256 set_dest = SET_DEST (curr_set);
17257
17258 if (GET_CODE (set_dest) == ZERO_EXTRACT
17259 && CONST_INT_P (SET_SRC (curr_set))
17260 && CONST_INT_P (SET_SRC (prev_set))
17261 && CONST_INT_P (XEXP (set_dest, 2))
17262 && INTVAL (XEXP (set_dest, 2)) == 16
17263 && REG_P (XEXP (set_dest, 0))
17264 && REG_P (SET_DEST (prev_set))
17265 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17266 {
17267 return true;
17268 }
17269 }
17270
17271 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17272 {
17273
17274 /* We're trying to match:
17275 prev (adrp) == (set (reg r1)
17276 (high (symbol_ref ("SYM"))))
17277 curr (add) == (set (reg r0)
17278 (lo_sum (reg r1)
17279 (symbol_ref ("SYM"))))
17280 Note that r0 need not necessarily be the same as r1, especially
17281 during pre-regalloc scheduling. */
17282
17283 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17284 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17285 {
17286 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17287 && REG_P (XEXP (SET_SRC (curr_set), 0))
17288 && REGNO (XEXP (SET_SRC (curr_set), 0))
17289 == REGNO (SET_DEST (prev_set))
17290 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17291 XEXP (SET_SRC (curr_set), 1)))
17292 return true;
17293 }
17294 }
17295
17296 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17297 {
17298
17299 /* We're trying to match:
17300 prev (movk) == (set (zero_extract (reg r0)
17301 (const_int 16)
17302 (const_int 32))
17303 (const_int imm16_1))
17304 curr (movk) == (set (zero_extract (reg r0)
17305 (const_int 16)
17306 (const_int 48))
17307 (const_int imm16_2)) */
17308
17309 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17310 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17311 && REG_P (XEXP (SET_DEST (prev_set), 0))
17312 && REG_P (XEXP (SET_DEST (curr_set), 0))
17313 && REGNO (XEXP (SET_DEST (prev_set), 0))
17314 == REGNO (XEXP (SET_DEST (curr_set), 0))
17315 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17316 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17317 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17318 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17319 && CONST_INT_P (SET_SRC (prev_set))
17320 && CONST_INT_P (SET_SRC (curr_set)))
17321 return true;
17322
17323 }
17324 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17325 {
17326 /* We're trying to match:
17327 prev (adrp) == (set (reg r0)
17328 (high (symbol_ref ("SYM"))))
17329 curr (ldr) == (set (reg r1)
17330 (mem (lo_sum (reg r0)
17331 (symbol_ref ("SYM")))))
17332 or
17333 curr (ldr) == (set (reg r1)
17334 (zero_extend (mem
17335 (lo_sum (reg r0)
17336 (symbol_ref ("SYM")))))) */
17337 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17338 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17339 {
17340 rtx curr_src = SET_SRC (curr_set);
17341
17342 if (GET_CODE (curr_src) == ZERO_EXTEND)
17343 curr_src = XEXP (curr_src, 0);
17344
17345 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17346 && REG_P (XEXP (XEXP (curr_src, 0), 0))
17347 && REGNO (XEXP (XEXP (curr_src, 0), 0))
17348 == REGNO (SET_DEST (prev_set))
17349 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17350 XEXP (SET_SRC (prev_set), 0)))
17351 return true;
17352 }
17353 }
17354
17355 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
17356 && aarch_crypto_can_dual_issue (prev, curr))
17357 return true;
17358
17359 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17360 && any_condjump_p (curr))
17361 {
17362 unsigned int condreg1, condreg2;
17363 rtx cc_reg_1;
17364 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17365 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17366
17367 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17368 && prev
17369 && modified_in_p (cc_reg_1, prev))
17370 {
17371 enum attr_type prev_type = get_attr_type (prev);
17372
17373 /* FIXME: this misses some which is considered simple arthematic
17374 instructions for ThunderX. Simple shifts are missed here. */
17375 if (prev_type == TYPE_ALUS_SREG
17376 || prev_type == TYPE_ALUS_IMM
17377 || prev_type == TYPE_LOGICS_REG
17378 || prev_type == TYPE_LOGICS_IMM)
17379 return true;
17380 }
17381 }
17382
17383 if (prev_set
17384 && curr_set
17385 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17386 && any_condjump_p (curr))
17387 {
17388 /* We're trying to match:
17389 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17390 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
17391 (const_int 0))
17392 (label_ref ("SYM"))
17393 (pc)) */
17394 if (SET_DEST (curr_set) == (pc_rtx)
17395 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
17396 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
17397 && REG_P (SET_DEST (prev_set))
17398 && REGNO (SET_DEST (prev_set))
17399 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
17400 {
17401 /* Fuse ALU operations followed by conditional branch instruction. */
17402 switch (get_attr_type (prev))
17403 {
17404 case TYPE_ALU_IMM:
17405 case TYPE_ALU_SREG:
17406 case TYPE_ADC_REG:
17407 case TYPE_ADC_IMM:
17408 case TYPE_ADCS_REG:
17409 case TYPE_ADCS_IMM:
17410 case TYPE_LOGIC_REG:
17411 case TYPE_LOGIC_IMM:
17412 case TYPE_CSEL:
17413 case TYPE_ADR:
17414 case TYPE_MOV_IMM:
17415 case TYPE_SHIFT_REG:
17416 case TYPE_SHIFT_IMM:
17417 case TYPE_BFM:
17418 case TYPE_RBIT:
17419 case TYPE_REV:
17420 case TYPE_EXTEND:
17421 return true;
17422
17423 default:;
17424 }
17425 }
17426 }
17427
17428 return false;
17429 }
17430
17431 /* Return true iff the instruction fusion described by OP is enabled. */
17432
17433 bool
17434 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
17435 {
17436 return (aarch64_tune_params.fusible_ops & op) != 0;
17437 }
17438
17439 /* If MEM is in the form of [base+offset], extract the two parts
17440 of address and set to BASE and OFFSET, otherwise return false
17441 after clearing BASE and OFFSET. */
17442
17443 bool
17444 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
17445 {
17446 rtx addr;
17447
17448 gcc_assert (MEM_P (mem));
17449
17450 addr = XEXP (mem, 0);
17451
17452 if (REG_P (addr))
17453 {
17454 *base = addr;
17455 *offset = const0_rtx;
17456 return true;
17457 }
17458
17459 if (GET_CODE (addr) == PLUS
17460 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
17461 {
17462 *base = XEXP (addr, 0);
17463 *offset = XEXP (addr, 1);
17464 return true;
17465 }
17466
17467 *base = NULL_RTX;
17468 *offset = NULL_RTX;
17469
17470 return false;
17471 }
17472
17473 /* Types for scheduling fusion. */
17474 enum sched_fusion_type
17475 {
17476 SCHED_FUSION_NONE = 0,
17477 SCHED_FUSION_LD_SIGN_EXTEND,
17478 SCHED_FUSION_LD_ZERO_EXTEND,
17479 SCHED_FUSION_LD,
17480 SCHED_FUSION_ST,
17481 SCHED_FUSION_NUM
17482 };
17483
17484 /* If INSN is a load or store of address in the form of [base+offset],
17485 extract the two parts and set to BASE and OFFSET. Return scheduling
17486 fusion type this INSN is. */
17487
17488 static enum sched_fusion_type
17489 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
17490 {
17491 rtx x, dest, src;
17492 enum sched_fusion_type fusion = SCHED_FUSION_LD;
17493
17494 gcc_assert (INSN_P (insn));
17495 x = PATTERN (insn);
17496 if (GET_CODE (x) != SET)
17497 return SCHED_FUSION_NONE;
17498
17499 src = SET_SRC (x);
17500 dest = SET_DEST (x);
17501
17502 machine_mode dest_mode = GET_MODE (dest);
17503
17504 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
17505 return SCHED_FUSION_NONE;
17506
17507 if (GET_CODE (src) == SIGN_EXTEND)
17508 {
17509 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
17510 src = XEXP (src, 0);
17511 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17512 return SCHED_FUSION_NONE;
17513 }
17514 else if (GET_CODE (src) == ZERO_EXTEND)
17515 {
17516 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
17517 src = XEXP (src, 0);
17518 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17519 return SCHED_FUSION_NONE;
17520 }
17521
17522 if (GET_CODE (src) == MEM && REG_P (dest))
17523 extract_base_offset_in_addr (src, base, offset);
17524 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
17525 {
17526 fusion = SCHED_FUSION_ST;
17527 extract_base_offset_in_addr (dest, base, offset);
17528 }
17529 else
17530 return SCHED_FUSION_NONE;
17531
17532 if (*base == NULL_RTX || *offset == NULL_RTX)
17533 fusion = SCHED_FUSION_NONE;
17534
17535 return fusion;
17536 }
17537
17538 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
17539
17540 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
17541 and PRI are only calculated for these instructions. For other instruction,
17542 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
17543 type instruction fusion can be added by returning different priorities.
17544
17545 It's important that irrelevant instructions get the largest FUSION_PRI. */
17546
17547 static void
17548 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
17549 int *fusion_pri, int *pri)
17550 {
17551 int tmp, off_val;
17552 rtx base, offset;
17553 enum sched_fusion_type fusion;
17554
17555 gcc_assert (INSN_P (insn));
17556
17557 tmp = max_pri - 1;
17558 fusion = fusion_load_store (insn, &base, &offset);
17559 if (fusion == SCHED_FUSION_NONE)
17560 {
17561 *pri = tmp;
17562 *fusion_pri = tmp;
17563 return;
17564 }
17565
17566 /* Set FUSION_PRI according to fusion type and base register. */
17567 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
17568
17569 /* Calculate PRI. */
17570 tmp /= 2;
17571
17572 /* INSN with smaller offset goes first. */
17573 off_val = (int)(INTVAL (offset));
17574 if (off_val >= 0)
17575 tmp -= (off_val & 0xfffff);
17576 else
17577 tmp += ((- off_val) & 0xfffff);
17578
17579 *pri = tmp;
17580 return;
17581 }
17582
17583 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
17584 Adjust priority of sha1h instructions so they are scheduled before
17585 other SHA1 instructions. */
17586
17587 static int
17588 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
17589 {
17590 rtx x = PATTERN (insn);
17591
17592 if (GET_CODE (x) == SET)
17593 {
17594 x = SET_SRC (x);
17595
17596 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
17597 return priority + 10;
17598 }
17599
17600 return priority;
17601 }
17602
17603 /* Given OPERANDS of consecutive load/store, check if we can merge
17604 them into ldp/stp. LOAD is true if they are load instructions.
17605 MODE is the mode of memory operands. */
17606
17607 bool
17608 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
17609 machine_mode mode)
17610 {
17611 HOST_WIDE_INT offval_1, offval_2, msize;
17612 enum reg_class rclass_1, rclass_2;
17613 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
17614
17615 if (load)
17616 {
17617 mem_1 = operands[1];
17618 mem_2 = operands[3];
17619 reg_1 = operands[0];
17620 reg_2 = operands[2];
17621 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
17622 if (REGNO (reg_1) == REGNO (reg_2))
17623 return false;
17624 }
17625 else
17626 {
17627 mem_1 = operands[0];
17628 mem_2 = operands[2];
17629 reg_1 = operands[1];
17630 reg_2 = operands[3];
17631 }
17632
17633 /* The mems cannot be volatile. */
17634 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
17635 return false;
17636
17637 /* If we have SImode and slow unaligned ldp,
17638 check the alignment to be at least 8 byte. */
17639 if (mode == SImode
17640 && (aarch64_tune_params.extra_tuning_flags
17641 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17642 && !optimize_size
17643 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17644 return false;
17645
17646 /* Check if the addresses are in the form of [base+offset]. */
17647 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17648 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17649 return false;
17650 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17651 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17652 return false;
17653
17654 /* Check if the bases are same. */
17655 if (!rtx_equal_p (base_1, base_2))
17656 return false;
17657
17658 /* The operands must be of the same size. */
17659 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
17660 GET_MODE_SIZE (GET_MODE (mem_2))));
17661
17662 offval_1 = INTVAL (offset_1);
17663 offval_2 = INTVAL (offset_2);
17664 /* We should only be trying this for fixed-sized modes. There is no
17665 SVE LDP/STP instruction. */
17666 msize = GET_MODE_SIZE (mode).to_constant ();
17667 /* Check if the offsets are consecutive. */
17668 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
17669 return false;
17670
17671 /* Check if the addresses are clobbered by load. */
17672 if (load)
17673 {
17674 if (reg_mentioned_p (reg_1, mem_1))
17675 return false;
17676
17677 /* In increasing order, the last load can clobber the address. */
17678 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
17679 return false;
17680 }
17681
17682 /* One of the memory accesses must be a mempair operand.
17683 If it is not the first one, they need to be swapped by the
17684 peephole. */
17685 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
17686 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
17687 return false;
17688
17689 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17690 rclass_1 = FP_REGS;
17691 else
17692 rclass_1 = GENERAL_REGS;
17693
17694 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17695 rclass_2 = FP_REGS;
17696 else
17697 rclass_2 = GENERAL_REGS;
17698
17699 /* Check if the registers are of same class. */
17700 if (rclass_1 != rclass_2)
17701 return false;
17702
17703 return true;
17704 }
17705
17706 /* Given OPERANDS of consecutive load/store that can be merged,
17707 swap them if they are not in ascending order. */
17708 void
17709 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
17710 {
17711 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
17712 HOST_WIDE_INT offval_1, offval_2;
17713
17714 if (load)
17715 {
17716 mem_1 = operands[1];
17717 mem_2 = operands[3];
17718 }
17719 else
17720 {
17721 mem_1 = operands[0];
17722 mem_2 = operands[2];
17723 }
17724
17725 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17726 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17727
17728 offval_1 = INTVAL (offset_1);
17729 offval_2 = INTVAL (offset_2);
17730
17731 if (offval_1 > offval_2)
17732 {
17733 /* Irrespective of whether this is a load or a store,
17734 we do the same swap. */
17735 std::swap (operands[0], operands[2]);
17736 std::swap (operands[1], operands[3]);
17737 }
17738 }
17739
17740 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17741 comparison between the two. */
17742 int
17743 aarch64_host_wide_int_compare (const void *x, const void *y)
17744 {
17745 return wi::cmps (* ((const HOST_WIDE_INT *) x),
17746 * ((const HOST_WIDE_INT *) y));
17747 }
17748
17749 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
17750 other pointing to a REG rtx containing an offset, compare the offsets
17751 of the two pairs.
17752
17753 Return:
17754
17755 1 iff offset (X) > offset (Y)
17756 0 iff offset (X) == offset (Y)
17757 -1 iff offset (X) < offset (Y) */
17758 int
17759 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17760 {
17761 const rtx * operands_1 = (const rtx *) x;
17762 const rtx * operands_2 = (const rtx *) y;
17763 rtx mem_1, mem_2, base, offset_1, offset_2;
17764
17765 if (MEM_P (operands_1[0]))
17766 mem_1 = operands_1[0];
17767 else
17768 mem_1 = operands_1[1];
17769
17770 if (MEM_P (operands_2[0]))
17771 mem_2 = operands_2[0];
17772 else
17773 mem_2 = operands_2[1];
17774
17775 /* Extract the offsets. */
17776 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17777 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17778
17779 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17780
17781 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17782 }
17783
17784 /* Given OPERANDS of consecutive load/store, check if we can merge
17785 them into ldp/stp by adjusting the offset. LOAD is true if they
17786 are load instructions. MODE is the mode of memory operands.
17787
17788 Given below consecutive stores:
17789
17790 str w1, [xb, 0x100]
17791 str w1, [xb, 0x104]
17792 str w1, [xb, 0x108]
17793 str w1, [xb, 0x10c]
17794
17795 Though the offsets are out of the range supported by stp, we can
17796 still pair them after adjusting the offset, like:
17797
17798 add scratch, xb, 0x100
17799 stp w1, w1, [scratch]
17800 stp w1, w1, [scratch, 0x8]
17801
17802 The peephole patterns detecting this opportunity should guarantee
17803 the scratch register is avaliable. */
17804
17805 bool
17806 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17807 scalar_mode mode)
17808 {
17809 const int num_insns = 4;
17810 enum reg_class rclass;
17811 HOST_WIDE_INT offvals[num_insns], msize;
17812 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
17813
17814 if (load)
17815 {
17816 for (int i = 0; i < num_insns; i++)
17817 {
17818 reg[i] = operands[2 * i];
17819 mem[i] = operands[2 * i + 1];
17820
17821 gcc_assert (REG_P (reg[i]));
17822 }
17823
17824 /* Do not attempt to merge the loads if the loads clobber each other. */
17825 for (int i = 0; i < 8; i += 2)
17826 for (int j = i + 2; j < 8; j += 2)
17827 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17828 return false;
17829 }
17830 else
17831 for (int i = 0; i < num_insns; i++)
17832 {
17833 mem[i] = operands[2 * i];
17834 reg[i] = operands[2 * i + 1];
17835 }
17836
17837 /* Skip if memory operand is by itself valid for ldp/stp. */
17838 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
17839 return false;
17840
17841 for (int i = 0; i < num_insns; i++)
17842 {
17843 /* The mems cannot be volatile. */
17844 if (MEM_VOLATILE_P (mem[i]))
17845 return false;
17846
17847 /* Check if the addresses are in the form of [base+offset]. */
17848 extract_base_offset_in_addr (mem[i], base + i, offset + i);
17849 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
17850 return false;
17851 }
17852
17853 /* Check if the registers are of same class. */
17854 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
17855 ? FP_REGS : GENERAL_REGS;
17856
17857 for (int i = 1; i < num_insns; i++)
17858 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
17859 {
17860 if (rclass != FP_REGS)
17861 return false;
17862 }
17863 else
17864 {
17865 if (rclass != GENERAL_REGS)
17866 return false;
17867 }
17868
17869 /* Only the last register in the order in which they occur
17870 may be clobbered by the load. */
17871 if (rclass == GENERAL_REGS && load)
17872 for (int i = 0; i < num_insns - 1; i++)
17873 if (reg_mentioned_p (reg[i], mem[i]))
17874 return false;
17875
17876 /* Check if the bases are same. */
17877 for (int i = 0; i < num_insns - 1; i++)
17878 if (!rtx_equal_p (base[i], base[i + 1]))
17879 return false;
17880
17881 for (int i = 0; i < num_insns; i++)
17882 offvals[i] = INTVAL (offset[i]);
17883
17884 msize = GET_MODE_SIZE (mode);
17885
17886 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17887 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
17888 aarch64_host_wide_int_compare);
17889
17890 if (!(offvals[1] == offvals[0] + msize
17891 && offvals[3] == offvals[2] + msize))
17892 return false;
17893
17894 /* Check that offsets are within range of each other. The ldp/stp
17895 instructions have 7 bit immediate offsets, so use 0x80. */
17896 if (offvals[2] - offvals[0] >= msize * 0x80)
17897 return false;
17898
17899 /* The offsets must be aligned with respect to each other. */
17900 if (offvals[0] % msize != offvals[2] % msize)
17901 return false;
17902
17903 /* If we have SImode and slow unaligned ldp,
17904 check the alignment to be at least 8 byte. */
17905 if (mode == SImode
17906 && (aarch64_tune_params.extra_tuning_flags
17907 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17908 && !optimize_size
17909 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
17910 return false;
17911
17912 return true;
17913 }
17914
17915 /* Given OPERANDS of consecutive load/store, this function pairs them
17916 into LDP/STP after adjusting the offset. It depends on the fact
17917 that the operands can be sorted so the offsets are correct for STP.
17918 MODE is the mode of memory operands. CODE is the rtl operator
17919 which should be applied to all memory operands, it's SIGN_EXTEND,
17920 ZERO_EXTEND or UNKNOWN. */
17921
17922 bool
17923 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17924 scalar_mode mode, RTX_CODE code)
17925 {
17926 rtx base, offset_1, offset_3, t1, t2;
17927 rtx mem_1, mem_2, mem_3, mem_4;
17928 rtx temp_operands[8];
17929 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17930 stp_off_upper_limit, stp_off_lower_limit, msize;
17931
17932 /* We make changes on a copy as we may still bail out. */
17933 for (int i = 0; i < 8; i ++)
17934 temp_operands[i] = operands[i];
17935
17936 /* Sort the operands. */
17937 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17938
17939 if (load)
17940 {
17941 mem_1 = temp_operands[1];
17942 mem_2 = temp_operands[3];
17943 mem_3 = temp_operands[5];
17944 mem_4 = temp_operands[7];
17945 }
17946 else
17947 {
17948 mem_1 = temp_operands[0];
17949 mem_2 = temp_operands[2];
17950 mem_3 = temp_operands[4];
17951 mem_4 = temp_operands[6];
17952 gcc_assert (code == UNKNOWN);
17953 }
17954
17955 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17956 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17957 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17958 && offset_3 != NULL_RTX);
17959
17960 /* Adjust offset so it can fit in LDP/STP instruction. */
17961 msize = GET_MODE_SIZE (mode);
17962 stp_off_upper_limit = msize * (0x40 - 1);
17963 stp_off_lower_limit = - msize * 0x40;
17964
17965 off_val_1 = INTVAL (offset_1);
17966 off_val_3 = INTVAL (offset_3);
17967
17968 /* The base offset is optimally half way between the two STP/LDP offsets. */
17969 if (msize <= 4)
17970 base_off = (off_val_1 + off_val_3) / 2;
17971 else
17972 /* However, due to issues with negative LDP/STP offset generation for
17973 larger modes, for DF, DI and vector modes. we must not use negative
17974 addresses smaller than 9 signed unadjusted bits can store. This
17975 provides the most range in this case. */
17976 base_off = off_val_1;
17977
17978 /* Adjust the base so that it is aligned with the addresses but still
17979 optimal. */
17980 if (base_off % msize != off_val_1 % msize)
17981 /* Fix the offset, bearing in mind we want to make it bigger not
17982 smaller. */
17983 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17984 else if (msize <= 4)
17985 /* The negative range of LDP/STP is one larger than the positive range. */
17986 base_off += msize;
17987
17988 /* Check if base offset is too big or too small. We can attempt to resolve
17989 this issue by setting it to the maximum value and seeing if the offsets
17990 still fit. */
17991 if (base_off >= 0x1000)
17992 {
17993 base_off = 0x1000 - 1;
17994 /* We must still make sure that the base offset is aligned with respect
17995 to the address. But it may may not be made any bigger. */
17996 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17997 }
17998
17999 /* Likewise for the case where the base is too small. */
18000 if (base_off <= -0x1000)
18001 {
18002 base_off = -0x1000 + 1;
18003 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18004 }
18005
18006 /* Offset of the first STP/LDP. */
18007 new_off_1 = off_val_1 - base_off;
18008
18009 /* Offset of the second STP/LDP. */
18010 new_off_3 = off_val_3 - base_off;
18011
18012 /* The offsets must be within the range of the LDP/STP instructions. */
18013 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18014 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18015 return false;
18016
18017 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18018 new_off_1), true);
18019 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18020 new_off_1 + msize), true);
18021 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18022 new_off_3), true);
18023 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18024 new_off_3 + msize), true);
18025
18026 if (!aarch64_mem_pair_operand (mem_1, mode)
18027 || !aarch64_mem_pair_operand (mem_3, mode))
18028 return false;
18029
18030 if (code == ZERO_EXTEND)
18031 {
18032 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18033 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18034 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18035 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18036 }
18037 else if (code == SIGN_EXTEND)
18038 {
18039 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18040 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18041 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18042 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18043 }
18044
18045 if (load)
18046 {
18047 operands[0] = temp_operands[0];
18048 operands[1] = mem_1;
18049 operands[2] = temp_operands[2];
18050 operands[3] = mem_2;
18051 operands[4] = temp_operands[4];
18052 operands[5] = mem_3;
18053 operands[6] = temp_operands[6];
18054 operands[7] = mem_4;
18055 }
18056 else
18057 {
18058 operands[0] = mem_1;
18059 operands[1] = temp_operands[1];
18060 operands[2] = mem_2;
18061 operands[3] = temp_operands[3];
18062 operands[4] = mem_3;
18063 operands[5] = temp_operands[5];
18064 operands[6] = mem_4;
18065 operands[7] = temp_operands[7];
18066 }
18067
18068 /* Emit adjusting instruction. */
18069 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18070 /* Emit ldp/stp instructions. */
18071 t1 = gen_rtx_SET (operands[0], operands[1]);
18072 t2 = gen_rtx_SET (operands[2], operands[3]);
18073 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18074 t1 = gen_rtx_SET (operands[4], operands[5]);
18075 t2 = gen_rtx_SET (operands[6], operands[7]);
18076 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18077 return true;
18078 }
18079
18080 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18081 it isn't worth branching around empty masked ops (including masked
18082 stores). */
18083
18084 static bool
18085 aarch64_empty_mask_is_expensive (unsigned)
18086 {
18087 return false;
18088 }
18089
18090 /* Return 1 if pseudo register should be created and used to hold
18091 GOT address for PIC code. */
18092
18093 bool
18094 aarch64_use_pseudo_pic_reg (void)
18095 {
18096 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18097 }
18098
18099 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18100
18101 static int
18102 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18103 {
18104 switch (XINT (x, 1))
18105 {
18106 case UNSPEC_GOTSMALLPIC:
18107 case UNSPEC_GOTSMALLPIC28K:
18108 case UNSPEC_GOTTINYPIC:
18109 return 0;
18110 default:
18111 break;
18112 }
18113
18114 return default_unspec_may_trap_p (x, flags);
18115 }
18116
18117
18118 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18119 return the log2 of that value. Otherwise return -1. */
18120
18121 int
18122 aarch64_fpconst_pow_of_2 (rtx x)
18123 {
18124 const REAL_VALUE_TYPE *r;
18125
18126 if (!CONST_DOUBLE_P (x))
18127 return -1;
18128
18129 r = CONST_DOUBLE_REAL_VALUE (x);
18130
18131 if (REAL_VALUE_NEGATIVE (*r)
18132 || REAL_VALUE_ISNAN (*r)
18133 || REAL_VALUE_ISINF (*r)
18134 || !real_isinteger (r, DFmode))
18135 return -1;
18136
18137 return exact_log2 (real_to_integer (r));
18138 }
18139
18140 /* If X is a vector of equal CONST_DOUBLE values and that value is
18141 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18142
18143 int
18144 aarch64_vec_fpconst_pow_of_2 (rtx x)
18145 {
18146 int nelts;
18147 if (GET_CODE (x) != CONST_VECTOR
18148 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18149 return -1;
18150
18151 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18152 return -1;
18153
18154 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18155 if (firstval <= 0)
18156 return -1;
18157
18158 for (int i = 1; i < nelts; i++)
18159 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18160 return -1;
18161
18162 return firstval;
18163 }
18164
18165 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18166 to float.
18167
18168 __fp16 always promotes through this hook.
18169 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18170 through the generic excess precision logic rather than here. */
18171
18172 static tree
18173 aarch64_promoted_type (const_tree t)
18174 {
18175 if (SCALAR_FLOAT_TYPE_P (t)
18176 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18177 return float_type_node;
18178
18179 return NULL_TREE;
18180 }
18181
18182 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18183
18184 static bool
18185 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18186 optimization_type opt_type)
18187 {
18188 switch (op)
18189 {
18190 case rsqrt_optab:
18191 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18192
18193 default:
18194 return true;
18195 }
18196 }
18197
18198 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18199
18200 static unsigned int
18201 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18202 int *offset)
18203 {
18204 /* Polynomial invariant 1 == (VG / 2) - 1. */
18205 gcc_assert (i == 1);
18206 *factor = 2;
18207 *offset = 1;
18208 return AARCH64_DWARF_VG;
18209 }
18210
18211 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
18212 if MODE is HFmode, and punt to the generic implementation otherwise. */
18213
18214 static bool
18215 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18216 {
18217 return (mode == HFmode
18218 ? true
18219 : default_libgcc_floating_mode_supported_p (mode));
18220 }
18221
18222 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18223 if MODE is HFmode, and punt to the generic implementation otherwise. */
18224
18225 static bool
18226 aarch64_scalar_mode_supported_p (scalar_mode mode)
18227 {
18228 return (mode == HFmode
18229 ? true
18230 : default_scalar_mode_supported_p (mode));
18231 }
18232
18233 /* Set the value of FLT_EVAL_METHOD.
18234 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18235
18236 0: evaluate all operations and constants, whose semantic type has at
18237 most the range and precision of type float, to the range and
18238 precision of float; evaluate all other operations and constants to
18239 the range and precision of the semantic type;
18240
18241 N, where _FloatN is a supported interchange floating type
18242 evaluate all operations and constants, whose semantic type has at
18243 most the range and precision of _FloatN type, to the range and
18244 precision of the _FloatN type; evaluate all other operations and
18245 constants to the range and precision of the semantic type;
18246
18247 If we have the ARMv8.2-A extensions then we support _Float16 in native
18248 precision, so we should set this to 16. Otherwise, we support the type,
18249 but want to evaluate expressions in float precision, so set this to
18250 0. */
18251
18252 static enum flt_eval_method
18253 aarch64_excess_precision (enum excess_precision_type type)
18254 {
18255 switch (type)
18256 {
18257 case EXCESS_PRECISION_TYPE_FAST:
18258 case EXCESS_PRECISION_TYPE_STANDARD:
18259 /* We can calculate either in 16-bit range and precision or
18260 32-bit range and precision. Make that decision based on whether
18261 we have native support for the ARMv8.2-A 16-bit floating-point
18262 instructions or not. */
18263 return (TARGET_FP_F16INST
18264 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18265 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18266 case EXCESS_PRECISION_TYPE_IMPLICIT:
18267 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18268 default:
18269 gcc_unreachable ();
18270 }
18271 return FLT_EVAL_METHOD_UNPREDICTABLE;
18272 }
18273
18274 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18275 scheduled for speculative execution. Reject the long-running division
18276 and square-root instructions. */
18277
18278 static bool
18279 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18280 {
18281 switch (get_attr_type (insn))
18282 {
18283 case TYPE_SDIV:
18284 case TYPE_UDIV:
18285 case TYPE_FDIVS:
18286 case TYPE_FDIVD:
18287 case TYPE_FSQRTS:
18288 case TYPE_FSQRTD:
18289 case TYPE_NEON_FP_SQRT_S:
18290 case TYPE_NEON_FP_SQRT_D:
18291 case TYPE_NEON_FP_SQRT_S_Q:
18292 case TYPE_NEON_FP_SQRT_D_Q:
18293 case TYPE_NEON_FP_DIV_S:
18294 case TYPE_NEON_FP_DIV_D:
18295 case TYPE_NEON_FP_DIV_S_Q:
18296 case TYPE_NEON_FP_DIV_D_Q:
18297 return false;
18298 default:
18299 return true;
18300 }
18301 }
18302
18303 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18304
18305 static int
18306 aarch64_compute_pressure_classes (reg_class *classes)
18307 {
18308 int i = 0;
18309 classes[i++] = GENERAL_REGS;
18310 classes[i++] = FP_REGS;
18311 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18312 registers need to go in PR_LO_REGS at some point during their
18313 lifetime. Splitting it into two halves has the effect of making
18314 all predicates count against PR_LO_REGS, so that we try whenever
18315 possible to restrict the number of live predicates to 8. This
18316 greatly reduces the amount of spilling in certain loops. */
18317 classes[i++] = PR_LO_REGS;
18318 classes[i++] = PR_HI_REGS;
18319 return i;
18320 }
18321
18322 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18323
18324 static bool
18325 aarch64_can_change_mode_class (machine_mode from,
18326 machine_mode to, reg_class_t)
18327 {
18328 if (BYTES_BIG_ENDIAN)
18329 {
18330 bool from_sve_p = aarch64_sve_data_mode_p (from);
18331 bool to_sve_p = aarch64_sve_data_mode_p (to);
18332
18333 /* Don't allow changes between SVE data modes and non-SVE modes.
18334 See the comment at the head of aarch64-sve.md for details. */
18335 if (from_sve_p != to_sve_p)
18336 return false;
18337
18338 /* Don't allow changes in element size: lane 0 of the new vector
18339 would not then be lane 0 of the old vector. See the comment
18340 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18341 description.
18342
18343 In the worst case, this forces a register to be spilled in
18344 one mode and reloaded in the other, which handles the
18345 endianness correctly. */
18346 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18347 return false;
18348 }
18349 return true;
18350 }
18351
18352 /* Implement TARGET_EARLY_REMAT_MODES. */
18353
18354 static void
18355 aarch64_select_early_remat_modes (sbitmap modes)
18356 {
18357 /* SVE values are not normally live across a call, so it should be
18358 worth doing early rematerialization even in VL-specific mode. */
18359 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18360 {
18361 machine_mode mode = (machine_mode) i;
18362 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18363 if (vec_flags & VEC_ANY_SVE)
18364 bitmap_set_bit (modes, i);
18365 }
18366 }
18367
18368 /* Override the default target speculation_safe_value. */
18369 static rtx
18370 aarch64_speculation_safe_value (machine_mode mode,
18371 rtx result, rtx val, rtx failval)
18372 {
18373 /* Maybe we should warn if falling back to hard barriers. They are
18374 likely to be noticably more expensive than the alternative below. */
18375 if (!aarch64_track_speculation)
18376 return default_speculation_safe_value (mode, result, val, failval);
18377
18378 if (!REG_P (val))
18379 val = copy_to_mode_reg (mode, val);
18380
18381 if (!aarch64_reg_or_zero (failval, mode))
18382 failval = copy_to_mode_reg (mode, failval);
18383
18384 emit_insn (gen_despeculate_copy (mode, result, val, failval));
18385 return result;
18386 }
18387
18388 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18389 Look into the tuning structure for an estimate.
18390 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18391 Advanced SIMD 128 bits. */
18392
18393 static HOST_WIDE_INT
18394 aarch64_estimated_poly_value (poly_int64 val)
18395 {
18396 enum aarch64_sve_vector_bits_enum width_source
18397 = aarch64_tune_params.sve_width;
18398
18399 /* If we still don't have an estimate, use the default. */
18400 if (width_source == SVE_SCALABLE)
18401 return default_estimated_poly_value (val);
18402
18403 HOST_WIDE_INT over_128 = width_source - 128;
18404 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18405 }
18406
18407 /* Target-specific selftests. */
18408
18409 #if CHECKING_P
18410
18411 namespace selftest {
18412
18413 /* Selftest for the RTL loader.
18414 Verify that the RTL loader copes with a dump from
18415 print_rtx_function. This is essentially just a test that class
18416 function_reader can handle a real dump, but it also verifies
18417 that lookup_reg_by_dump_name correctly handles hard regs.
18418 The presence of hard reg names in the dump means that the test is
18419 target-specific, hence it is in this file. */
18420
18421 static void
18422 aarch64_test_loading_full_dump ()
18423 {
18424 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
18425
18426 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
18427
18428 rtx_insn *insn_1 = get_insn_by_uid (1);
18429 ASSERT_EQ (NOTE, GET_CODE (insn_1));
18430
18431 rtx_insn *insn_15 = get_insn_by_uid (15);
18432 ASSERT_EQ (INSN, GET_CODE (insn_15));
18433 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
18434
18435 /* Verify crtl->return_rtx. */
18436 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
18437 ASSERT_EQ (0, REGNO (crtl->return_rtx));
18438 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
18439 }
18440
18441 /* Run all target-specific selftests. */
18442
18443 static void
18444 aarch64_run_selftests (void)
18445 {
18446 aarch64_test_loading_full_dump ();
18447 }
18448
18449 } // namespace selftest
18450
18451 #endif /* #if CHECKING_P */
18452
18453 #undef TARGET_ADDRESS_COST
18454 #define TARGET_ADDRESS_COST aarch64_address_cost
18455
18456 /* This hook will determines whether unnamed bitfields affect the alignment
18457 of the containing structure. The hook returns true if the structure
18458 should inherit the alignment requirements of an unnamed bitfield's
18459 type. */
18460 #undef TARGET_ALIGN_ANON_BITFIELD
18461 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
18462
18463 #undef TARGET_ASM_ALIGNED_DI_OP
18464 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
18465
18466 #undef TARGET_ASM_ALIGNED_HI_OP
18467 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
18468
18469 #undef TARGET_ASM_ALIGNED_SI_OP
18470 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
18471
18472 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
18473 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
18474 hook_bool_const_tree_hwi_hwi_const_tree_true
18475
18476 #undef TARGET_ASM_FILE_START
18477 #define TARGET_ASM_FILE_START aarch64_start_file
18478
18479 #undef TARGET_ASM_OUTPUT_MI_THUNK
18480 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
18481
18482 #undef TARGET_ASM_SELECT_RTX_SECTION
18483 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
18484
18485 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
18486 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
18487
18488 #undef TARGET_BUILD_BUILTIN_VA_LIST
18489 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
18490
18491 #undef TARGET_CALLEE_COPIES
18492 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
18493
18494 #undef TARGET_CAN_ELIMINATE
18495 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
18496
18497 #undef TARGET_CAN_INLINE_P
18498 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
18499
18500 #undef TARGET_CANNOT_FORCE_CONST_MEM
18501 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
18502
18503 #undef TARGET_CASE_VALUES_THRESHOLD
18504 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
18505
18506 #undef TARGET_CONDITIONAL_REGISTER_USAGE
18507 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
18508
18509 /* Only the least significant bit is used for initialization guard
18510 variables. */
18511 #undef TARGET_CXX_GUARD_MASK_BIT
18512 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
18513
18514 #undef TARGET_C_MODE_FOR_SUFFIX
18515 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
18516
18517 #ifdef TARGET_BIG_ENDIAN_DEFAULT
18518 #undef TARGET_DEFAULT_TARGET_FLAGS
18519 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
18520 #endif
18521
18522 #undef TARGET_CLASS_MAX_NREGS
18523 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
18524
18525 #undef TARGET_BUILTIN_DECL
18526 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
18527
18528 #undef TARGET_BUILTIN_RECIPROCAL
18529 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
18530
18531 #undef TARGET_C_EXCESS_PRECISION
18532 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
18533
18534 #undef TARGET_EXPAND_BUILTIN
18535 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
18536
18537 #undef TARGET_EXPAND_BUILTIN_VA_START
18538 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
18539
18540 #undef TARGET_FOLD_BUILTIN
18541 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
18542
18543 #undef TARGET_FUNCTION_ARG
18544 #define TARGET_FUNCTION_ARG aarch64_function_arg
18545
18546 #undef TARGET_FUNCTION_ARG_ADVANCE
18547 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
18548
18549 #undef TARGET_FUNCTION_ARG_BOUNDARY
18550 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
18551
18552 #undef TARGET_FUNCTION_ARG_PADDING
18553 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
18554
18555 #undef TARGET_GET_RAW_RESULT_MODE
18556 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
18557 #undef TARGET_GET_RAW_ARG_MODE
18558 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
18559
18560 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
18561 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
18562
18563 #undef TARGET_FUNCTION_VALUE
18564 #define TARGET_FUNCTION_VALUE aarch64_function_value
18565
18566 #undef TARGET_FUNCTION_VALUE_REGNO_P
18567 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
18568
18569 #undef TARGET_GIMPLE_FOLD_BUILTIN
18570 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
18571
18572 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
18573 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
18574
18575 #undef TARGET_INIT_BUILTINS
18576 #define TARGET_INIT_BUILTINS aarch64_init_builtins
18577
18578 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
18579 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
18580 aarch64_ira_change_pseudo_allocno_class
18581
18582 #undef TARGET_LEGITIMATE_ADDRESS_P
18583 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
18584
18585 #undef TARGET_LEGITIMATE_CONSTANT_P
18586 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
18587
18588 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
18589 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
18590 aarch64_legitimize_address_displacement
18591
18592 #undef TARGET_LIBGCC_CMP_RETURN_MODE
18593 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
18594
18595 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
18596 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
18597 aarch64_libgcc_floating_mode_supported_p
18598
18599 #undef TARGET_MANGLE_TYPE
18600 #define TARGET_MANGLE_TYPE aarch64_mangle_type
18601
18602 #undef TARGET_MEMORY_MOVE_COST
18603 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
18604
18605 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
18606 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
18607
18608 #undef TARGET_MUST_PASS_IN_STACK
18609 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
18610
18611 /* This target hook should return true if accesses to volatile bitfields
18612 should use the narrowest mode possible. It should return false if these
18613 accesses should use the bitfield container type. */
18614 #undef TARGET_NARROW_VOLATILE_BITFIELD
18615 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
18616
18617 #undef TARGET_OPTION_OVERRIDE
18618 #define TARGET_OPTION_OVERRIDE aarch64_override_options
18619
18620 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
18621 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
18622 aarch64_override_options_after_change
18623
18624 #undef TARGET_OPTION_SAVE
18625 #define TARGET_OPTION_SAVE aarch64_option_save
18626
18627 #undef TARGET_OPTION_RESTORE
18628 #define TARGET_OPTION_RESTORE aarch64_option_restore
18629
18630 #undef TARGET_OPTION_PRINT
18631 #define TARGET_OPTION_PRINT aarch64_option_print
18632
18633 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
18634 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
18635
18636 #undef TARGET_SET_CURRENT_FUNCTION
18637 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
18638
18639 #undef TARGET_PASS_BY_REFERENCE
18640 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
18641
18642 #undef TARGET_PREFERRED_RELOAD_CLASS
18643 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
18644
18645 #undef TARGET_SCHED_REASSOCIATION_WIDTH
18646 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
18647
18648 #undef TARGET_PROMOTED_TYPE
18649 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
18650
18651 #undef TARGET_SECONDARY_RELOAD
18652 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
18653
18654 #undef TARGET_SHIFT_TRUNCATION_MASK
18655 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
18656
18657 #undef TARGET_SETUP_INCOMING_VARARGS
18658 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
18659
18660 #undef TARGET_STRUCT_VALUE_RTX
18661 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
18662
18663 #undef TARGET_REGISTER_MOVE_COST
18664 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
18665
18666 #undef TARGET_RETURN_IN_MEMORY
18667 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
18668
18669 #undef TARGET_RETURN_IN_MSB
18670 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
18671
18672 #undef TARGET_RTX_COSTS
18673 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
18674
18675 #undef TARGET_SCALAR_MODE_SUPPORTED_P
18676 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
18677
18678 #undef TARGET_SCHED_ISSUE_RATE
18679 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
18680
18681 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
18682 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
18683 aarch64_sched_first_cycle_multipass_dfa_lookahead
18684
18685 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
18686 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
18687 aarch64_first_cycle_multipass_dfa_lookahead_guard
18688
18689 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
18690 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
18691 aarch64_get_separate_components
18692
18693 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
18694 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
18695 aarch64_components_for_bb
18696
18697 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
18698 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
18699 aarch64_disqualify_components
18700
18701 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
18702 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
18703 aarch64_emit_prologue_components
18704
18705 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
18706 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
18707 aarch64_emit_epilogue_components
18708
18709 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
18710 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
18711 aarch64_set_handled_components
18712
18713 #undef TARGET_TRAMPOLINE_INIT
18714 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
18715
18716 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
18717 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
18718
18719 #undef TARGET_VECTOR_MODE_SUPPORTED_P
18720 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
18721
18722 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
18723 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
18724 aarch64_builtin_support_vector_misalignment
18725
18726 #undef TARGET_ARRAY_MODE
18727 #define TARGET_ARRAY_MODE aarch64_array_mode
18728
18729 #undef TARGET_ARRAY_MODE_SUPPORTED_P
18730 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
18731
18732 #undef TARGET_VECTORIZE_ADD_STMT_COST
18733 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
18734
18735 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
18736 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
18737 aarch64_builtin_vectorization_cost
18738
18739 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
18740 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
18741
18742 #undef TARGET_VECTORIZE_BUILTINS
18743 #define TARGET_VECTORIZE_BUILTINS
18744
18745 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
18746 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
18747 aarch64_builtin_vectorized_function
18748
18749 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
18750 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
18751 aarch64_autovectorize_vector_sizes
18752
18753 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
18754 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
18755 aarch64_atomic_assign_expand_fenv
18756
18757 /* Section anchor support. */
18758
18759 #undef TARGET_MIN_ANCHOR_OFFSET
18760 #define TARGET_MIN_ANCHOR_OFFSET -256
18761
18762 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
18763 byte offset; we can do much more for larger data types, but have no way
18764 to determine the size of the access. We assume accesses are aligned. */
18765 #undef TARGET_MAX_ANCHOR_OFFSET
18766 #define TARGET_MAX_ANCHOR_OFFSET 4095
18767
18768 #undef TARGET_VECTOR_ALIGNMENT
18769 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
18770
18771 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
18772 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
18773 aarch64_vectorize_preferred_vector_alignment
18774 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
18775 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
18776 aarch64_simd_vector_alignment_reachable
18777
18778 /* vec_perm support. */
18779
18780 #undef TARGET_VECTORIZE_VEC_PERM_CONST
18781 #define TARGET_VECTORIZE_VEC_PERM_CONST \
18782 aarch64_vectorize_vec_perm_const
18783
18784 #undef TARGET_VECTORIZE_GET_MASK_MODE
18785 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
18786 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
18787 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
18788 aarch64_empty_mask_is_expensive
18789 #undef TARGET_PREFERRED_ELSE_VALUE
18790 #define TARGET_PREFERRED_ELSE_VALUE \
18791 aarch64_preferred_else_value
18792
18793 #undef TARGET_INIT_LIBFUNCS
18794 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
18795
18796 #undef TARGET_FIXED_CONDITION_CODE_REGS
18797 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
18798
18799 #undef TARGET_FLAGS_REGNUM
18800 #define TARGET_FLAGS_REGNUM CC_REGNUM
18801
18802 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
18803 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
18804
18805 #undef TARGET_ASAN_SHADOW_OFFSET
18806 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18807
18808 #undef TARGET_LEGITIMIZE_ADDRESS
18809 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18810
18811 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18812 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18813
18814 #undef TARGET_CAN_USE_DOLOOP_P
18815 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18816
18817 #undef TARGET_SCHED_ADJUST_PRIORITY
18818 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18819
18820 #undef TARGET_SCHED_MACRO_FUSION_P
18821 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18822
18823 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18824 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18825
18826 #undef TARGET_SCHED_FUSION_PRIORITY
18827 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18828
18829 #undef TARGET_UNSPEC_MAY_TRAP_P
18830 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18831
18832 #undef TARGET_USE_PSEUDO_PIC_REG
18833 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18834
18835 #undef TARGET_PRINT_OPERAND
18836 #define TARGET_PRINT_OPERAND aarch64_print_operand
18837
18838 #undef TARGET_PRINT_OPERAND_ADDRESS
18839 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18840
18841 #undef TARGET_OPTAB_SUPPORTED_P
18842 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18843
18844 #undef TARGET_OMIT_STRUCT_RETURN_REG
18845 #define TARGET_OMIT_STRUCT_RETURN_REG true
18846
18847 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18848 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18849 aarch64_dwarf_poly_indeterminate_value
18850
18851 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
18852 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18853 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18854
18855 #undef TARGET_HARD_REGNO_NREGS
18856 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18857 #undef TARGET_HARD_REGNO_MODE_OK
18858 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18859
18860 #undef TARGET_MODES_TIEABLE_P
18861 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18862
18863 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18864 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18865 aarch64_hard_regno_call_part_clobbered
18866
18867 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
18868 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
18869 aarch64_remove_extra_call_preserved_regs
18870
18871 #undef TARGET_CONSTANT_ALIGNMENT
18872 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18873
18874 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
18875 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
18876 aarch64_stack_clash_protection_alloca_probe_range
18877
18878 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18879 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18880
18881 #undef TARGET_CAN_CHANGE_MODE_CLASS
18882 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18883
18884 #undef TARGET_SELECT_EARLY_REMAT_MODES
18885 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18886
18887 #undef TARGET_SPECULATION_SAFE_VALUE
18888 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
18889
18890 #undef TARGET_ESTIMATED_POLY_VALUE
18891 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
18892
18893 #undef TARGET_ATTRIBUTE_TABLE
18894 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
18895
18896 #if CHECKING_P
18897 #undef TARGET_RUN_TARGET_SELFTESTS
18898 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18899 #endif /* #if CHECKING_P */
18900
18901 struct gcc_target targetm = TARGET_INITIALIZER;
18902
18903 #include "gt-aarch64.h"