1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95
96 /* The mode of the elements. */
97 scalar_mode elt_mode;
98
99 /* The value of each element if all elements are the same, or the
100 first value if the constant is a series. */
101 rtx value;
102
103 /* The value of the step if the constant is a series, null otherwise. */
104 rtx step;
105
106 /* The instruction to use to move the immediate into a vector. */
107 insn_type insn;
108
109 /* The kind of shift modifier to use, and the number of bits to shift.
110 This is (LSL, 0) if no shift is needed. */
111 modifier_type modifier;
112 unsigned int shift;
113 };
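/* For example (illustrative): a V4SI constant whose elements are all
   0x00ff0000 can be described as (SImode, 0xff, MOV, LSL, 16), i.e. a
   MOVI with a shifted immediate, while a series constant such as
   { 1, 3, 5, ... } is described by value 1 and step 2.  */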
114
115 /* Construct a floating-point immediate in which each element has mode
116 ELT_MODE_IN and value VALUE_IN. */
117 inline simd_immediate_info
118 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
119 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
120 modifier (LSL), shift (0)
121 {}
122
123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
124 and value VALUE_IN. The other parameters are as for the structure
125 fields. */
126 inline simd_immediate_info
127 ::simd_immediate_info (scalar_int_mode elt_mode_in,
128 unsigned HOST_WIDE_INT value_in,
129 insn_type insn_in, modifier_type modifier_in,
130 unsigned int shift_in)
131 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
132 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 {}
134
135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
136 and where element I is equal to VALUE_IN + I * STEP_IN. */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
139 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
140 modifier (LSL), shift (0)
141 {}
142
143 /* The current code model. */
144 enum aarch64_code_model aarch64_cmodel;
145
146 /* The number of 64-bit elements in an SVE vector. */
147 poly_uint16 aarch64_sve_vg;
148
149 #ifdef HAVE_AS_TLS
150 #undef TARGET_HAVE_TLS
151 #define TARGET_HAVE_TLS 1
152 #endif
153
154 static bool aarch64_composite_type_p (const_tree, machine_mode);
155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
156 const_tree,
157 machine_mode *, int *,
158 bool *);
159 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
160 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
161 static void aarch64_override_options_after_change (void);
162 static bool aarch64_vector_mode_supported_p (machine_mode);
163 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
165 const_tree type,
166 int misalignment,
167 bool is_packed);
168 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
169 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
170 aarch64_addr_query_type);
171 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
172
173 /* Major revision number of the ARM Architecture implemented by the target. */
174 unsigned aarch64_architecture_version;
175
176 /* The processor for which instructions should be scheduled. */
177 enum aarch64_processor aarch64_tune = cortexa53;
178
179 /* Mask to specify which instruction scheduling options should be used. */
180 unsigned long aarch64_tune_flags = 0;
181
182 /* Global flag for PC relative loads. */
183 bool aarch64_pcrelative_literal_loads;
184
185 /* Global flag for whether frame pointer is enabled. */
186 bool aarch64_use_frame_pointer;
187
188 #define BRANCH_PROTECT_STR_MAX 255
189 char *accepted_branch_protection_string = NULL;
190
191 static enum aarch64_parse_opt_result
192 aarch64_parse_branch_protection (const char*, char**);
193
194 /* Support for command line parsing of boolean flags in the tuning
195 structures. */
196 struct aarch64_flag_desc
197 {
198 const char* name;
199 unsigned int flag;
200 };
201
202 #define AARCH64_FUSION_PAIR(name, internal_name) \
203 { name, AARCH64_FUSE_##internal_name },
204 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
205 {
206 { "none", AARCH64_FUSE_NOTHING },
207 #include "aarch64-fusion-pairs.def"
208 { "all", AARCH64_FUSE_ALL },
209 { NULL, AARCH64_FUSE_NOTHING }
210 };
211
212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
213 { name, AARCH64_EXTRA_TUNE_##internal_name },
214 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
215 {
216 { "none", AARCH64_EXTRA_TUNE_NONE },
217 #include "aarch64-tuning-flags.def"
218 { "all", AARCH64_EXTRA_TUNE_ALL },
219 { NULL, AARCH64_EXTRA_TUNE_NONE }
220 };
221
222 /* Tuning parameters. */
223
224 static const struct cpu_addrcost_table generic_addrcost_table =
225 {
226 {
227 1, /* hi */
228 0, /* si */
229 0, /* di */
230 1, /* ti */
231 },
232 0, /* pre_modify */
233 0, /* post_modify */
234 0, /* register_offset */
235 0, /* register_sextend */
236 0, /* register_zextend */
237 0 /* imm_offset */
238 };
239
240 static const struct cpu_addrcost_table exynosm1_addrcost_table =
241 {
242 {
243 0, /* hi */
244 0, /* si */
245 0, /* di */
246 2, /* ti */
247 },
248 0, /* pre_modify */
249 0, /* post_modify */
250 1, /* register_offset */
251 1, /* register_sextend */
252 2, /* register_zextend */
253 0, /* imm_offset */
254 };
255
256 static const struct cpu_addrcost_table xgene1_addrcost_table =
257 {
258 {
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
263 },
264 1, /* pre_modify */
265 1, /* post_modify */
266 0, /* register_offset */
267 1, /* register_sextend */
268 1, /* register_zextend */
269 0, /* imm_offset */
270 };
271
272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
273 {
274 {
275 1, /* hi */
276 1, /* si */
277 1, /* di */
278 2, /* ti */
279 },
280 0, /* pre_modify */
281 0, /* post_modify */
282 2, /* register_offset */
283 3, /* register_sextend */
284 3, /* register_zextend */
285 0, /* imm_offset */
286 };
287
288 static const struct cpu_addrcost_table tsv110_addrcost_table =
289 {
290 {
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
295 },
296 0, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
302 };
303
304 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
305 {
306 {
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
311 },
312 1, /* pre_modify */
313 1, /* post_modify */
314 3, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 2, /* imm_offset */
318 };
319
320 static const struct cpu_regmove_cost generic_regmove_cost =
321 {
322 1, /* GP2GP */
323 /* Avoid the use of slow int<->fp moves for spilling by setting
324 their cost higher than memmov_cost. */
325 5, /* GP2FP */
326 5, /* FP2GP */
327 2 /* FP2FP */
328 };
329
330 static const struct cpu_regmove_cost cortexa57_regmove_cost =
331 {
332 1, /* GP2GP */
333 /* Avoid the use of slow int<->fp moves for spilling by setting
334 their cost higher than memmov_cost. */
335 5, /* GP2FP */
336 5, /* FP2GP */
337 2 /* FP2FP */
338 };
339
340 static const struct cpu_regmove_cost cortexa53_regmove_cost =
341 {
342 1, /* GP2GP */
343 /* Avoid the use of slow int<->fp moves for spilling by setting
344 their cost higher than memmov_cost. */
345 5, /* GP2FP */
346 5, /* FP2GP */
347 2 /* FP2FP */
348 };
349
350 static const struct cpu_regmove_cost exynosm1_regmove_cost =
351 {
352 1, /* GP2GP */
353 /* Avoid the use of slow int<->fp moves for spilling by setting
354 their cost higher than memmov_cost (the actual costs are 4 and 9). */
355 9, /* GP2FP */
356 9, /* FP2GP */
357 1 /* FP2FP */
358 };
359
360 static const struct cpu_regmove_cost thunderx_regmove_cost =
361 {
362 2, /* GP2GP */
363 2, /* GP2FP */
364 6, /* FP2GP */
365 4 /* FP2FP */
366 };
367
368 static const struct cpu_regmove_cost xgene1_regmove_cost =
369 {
370 1, /* GP2GP */
371 /* Avoid the use of slow int<->fp moves for spilling by setting
372 their cost higher than memmov_cost. */
373 8, /* GP2FP */
374 8, /* FP2GP */
375 2 /* FP2FP */
376 };
377
378 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
379 {
380 2, /* GP2GP */
381 /* Avoid the use of int<->fp moves for spilling. */
382 6, /* GP2FP */
383 6, /* FP2GP */
384 4 /* FP2FP */
385 };
386
387 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
388 {
389 1, /* GP2GP */
390 /* Avoid the use of int<->fp moves for spilling. */
391 8, /* GP2FP */
392 8, /* FP2GP */
393 4 /* FP2FP */
394 };
395
396 static const struct cpu_regmove_cost tsv110_regmove_cost =
397 {
398 1, /* GP2GP */
399 /* Avoid the use of slow int<->fp moves for spilling by setting
400 their cost higher than memmov_cost. */
401 2, /* GP2FP */
402 3, /* FP2GP */
403 2 /* FP2FP */
404 };
405
406 /* Generic costs for vector insn classes. */
407 static const struct cpu_vector_cost generic_vector_cost =
408 {
409 1, /* scalar_int_stmt_cost */
410 1, /* scalar_fp_stmt_cost */
411 1, /* scalar_load_cost */
412 1, /* scalar_store_cost */
413 1, /* vec_int_stmt_cost */
414 1, /* vec_fp_stmt_cost */
415 2, /* vec_permute_cost */
416 1, /* vec_to_scalar_cost */
417 1, /* scalar_to_vec_cost */
418 1, /* vec_align_load_cost */
419 1, /* vec_unalign_load_cost */
420 1, /* vec_unalign_store_cost */
421 1, /* vec_store_cost */
422 3, /* cond_taken_branch_cost */
423 1 /* cond_not_taken_branch_cost */
424 };
425
426 /* QDF24XX costs for vector insn classes. */
427 static const struct cpu_vector_cost qdf24xx_vector_cost =
428 {
429 1, /* scalar_int_stmt_cost */
430 1, /* scalar_fp_stmt_cost */
431 1, /* scalar_load_cost */
432 1, /* scalar_store_cost */
433 1, /* vec_int_stmt_cost */
434 3, /* vec_fp_stmt_cost */
435 2, /* vec_permute_cost */
436 1, /* vec_to_scalar_cost */
437 1, /* scalar_to_vec_cost */
438 1, /* vec_align_load_cost */
439 1, /* vec_unalign_load_cost */
440 1, /* vec_unalign_store_cost */
441 1, /* vec_store_cost */
442 3, /* cond_taken_branch_cost */
443 1 /* cond_not_taken_branch_cost */
444 };
445
446 /* ThunderX costs for vector insn classes. */
447 static const struct cpu_vector_cost thunderx_vector_cost =
448 {
449 1, /* scalar_int_stmt_cost */
450 1, /* scalar_fp_stmt_cost */
451 3, /* scalar_load_cost */
452 1, /* scalar_store_cost */
453 4, /* vec_int_stmt_cost */
454 1, /* vec_fp_stmt_cost */
455 4, /* vec_permute_cost */
456 2, /* vec_to_scalar_cost */
457 2, /* scalar_to_vec_cost */
458 3, /* vec_align_load_cost */
459 5, /* vec_unalign_load_cost */
460 5, /* vec_unalign_store_cost */
461 1, /* vec_store_cost */
462 3, /* cond_taken_branch_cost */
463 3 /* cond_not_taken_branch_cost */
464 };
465
466 static const struct cpu_vector_cost tsv110_vector_cost =
467 {
468 1, /* scalar_int_stmt_cost */
469 1, /* scalar_fp_stmt_cost */
470 5, /* scalar_load_cost */
471 1, /* scalar_store_cost */
472 2, /* vec_int_stmt_cost */
473 2, /* vec_fp_stmt_cost */
474 2, /* vec_permute_cost */
475 3, /* vec_to_scalar_cost */
476 2, /* scalar_to_vec_cost */
477 5, /* vec_align_load_cost */
478 5, /* vec_unalign_load_cost */
479 1, /* vec_unalign_store_cost */
480 1, /* vec_store_cost */
481 1, /* cond_taken_branch_cost */
482 1 /* cond_not_taken_branch_cost */
483 };
484
485 /* Generic costs for vector insn classes. */
486 static const struct cpu_vector_cost cortexa57_vector_cost =
487 {
488 1, /* scalar_int_stmt_cost */
489 1, /* scalar_fp_stmt_cost */
490 4, /* scalar_load_cost */
491 1, /* scalar_store_cost */
492 2, /* vec_int_stmt_cost */
493 2, /* vec_fp_stmt_cost */
494 3, /* vec_permute_cost */
495 8, /* vec_to_scalar_cost */
496 8, /* scalar_to_vec_cost */
497 4, /* vec_align_load_cost */
498 4, /* vec_unalign_load_cost */
499 1, /* vec_unalign_store_cost */
500 1, /* vec_store_cost */
501 1, /* cond_taken_branch_cost */
502 1 /* cond_not_taken_branch_cost */
503 };
504
505 static const struct cpu_vector_cost exynosm1_vector_cost =
506 {
507 1, /* scalar_int_stmt_cost */
508 1, /* scalar_fp_stmt_cost */
509 5, /* scalar_load_cost */
510 1, /* scalar_store_cost */
511 3, /* vec_int_stmt_cost */
512 3, /* vec_fp_stmt_cost */
513 3, /* vec_permute_cost */
514 3, /* vec_to_scalar_cost */
515 3, /* scalar_to_vec_cost */
516 5, /* vec_align_load_cost */
517 5, /* vec_unalign_load_cost */
518 1, /* vec_unalign_store_cost */
519 1, /* vec_store_cost */
520 1, /* cond_taken_branch_cost */
521 1 /* cond_not_taken_branch_cost */
522 };
523
524 /* Generic costs for vector insn classes. */
525 static const struct cpu_vector_cost xgene1_vector_cost =
526 {
527 1, /* scalar_int_stmt_cost */
528 1, /* scalar_fp_stmt_cost */
529 5, /* scalar_load_cost */
530 1, /* scalar_store_cost */
531 2, /* vec_int_stmt_cost */
532 2, /* vec_fp_stmt_cost */
533 2, /* vec_permute_cost */
534 4, /* vec_to_scalar_cost */
535 4, /* scalar_to_vec_cost */
536 10, /* vec_align_load_cost */
537 10, /* vec_unalign_load_cost */
538 2, /* vec_unalign_store_cost */
539 2, /* vec_store_cost */
540 2, /* cond_taken_branch_cost */
541 1 /* cond_not_taken_branch_cost */
542 };
543
544 /* Costs for vector insn classes for Vulcan. */
545 static const struct cpu_vector_cost thunderx2t99_vector_cost =
546 {
547 1, /* scalar_int_stmt_cost */
548 6, /* scalar_fp_stmt_cost */
549 4, /* scalar_load_cost */
550 1, /* scalar_store_cost */
551 5, /* vec_int_stmt_cost */
552 6, /* vec_fp_stmt_cost */
553 3, /* vec_permute_cost */
554 6, /* vec_to_scalar_cost */
555 5, /* scalar_to_vec_cost */
556 8, /* vec_align_load_cost */
557 8, /* vec_unalign_load_cost */
558 4, /* vec_unalign_store_cost */
559 4, /* vec_store_cost */
560 2, /* cond_taken_branch_cost */
561 1 /* cond_not_taken_branch_cost */
562 };
563
564 /* Generic costs for branch instructions. */
565 static const struct cpu_branch_cost generic_branch_cost =
566 {
567 1, /* Predictable. */
568 3 /* Unpredictable. */
569 };
570
571 /* Generic approximation modes. */
572 static const cpu_approx_modes generic_approx_modes =
573 {
574 AARCH64_APPROX_NONE, /* division */
575 AARCH64_APPROX_NONE, /* sqrt */
576 AARCH64_APPROX_NONE /* recip_sqrt */
577 };
578
579 /* Approximation modes for Exynos M1. */
580 static const cpu_approx_modes exynosm1_approx_modes =
581 {
582 AARCH64_APPROX_NONE, /* division */
583 AARCH64_APPROX_ALL, /* sqrt */
584 AARCH64_APPROX_ALL /* recip_sqrt */
585 };
586
587 /* Approximation modes for X-Gene 1. */
588 static const cpu_approx_modes xgene1_approx_modes =
589 {
590 AARCH64_APPROX_NONE, /* division */
591 AARCH64_APPROX_NONE, /* sqrt */
592 AARCH64_APPROX_ALL /* recip_sqrt */
593 };
594
595 /* Generic prefetch settings (which disable prefetch). */
596 static const cpu_prefetch_tune generic_prefetch_tune =
597 {
598 0, /* num_slots */
599 -1, /* l1_cache_size */
600 -1, /* l1_cache_line_size */
601 -1, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 -1 /* default_opt_level */
605 };
606
607 static const cpu_prefetch_tune exynosm1_prefetch_tune =
608 {
609 0, /* num_slots */
610 -1, /* l1_cache_size */
611 64, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
616 };
617
618 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
619 {
620 4, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 512, /* l2_cache_size */
624 false, /* prefetch_dynamic_strides */
625 2048, /* minimum_stride */
626 3 /* default_opt_level */
627 };
628
629 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
630 {
631 8, /* num_slots */
632 32, /* l1_cache_size */
633 128, /* l1_cache_line_size */
634 16*1024, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 3 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune thunderx_prefetch_tune =
641 {
642 8, /* num_slots */
643 32, /* l1_cache_size */
644 128, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
652 {
653 8, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 256, /* l2_cache_size */
657 true, /* prefetch_dynamic_strides */
658 -1, /* minimum_stride */
659 -1 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune tsv110_prefetch_tune =
663 {
664 0, /* num_slots */
665 64, /* l1_cache_size */
666 64, /* l1_cache_line_size */
667 512, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 -1 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune xgene1_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 64, /* l1_cache_line_size */
678 256, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const struct tune_params generic_tunings =
685 {
686 &cortexa57_extra_costs,
687 &generic_addrcost_table,
688 &generic_regmove_cost,
689 &generic_vector_cost,
690 &generic_branch_cost,
691 &generic_approx_modes,
692 SVE_NOT_IMPLEMENTED, /* sve_width */
693 4, /* memmov_cost */
694 2, /* issue_rate */
695 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
696 "8", /* function_align. */
697 "4", /* jump_align. */
698 "8", /* loop_align. */
699 2, /* int_reassoc_width. */
700 4, /* fp_reassoc_width. */
701 1, /* vec_reassoc_width. */
702 2, /* min_div_recip_mul_sf. */
703 2, /* min_div_recip_mul_df. */
704 0, /* max_case_values. */
705 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
706 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
707 &generic_prefetch_tune
708 };
709
710 static const struct tune_params cortexa35_tunings =
711 {
712 &cortexa53_extra_costs,
713 &generic_addrcost_table,
714 &cortexa53_regmove_cost,
715 &generic_vector_cost,
716 &generic_branch_cost,
717 &generic_approx_modes,
718 SVE_NOT_IMPLEMENTED, /* sve_width */
719 4, /* memmov_cost */
720 1, /* issue_rate */
721 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
723 "16", /* function_align. */
724 "4", /* jump_align. */
725 "8", /* loop_align. */
726 2, /* int_reassoc_width. */
727 4, /* fp_reassoc_width. */
728 1, /* vec_reassoc_width. */
729 2, /* min_div_recip_mul_sf. */
730 2, /* min_div_recip_mul_df. */
731 0, /* max_case_values. */
732 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
733 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
734 &generic_prefetch_tune
735 };
736
737 static const struct tune_params cortexa53_tunings =
738 {
739 &cortexa53_extra_costs,
740 &generic_addrcost_table,
741 &cortexa53_regmove_cost,
742 &generic_vector_cost,
743 &generic_branch_cost,
744 &generic_approx_modes,
745 SVE_NOT_IMPLEMENTED, /* sve_width */
746 4, /* memmov_cost */
747 2, /* issue_rate */
748 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
749 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
750 "16", /* function_align. */
751 "4", /* jump_align. */
752 "8", /* loop_align. */
753 2, /* int_reassoc_width. */
754 4, /* fp_reassoc_width. */
755 1, /* vec_reassoc_width. */
756 2, /* min_div_recip_mul_sf. */
757 2, /* min_div_recip_mul_df. */
758 0, /* max_case_values. */
759 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
760 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
761 &generic_prefetch_tune
762 };
763
764 static const struct tune_params cortexa57_tunings =
765 {
766 &cortexa57_extra_costs,
767 &generic_addrcost_table,
768 &cortexa57_regmove_cost,
769 &cortexa57_vector_cost,
770 &generic_branch_cost,
771 &generic_approx_modes,
772 SVE_NOT_IMPLEMENTED, /* sve_width */
773 4, /* memmov_cost */
774 3, /* issue_rate */
775 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
776 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
777 "16", /* function_align. */
778 "4", /* jump_align. */
779 "8", /* loop_align. */
780 2, /* int_reassoc_width. */
781 4, /* fp_reassoc_width. */
782 1, /* vec_reassoc_width. */
783 2, /* min_div_recip_mul_sf. */
784 2, /* min_div_recip_mul_df. */
785 0, /* max_case_values. */
786 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
787 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
788 &generic_prefetch_tune
789 };
790
791 static const struct tune_params cortexa72_tunings =
792 {
793 &cortexa57_extra_costs,
794 &generic_addrcost_table,
795 &cortexa57_regmove_cost,
796 &cortexa57_vector_cost,
797 &generic_branch_cost,
798 &generic_approx_modes,
799 SVE_NOT_IMPLEMENTED, /* sve_width */
800 4, /* memmov_cost */
801 3, /* issue_rate */
802 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
803 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
804 "16", /* function_align. */
805 "4", /* jump_align. */
806 "8", /* loop_align. */
807 2, /* int_reassoc_width. */
808 4, /* fp_reassoc_width. */
809 1, /* vec_reassoc_width. */
810 2, /* min_div_recip_mul_sf. */
811 2, /* min_div_recip_mul_df. */
812 0, /* max_case_values. */
813 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
814 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
815 &generic_prefetch_tune
816 };
817
818 static const struct tune_params cortexa73_tunings =
819 {
820 &cortexa57_extra_costs,
821 &generic_addrcost_table,
822 &cortexa57_regmove_cost,
823 &cortexa57_vector_cost,
824 &generic_branch_cost,
825 &generic_approx_modes,
826 SVE_NOT_IMPLEMENTED, /* sve_width */
827 4, /* memmov_cost. */
828 2, /* issue_rate. */
829 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
830 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
831 "16", /* function_align. */
832 "4", /* jump_align. */
833 "8", /* loop_align. */
834 2, /* int_reassoc_width. */
835 4, /* fp_reassoc_width. */
836 1, /* vec_reassoc_width. */
837 2, /* min_div_recip_mul_sf. */
838 2, /* min_div_recip_mul_df. */
839 0, /* max_case_values. */
840 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
841 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
842 &generic_prefetch_tune
843 };
844
845
846
847 static const struct tune_params exynosm1_tunings =
848 {
849 &exynosm1_extra_costs,
850 &exynosm1_addrcost_table,
851 &exynosm1_regmove_cost,
852 &exynosm1_vector_cost,
853 &generic_branch_cost,
854 &exynosm1_approx_modes,
855 SVE_NOT_IMPLEMENTED, /* sve_width */
856 4, /* memmov_cost */
857 3, /* issue_rate */
858 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
859 "4", /* function_align. */
860 "4", /* jump_align. */
861 "4", /* loop_align. */
862 2, /* int_reassoc_width. */
863 4, /* fp_reassoc_width. */
864 1, /* vec_reassoc_width. */
865 2, /* min_div_recip_mul_sf. */
866 2, /* min_div_recip_mul_df. */
867 48, /* max_case_values. */
868 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
869 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
870 &exynosm1_prefetch_tune
871 };
872
873 static const struct tune_params thunderxt88_tunings =
874 {
875 &thunderx_extra_costs,
876 &generic_addrcost_table,
877 &thunderx_regmove_cost,
878 &thunderx_vector_cost,
879 &generic_branch_cost,
880 &generic_approx_modes,
881 SVE_NOT_IMPLEMENTED, /* sve_width */
882 6, /* memmov_cost */
883 2, /* issue_rate */
884 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
885 "8", /* function_align. */
886 "8", /* jump_align. */
887 "8", /* loop_align. */
888 2, /* int_reassoc_width. */
889 4, /* fp_reassoc_width. */
890 1, /* vec_reassoc_width. */
891 2, /* min_div_recip_mul_sf. */
892 2, /* min_div_recip_mul_df. */
893 0, /* max_case_values. */
894 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
895 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
896 &thunderxt88_prefetch_tune
897 };
898
899 static const struct tune_params thunderx_tunings =
900 {
901 &thunderx_extra_costs,
902 &generic_addrcost_table,
903 &thunderx_regmove_cost,
904 &thunderx_vector_cost,
905 &generic_branch_cost,
906 &generic_approx_modes,
907 SVE_NOT_IMPLEMENTED, /* sve_width */
908 6, /* memmov_cost */
909 2, /* issue_rate */
910 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
911 "8", /* function_align. */
912 "8", /* jump_align. */
913 "8", /* loop_align. */
914 2, /* int_reassoc_width. */
915 4, /* fp_reassoc_width. */
916 1, /* vec_reassoc_width. */
917 2, /* min_div_recip_mul_sf. */
918 2, /* min_div_recip_mul_df. */
919 0, /* max_case_values. */
920 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
921 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
922 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
923 &thunderx_prefetch_tune
924 };
925
926 static const struct tune_params tsv110_tunings =
927 {
928 &tsv110_extra_costs,
929 &tsv110_addrcost_table,
930 &tsv110_regmove_cost,
931 &tsv110_vector_cost,
932 &generic_branch_cost,
933 &generic_approx_modes,
934 SVE_NOT_IMPLEMENTED, /* sve_width */
935 4, /* memmov_cost */
936 4, /* issue_rate */
937 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
938 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
939 "16", /* function_align. */
940 "4", /* jump_align. */
941 "8", /* loop_align. */
942 2, /* int_reassoc_width. */
943 4, /* fp_reassoc_width. */
944 1, /* vec_reassoc_width. */
945 2, /* min_div_recip_mul_sf. */
946 2, /* min_div_recip_mul_df. */
947 0, /* max_case_values. */
948 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
949 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
950 &tsv110_prefetch_tune
951 };
952
953 static const struct tune_params xgene1_tunings =
954 {
955 &xgene1_extra_costs,
956 &xgene1_addrcost_table,
957 &xgene1_regmove_cost,
958 &xgene1_vector_cost,
959 &generic_branch_cost,
960 &xgene1_approx_modes,
961 SVE_NOT_IMPLEMENTED, /* sve_width */
962 6, /* memmov_cost */
963 4, /* issue_rate */
964 AARCH64_FUSE_NOTHING, /* fusible_ops */
965 "16", /* function_align. */
966 "16", /* jump_align. */
967 "16", /* loop_align. */
968 2, /* int_reassoc_width. */
969 4, /* fp_reassoc_width. */
970 1, /* vec_reassoc_width. */
971 2, /* min_div_recip_mul_sf. */
972 2, /* min_div_recip_mul_df. */
973 17, /* max_case_values. */
974 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
975 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
976 &xgene1_prefetch_tune
977 };
978
979 static const struct tune_params emag_tunings =
980 {
981 &xgene1_extra_costs,
982 &xgene1_addrcost_table,
983 &xgene1_regmove_cost,
984 &xgene1_vector_cost,
985 &generic_branch_cost,
986 &xgene1_approx_modes,
987 SVE_NOT_IMPLEMENTED,
988 6, /* memmov_cost */
989 4, /* issue_rate */
990 AARCH64_FUSE_NOTHING, /* fusible_ops */
991 "16", /* function_align. */
992 "16", /* jump_align. */
993 "16", /* loop_align. */
994 2, /* int_reassoc_width. */
995 4, /* fp_reassoc_width. */
996 1, /* vec_reassoc_width. */
997 2, /* min_div_recip_mul_sf. */
998 2, /* min_div_recip_mul_df. */
999 17, /* max_case_values. */
1000 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1001 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1002 &xgene1_prefetch_tune
1003 };
1004
1005 static const struct tune_params qdf24xx_tunings =
1006 {
1007 &qdf24xx_extra_costs,
1008 &qdf24xx_addrcost_table,
1009 &qdf24xx_regmove_cost,
1010 &qdf24xx_vector_cost,
1011 &generic_branch_cost,
1012 &generic_approx_modes,
1013 SVE_NOT_IMPLEMENTED, /* sve_width */
1014 4, /* memmov_cost */
1015 4, /* issue_rate */
1016 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1018 "16", /* function_align. */
1019 "8", /* jump_align. */
1020 "16", /* loop_align. */
1021 2, /* int_reassoc_width. */
1022 4, /* fp_reassoc_width. */
1023 1, /* vec_reassoc_width. */
1024 2, /* min_div_recip_mul_sf. */
1025 2, /* min_div_recip_mul_df. */
1026 0, /* max_case_values. */
1027 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1028 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1029 &qdf24xx_prefetch_tune
1030 };
1031
1032 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1033 for now. */
1034 static const struct tune_params saphira_tunings =
1035 {
1036 &generic_extra_costs,
1037 &generic_addrcost_table,
1038 &generic_regmove_cost,
1039 &generic_vector_cost,
1040 &generic_branch_cost,
1041 &generic_approx_modes,
1042 SVE_NOT_IMPLEMENTED, /* sve_width */
1043 4, /* memmov_cost */
1044 4, /* issue_rate */
1045 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1046 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1047 "16", /* function_align. */
1048 "8", /* jump_align. */
1049 "16", /* loop_align. */
1050 2, /* int_reassoc_width. */
1051 4, /* fp_reassoc_width. */
1052 1, /* vec_reassoc_width. */
1053 2, /* min_div_recip_mul_sf. */
1054 2, /* min_div_recip_mul_df. */
1055 0, /* max_case_values. */
1056 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1057 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1058 &generic_prefetch_tune
1059 };
1060
1061 static const struct tune_params thunderx2t99_tunings =
1062 {
1063 &thunderx2t99_extra_costs,
1064 &thunderx2t99_addrcost_table,
1065 &thunderx2t99_regmove_cost,
1066 &thunderx2t99_vector_cost,
1067 &generic_branch_cost,
1068 &generic_approx_modes,
1069 SVE_NOT_IMPLEMENTED, /* sve_width */
1070 4, /* memmov_cost. */
1071 4, /* issue_rate. */
1072 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1073 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1074 "16", /* function_align. */
1075 "8", /* jump_align. */
1076 "16", /* loop_align. */
1077 3, /* int_reassoc_width. */
1078 2, /* fp_reassoc_width. */
1079 2, /* vec_reassoc_width. */
1080 2, /* min_div_recip_mul_sf. */
1081 2, /* min_div_recip_mul_df. */
1082 0, /* max_case_values. */
1083 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1084 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1085 &thunderx2t99_prefetch_tune
1086 };
1087
1088 static const struct tune_params ares_tunings =
1089 {
1090 &cortexa57_extra_costs,
1091 &generic_addrcost_table,
1092 &generic_regmove_cost,
1093 &cortexa57_vector_cost,
1094 &generic_branch_cost,
1095 &generic_approx_modes,
1096 SVE_NOT_IMPLEMENTED, /* sve_width */
1097 4, /* memmov_cost */
1098 3, /* issue_rate */
1099 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1100 "32:16", /* function_align. */
1101 "32:16", /* jump_align. */
1102 "32:16", /* loop_align. */
1103 2, /* int_reassoc_width. */
1104 4, /* fp_reassoc_width. */
1105 2, /* vec_reassoc_width. */
1106 2, /* min_div_recip_mul_sf. */
1107 2, /* min_div_recip_mul_df. */
1108 0, /* max_case_values. */
1109 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1110 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1111 &generic_prefetch_tune
1112 };
1113
1114 /* Support for fine-grained override of the tuning structures. */
1115 struct aarch64_tuning_override_function
1116 {
1117 const char* name;
1118 void (*parse_override)(const char*, struct tune_params*);
1119 };
1120
1121 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1122 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1123 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1124
1125 static const struct aarch64_tuning_override_function
1126 aarch64_tuning_override_functions[] =
1127 {
1128 { "fuse", aarch64_parse_fuse_string },
1129 { "tune", aarch64_parse_tune_string },
1130 { "sve_width", aarch64_parse_sve_width_string },
1131 { NULL, NULL }
1132 };
1133
1134 /* A processor implementing AArch64. */
1135 struct processor
1136 {
1137 const char *const name;
1138 enum aarch64_processor ident;
1139 enum aarch64_processor sched_core;
1140 enum aarch64_arch arch;
1141 unsigned architecture_version;
1142 const unsigned long flags;
1143 const struct tune_params *const tune;
1144 };
1145
1146 /* Architectures implementing AArch64. */
1147 static const struct processor all_architectures[] =
1148 {
1149 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1150 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1151 #include "aarch64-arches.def"
1152 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1153 };
1154
1155 /* Processor cores implementing AArch64. */
1156 static const struct processor all_cores[] =
1157 {
1158 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1159 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1160 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1161 FLAGS, &COSTS##_tunings},
1162 #include "aarch64-cores.def"
1163 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1164 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1165 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1166 };
1167
1168
1169 /* Target specification. These are populated by the -march, -mtune, -mcpu
1170 handling code or by target attributes. */
1171 static const struct processor *selected_arch;
1172 static const struct processor *selected_cpu;
1173 static const struct processor *selected_tune;
1174
1175 /* The current tuning set. */
1176 struct tune_params aarch64_tune_params = generic_tunings;
1177
1178 /* Table of machine attributes. */
1179 static const struct attribute_spec aarch64_attribute_table[] =
1180 {
1181 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1182 affects_type_identity, handler, exclude } */
1183 { "aarch64_vector_pcs", 0, 0, false, true, true, false, NULL, NULL },
1184 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1185 };
1186
1187 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1188
1189 /* An ISA extension in the co-processor and main instruction set space. */
1190 struct aarch64_option_extension
1191 {
1192 const char *const name;
1193 const unsigned long flags_on;
1194 const unsigned long flags_off;
1195 };
1196
1197 typedef enum aarch64_cond_code
1198 {
1199 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1200 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1201 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1202 }
1203 aarch64_cc;
1204
1205 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
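/* The condition codes above are laid out in complementary pairs (EQ/NE,
   CS/CC, MI/PL, ...), so flipping the low bit yields the inverse condition:
   for example AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT.  */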
1206
1207 struct aarch64_branch_protect_type
1208 {
1209 /* The type's name that the user passes to the branch-protection option
1210 string. */
1211 const char* name;
1212 /* Function to handle the protection type and set global variables.
1213 First argument is the string token corresponding with this type and the
1214 second argument is the next token in the option string.
1215 Return values:
1216 * AARCH64_PARSE_OK: Handling was successful.
1217 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the caller
1218 should print an error.
1219 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints its
1220 own error. */
1221 enum aarch64_parse_opt_result (*handler)(char*, char*);
1222 /* A list of types that can follow this type in the option string. */
1223 const aarch64_branch_protect_type* subtypes;
1224 unsigned int num_subtypes;
1225 };
1226
1227 static enum aarch64_parse_opt_result
1228 aarch64_handle_no_branch_protection (char* str, char* rest)
1229 {
1230 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1231 aarch64_enable_bti = 0;
1232 if (rest)
1233 {
1234 error ("unexpected %<%s%> after %<%s%>", rest, str);
1235 return AARCH64_PARSE_INVALID_FEATURE;
1236 }
1237 return AARCH64_PARSE_OK;
1238 }
1239
1240 static enum aarch64_parse_opt_result
1241 aarch64_handle_standard_branch_protection (char* str, char* rest)
1242 {
1243 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1244 aarch64_enable_bti = 1;
1245 if (rest)
1246 {
1247 error ("unexpected %<%s%> after %<%s%>", rest, str);
1248 return AARCH64_PARSE_INVALID_FEATURE;
1249 }
1250 return AARCH64_PARSE_OK;
1251 }
1252
1253 static enum aarch64_parse_opt_result
1254 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1255 char* rest ATTRIBUTE_UNUSED)
1256 {
1257 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1258 return AARCH64_PARSE_OK;
1259 }
1260
1261 static enum aarch64_parse_opt_result
1262 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1263 char* rest ATTRIBUTE_UNUSED)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1266 return AARCH64_PARSE_OK;
1267 }
1268
1269 static enum aarch64_parse_opt_result
1270 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1271 char* rest ATTRIBUTE_UNUSED)
1272 {
1273 aarch64_enable_bti = 1;
1274 return AARCH64_PARSE_OK;
1275 }
1276
1277 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1278 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1279 { NULL, NULL, NULL, 0 }
1280 };
1281
1282 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1283 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1284 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1285 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1286 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1287 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1288 { NULL, NULL, NULL, 0 }
1289 };
1290
1291 /* The assembler names of the condition codes of the processor. */
1292 static const char * const aarch64_condition_codes[] =
1293 {
1294 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1295 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1296 };
1297
1298 /* Generate code to enable conditional branches in functions over 1 MiB. */
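/* A sketch of the emitted sequence (callers are expected to pass
   BRANCH_FORMAT with the condition already inverted, so that the short
   branch skips over the far jump):

       <inverted branch> .Ltmp
       b                 <original destination>
     .Ltmp:
*/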
1299 const char *
1300 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1301 const char * branch_format)
1302 {
1303 rtx_code_label * tmp_label = gen_label_rtx ();
1304 char label_buf[256];
1305 char buffer[128];
1306 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1307 CODE_LABEL_NUMBER (tmp_label));
1308 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1309 rtx dest_label = operands[pos_label];
1310 operands[pos_label] = tmp_label;
1311
1312 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1313 output_asm_insn (buffer, operands);
1314
1315 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1316 operands[pos_label] = dest_label;
1317 output_asm_insn (buffer, operands);
1318 return "";
1319 }
1320
1321 void
1322 aarch64_err_no_fpadvsimd (machine_mode mode)
1323 {
1324 if (TARGET_GENERAL_REGS_ONLY)
1325 if (FLOAT_MODE_P (mode))
1326 error ("%qs is incompatible with the use of floating-point types",
1327 "-mgeneral-regs-only");
1328 else
1329 error ("%qs is incompatible with the use of vector types",
1330 "-mgeneral-regs-only");
1331 else
1332 if (FLOAT_MODE_P (mode))
1333 error ("%qs feature modifier is incompatible with the use of"
1334 " floating-point types", "+nofp");
1335 else
1336 error ("%qs feature modifier is incompatible with the use of"
1337 " vector types", "+nofp");
1338 }
1339
1340 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1341 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1342 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1343 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1344 and GENERAL_REGS is lower than the memory cost (in this case the best class
1345 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1346 cost results in bad allocations with many redundant int<->FP moves which
1347 are expensive on various cores.
1348 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1349 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1350 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1351 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1352 The result of this is that it is no longer inefficient to have a higher
1353 memory move cost than the register move cost.
1354 */
1355
1356 static reg_class_t
1357 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1358 reg_class_t best_class)
1359 {
1360 machine_mode mode;
1361
1362 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1363 || !reg_class_subset_p (FP_REGS, allocno_class))
1364 return allocno_class;
1365
1366 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1367 || !reg_class_subset_p (FP_REGS, best_class))
1368 return best_class;
1369
1370 mode = PSEUDO_REGNO_MODE (regno);
1371 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1372 }
1373
1374 static unsigned int
1375 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1376 {
1377 if (GET_MODE_UNIT_SIZE (mode) == 4)
1378 return aarch64_tune_params.min_div_recip_mul_sf;
1379 return aarch64_tune_params.min_div_recip_mul_df;
1380 }
1381
1382 /* Return the reassociation width of treeop OPC with mode MODE. */
1383 static int
1384 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1385 {
1386 if (VECTOR_MODE_P (mode))
1387 return aarch64_tune_params.vec_reassoc_width;
1388 if (INTEGRAL_MODE_P (mode))
1389 return aarch64_tune_params.int_reassoc_width;
1390 /* Avoid reassociating floating point addition so we emit more FMAs. */
1391 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1392 return aarch64_tune_params.fp_reassoc_width;
1393 return 1;
1394 }
1395
1396 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1397 unsigned
1398 aarch64_dbx_register_number (unsigned regno)
1399 {
1400 if (GP_REGNUM_P (regno))
1401 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1402 else if (regno == SP_REGNUM)
1403 return AARCH64_DWARF_SP;
1404 else if (FP_REGNUM_P (regno))
1405 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1406 else if (PR_REGNUM_P (regno))
1407 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1408 else if (regno == VG_REGNUM)
1409 return AARCH64_DWARF_VG;
1410
1411 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1412 equivalent DWARF register. */
1413 return DWARF_FRAME_REGISTERS;
1414 }
1415
1416 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1417 static bool
1418 aarch64_advsimd_struct_mode_p (machine_mode mode)
1419 {
1420 return (TARGET_SIMD
1421 && (mode == OImode || mode == CImode || mode == XImode));
1422 }
1423
1424 /* Return true if MODE is an SVE predicate mode. */
1425 static bool
1426 aarch64_sve_pred_mode_p (machine_mode mode)
1427 {
1428 return (TARGET_SVE
1429 && (mode == VNx16BImode
1430 || mode == VNx8BImode
1431 || mode == VNx4BImode
1432 || mode == VNx2BImode));
1433 }
1434
1435 /* Three mutually-exclusive flags describing a vector or predicate type. */
1436 const unsigned int VEC_ADVSIMD = 1;
1437 const unsigned int VEC_SVE_DATA = 2;
1438 const unsigned int VEC_SVE_PRED = 4;
1439 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1440 a structure of 2, 3 or 4 vectors. */
1441 const unsigned int VEC_STRUCT = 8;
1442 /* Useful combinations of the above. */
1443 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1444 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
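/* For example, V4SI classifies below as VEC_ADVSIMD, VNx4SI as VEC_SVE_DATA,
   and an SVE tuple of 2, 3 or 4 vectors as VEC_SVE_DATA | VEC_STRUCT
   (assuming the corresponding target feature is enabled).  */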
1445
1446 /* Return a set of flags describing the vector properties of mode MODE.
1447 Ignore modes that are not supported by the current target. */
1448 static unsigned int
1449 aarch64_classify_vector_mode (machine_mode mode)
1450 {
1451 if (aarch64_advsimd_struct_mode_p (mode))
1452 return VEC_ADVSIMD | VEC_STRUCT;
1453
1454 if (aarch64_sve_pred_mode_p (mode))
1455 return VEC_SVE_PRED;
1456
1457 scalar_mode inner = GET_MODE_INNER (mode);
1458 if (VECTOR_MODE_P (mode)
1459 && (inner == QImode
1460 || inner == HImode
1461 || inner == HFmode
1462 || inner == SImode
1463 || inner == SFmode
1464 || inner == DImode
1465 || inner == DFmode))
1466 {
1467 if (TARGET_SVE)
1468 {
1469 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1470 return VEC_SVE_DATA;
1471 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1472 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1473 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1474 return VEC_SVE_DATA | VEC_STRUCT;
1475 }
1476
1477 /* This includes V1DF but not V1DI (which doesn't exist). */
1478 if (TARGET_SIMD
1479 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1480 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1481 return VEC_ADVSIMD;
1482 }
1483
1484 return 0;
1485 }
1486
1487 /* Return true if MODE is any of the data vector modes, including
1488 structure modes. */
1489 static bool
1490 aarch64_vector_data_mode_p (machine_mode mode)
1491 {
1492 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1493 }
1494
1495 /* Return true if MODE is an SVE data vector mode; either a single vector
1496 or a structure of vectors. */
1497 static bool
1498 aarch64_sve_data_mode_p (machine_mode mode)
1499 {
1500 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1501 }
1502
1503 /* Implement target hook TARGET_ARRAY_MODE. */
1504 static opt_machine_mode
1505 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1506 {
1507 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1508 && IN_RANGE (nelems, 2, 4))
1509 return mode_for_vector (GET_MODE_INNER (mode),
1510 GET_MODE_NUNITS (mode) * nelems);
1511
1512 return opt_machine_mode ();
1513 }
1514
1515 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1516 static bool
1517 aarch64_array_mode_supported_p (machine_mode mode,
1518 unsigned HOST_WIDE_INT nelems)
1519 {
1520 if (TARGET_SIMD
1521 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1522 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1523 && (nelems >= 2 && nelems <= 4))
1524 return true;
1525
1526 return false;
1527 }
1528
1529 /* Return the SVE predicate mode to use for elements that have
1530 ELEM_NBYTES bytes, if such a mode exists. */
1531
1532 opt_machine_mode
1533 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1534 {
1535 if (TARGET_SVE)
1536 {
1537 if (elem_nbytes == 1)
1538 return VNx16BImode;
1539 if (elem_nbytes == 2)
1540 return VNx8BImode;
1541 if (elem_nbytes == 4)
1542 return VNx4BImode;
1543 if (elem_nbytes == 8)
1544 return VNx2BImode;
1545 }
1546 return opt_machine_mode ();
1547 }
1548
1549 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1550
1551 static opt_machine_mode
1552 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1553 {
1554 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1555 {
1556 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1557 machine_mode pred_mode;
1558 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1559 return pred_mode;
1560 }
1561
1562 return default_get_mask_mode (nunits, nbytes);
1563 }
1564
1565 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1566 prefer to use the first arithmetic operand as the else value if
1567 the else value doesn't matter, since that exactly matches the SVE
1568 destructive merging form. For ternary operations we could either
1569 pick the first operand and use FMAD-like instructions or the last
1570 operand and use FMLA-like instructions; the latter seems more
1571 natural. */
1572
1573 static tree
1574 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1575 {
1576 return nops == 3 ? ops[2] : ops[0];
1577 }
1578
1579 /* Implement TARGET_HARD_REGNO_NREGS. */
1580
1581 static unsigned int
1582 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1583 {
1584 /* ??? Logically we should only need to provide a value when
1585 HARD_REGNO_MODE_OK says that the combination is valid,
1586 but at the moment we need to handle all modes. Just ignore
1587 any runtime parts for registers that can't store them. */
1588 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1589 switch (aarch64_regno_regclass (regno))
1590 {
1591 case FP_REGS:
1592 case FP_LO_REGS:
1593 if (aarch64_sve_data_mode_p (mode))
1594 return exact_div (GET_MODE_SIZE (mode),
1595 BYTES_PER_SVE_VECTOR).to_constant ();
1596 return CEIL (lowest_size, UNITS_PER_VREG);
1597 case PR_REGS:
1598 case PR_LO_REGS:
1599 case PR_HI_REGS:
1600 return 1;
1601 default:
1602 return CEIL (lowest_size, UNITS_PER_WORD);
1603 }
1604 gcc_unreachable ();
1605 }
1606
1607 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1608
1609 static bool
1610 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1611 {
1612 if (GET_MODE_CLASS (mode) == MODE_CC)
1613 return regno == CC_REGNUM;
1614
1615 if (regno == VG_REGNUM)
1616 /* This must have the same size as _Unwind_Word. */
1617 return mode == DImode;
1618
1619 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1620 if (vec_flags & VEC_SVE_PRED)
1621 return PR_REGNUM_P (regno);
1622
1623 if (PR_REGNUM_P (regno))
1624 return 0;
1625
1626 if (regno == SP_REGNUM)
1627 /* The purpose of comparing with ptr_mode is to support the
1628 global register variable associated with the stack pointer
1629 register via the syntax of asm ("wsp") in ILP32. */
1630 return mode == Pmode || mode == ptr_mode;
1631
1632 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1633 return mode == Pmode;
1634
1635 if (GP_REGNUM_P (regno))
1636 {
1637 if (known_le (GET_MODE_SIZE (mode), 8))
1638 return true;
1639 else if (known_le (GET_MODE_SIZE (mode), 16))
1640 return (regno & 1) == 0;
1641 }
1642 else if (FP_REGNUM_P (regno))
1643 {
1644 if (vec_flags & VEC_STRUCT)
1645 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1646 else
1647 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1648 }
1649
1650 return false;
1651 }
1652
1653 /* Return true if this is a definition of a vectorized simd function. */
1654
1655 static bool
1656 aarch64_simd_decl_p (tree fndecl)
1657 {
1658 tree fntype;
1659
1660 if (fndecl == NULL)
1661 return false;
1662 fntype = TREE_TYPE (fndecl);
1663 if (fntype == NULL)
1664 return false;
1665
1666 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1667 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1668 return true;
1669
1670 return false;
1671 }
1672
1673 /* Return the mode a register save/restore should use. DImode for integer
1674 registers, DFmode for FP registers in non-SIMD functions (they only save
1675 the bottom half of a 128 bit register), or TFmode for FP registers in
1676 SIMD functions. */
1677
1678 static machine_mode
1679 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1680 {
1681 return GP_REGNUM_P (regno)
1682 ? E_DImode
1683 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1684 }
1685
1686 /* Return true if the instruction is a call to a SIMD function, false
1687 if it is not a SIMD function or if we do not know anything about
1688 the function. */
1689
1690 static bool
1691 aarch64_simd_call_p (rtx_insn *insn)
1692 {
1693 rtx symbol;
1694 rtx call;
1695 tree fndecl;
1696
1697 gcc_assert (CALL_P (insn));
1698 call = get_call_rtx_from (insn);
1699 symbol = XEXP (XEXP (call, 0), 0);
1700 if (GET_CODE (symbol) != SYMBOL_REF)
1701 return false;
1702 fndecl = SYMBOL_REF_DECL (symbol);
1703 if (!fndecl)
1704 return false;
1705
1706 return aarch64_simd_decl_p (fndecl);
1707 }
1708
1709 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1710 a function that uses the SIMD ABI, take advantage of the extra
1711 call-preserved registers that the ABI provides. */
1712
1713 void
1714 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1715 HARD_REG_SET *return_set)
1716 {
1717 if (aarch64_simd_call_p (insn))
1718 {
1719 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1720 if (FP_SIMD_SAVED_REGNUM_P (regno))
1721 CLEAR_HARD_REG_BIT (*return_set, regno);
1722 }
1723 }
1724
1725 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1726 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1727 clobbers the top 64 bits when restoring the bottom 64 bits. */
1728
1729 static bool
1730 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1731 machine_mode mode)
1732 {
1733 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1734 return FP_REGNUM_P (regno)
1735 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1736 }
1737
1738 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1739
1740 rtx_insn *
1741 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1742 {
1743 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1744
1745 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1746 return call_1;
1747 else
1748 return call_2;
1749 }
1750
1751 /* Implement REGMODE_NATURAL_SIZE. */
1752 poly_uint64
1753 aarch64_regmode_natural_size (machine_mode mode)
1754 {
1755 /* The natural size for SVE data modes is one SVE data vector,
1756 and similarly for predicates. We can't independently modify
1757 anything smaller than that. */
1758 /* ??? For now, only do this for variable-width SVE registers.
1759 Doing it for constant-sized registers breaks lower-subreg.c. */
1760 /* ??? And once that's fixed, we should probably have similar
1761 code for Advanced SIMD. */
1762 if (!aarch64_sve_vg.is_constant ())
1763 {
1764 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1765 if (vec_flags & VEC_SVE_PRED)
1766 return BYTES_PER_SVE_PRED;
1767 if (vec_flags & VEC_SVE_DATA)
1768 return BYTES_PER_SVE_VECTOR;
1769 }
1770 return UNITS_PER_WORD;
1771 }
1772
1773 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1774 machine_mode
1775 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1776 machine_mode mode)
1777 {
1778 /* The predicate mode determines which bits are significant and
1779 which are "don't care". Decreasing the number of lanes would
1780 lose data while increasing the number of lanes would make bits
1781 unnecessarily significant. */
1782 if (PR_REGNUM_P (regno))
1783 return mode;
1784 if (known_ge (GET_MODE_SIZE (mode), 4))
1785 return mode;
1786 else
1787 return SImode;
1788 }
1789
1790 /* Return true if I's bits are consecutive ones from the MSB. */
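/* For example, 0xffffffffffff0000 negates to 0x10000 (a power of two) and so
   satisfies the test below, whereas 0 and 0xff00ff00 do not.  */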
1791 bool
1792 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1793 {
1794 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1795 }
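
/* Illustrative examples (not part of the original sources) for the
   predicate above:
     i = 0xfffffffffffff000  ->  -i = 0x1000, exact_log2 = 12,  true
     i = 0xffffffffffffffff  ->  -i = 0x1,    exact_log2 = 0,   true
     i = 0x00000000000000f0  ->  -i is not a power of two,      false.  */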
1796
1797 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1798 that strcpy from constants will be faster. */
1799
1800 static HOST_WIDE_INT
1801 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1802 {
1803 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1804 return MAX (align, BITS_PER_WORD);
1805 return align;
1806 }
1807
1808 /* Return true if calls to DECL should be treated as
1809 long-calls (i.e. called via a register). */
1810 static bool
1811 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1812 {
1813 return false;
1814 }
1815
1816 /* Return true if calls to symbol-ref SYM should be treated as
1817 long-calls (i.e. called via a register). */
1818 bool
1819 aarch64_is_long_call_p (rtx sym)
1820 {
1821 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1822 }
1823
1824 /* Return true if calls to symbol-ref SYM should not go through
1825 plt stubs. */
1826
1827 bool
1828 aarch64_is_noplt_call_p (rtx sym)
1829 {
1830 const_tree decl = SYMBOL_REF_DECL (sym);
1831
1832 if (flag_pic
1833 && decl
1834 && (!flag_plt
1835 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1836 && !targetm.binds_local_p (decl))
1837 return true;
1838
1839 return false;
1840 }
1841
1842 /* Return true if the offsets to a zero/sign-extract operation
1843 represent an expression that matches an extend operation. The
1844 operands represent the parameters from
1845
1846 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1847 bool
1848 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1849 rtx extract_imm)
1850 {
1851 HOST_WIDE_INT mult_val, extract_val;
1852
1853 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1854 return false;
1855
1856 mult_val = INTVAL (mult_imm);
1857 extract_val = INTVAL (extract_imm);
1858
1859 if (extract_val > 8
1860 && extract_val < GET_MODE_BITSIZE (mode)
1861 && exact_log2 (extract_val & ~7) > 0
1862 && (extract_val & 7) <= 4
1863 && mult_val == (1 << (extract_val & 7)))
1864 return true;
1865
1866 return false;
1867 }
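
/* Illustrative example (not part of the original sources): for DImode,
   MULT_IMM = 4 and EXTRACT_IMM = 34 satisfy the checks above
   (34 > 8, 34 < 64, 34 & ~7 == 32 is a power of two, 34 & 7 == 2 and
   4 == 1 << 2); extracting the low 34 bits of (reg * 4) behaves like a
   32-bit extend of the register followed by a left shift of 2.  */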
1868
1869 /* Emit an insn that's a simple single-set. Both the operands must be
1870 known to be valid. */
1871 inline static rtx_insn *
1872 emit_set_insn (rtx x, rtx y)
1873 {
1874 return emit_insn (gen_rtx_SET (x, y));
1875 }
1876
1877 /* X and Y are two things to compare using CODE. Emit the compare insn and
1878 return the rtx for register 0 in the proper mode. */
1879 rtx
1880 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1881 {
1882 machine_mode mode = SELECT_CC_MODE (code, x, y);
1883 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1884
1885 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1886 return cc_reg;
1887 }
1888
1889 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1890
1891 static rtx
1892 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1893 machine_mode y_mode)
1894 {
1895 if (y_mode == E_QImode || y_mode == E_HImode)
1896 {
1897 if (CONST_INT_P (y))
1898 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1899 else
1900 {
1901 rtx t, cc_reg;
1902 machine_mode cc_mode;
1903
1904 t = gen_rtx_ZERO_EXTEND (SImode, y);
1905 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1906 cc_mode = CC_SWPmode;
1907 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1908 emit_set_insn (cc_reg, t);
1909 return cc_reg;
1910 }
1911 }
1912
1913 return aarch64_gen_compare_reg (code, x, y);
1914 }
1915
1916 /* Build the SYMBOL_REF for __tls_get_addr. */
1917
1918 static GTY(()) rtx tls_get_addr_libfunc;
1919
1920 rtx
1921 aarch64_tls_get_addr (void)
1922 {
1923 if (!tls_get_addr_libfunc)
1924 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1925 return tls_get_addr_libfunc;
1926 }
1927
1928 /* Return the TLS model to use for ADDR. */
1929
1930 static enum tls_model
1931 tls_symbolic_operand_type (rtx addr)
1932 {
1933 enum tls_model tls_kind = TLS_MODEL_NONE;
1934 if (GET_CODE (addr) == CONST)
1935 {
1936 poly_int64 addend;
1937 rtx sym = strip_offset (addr, &addend);
1938 if (GET_CODE (sym) == SYMBOL_REF)
1939 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1940 }
1941 else if (GET_CODE (addr) == SYMBOL_REF)
1942 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1943
1944 return tls_kind;
1945 }
1946
1947 /* We allow LO_SUMs in our legitimate addresses so that combine
1948 can take care of combining addresses where necessary, but for
1949 generation purposes, we generate the address
1950 as:
1951          RTL                               Absolute
1952      tmp = hi (symbol_ref);            adrp  x1, foo
1953      dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1954                                        nop
1955
1956          PIC                               TLS
1957      adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1958      ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1959                                        bl   __tls_get_addr
1960                                        nop
1961
1962 Load TLS symbol, depending on TLS mechanism and TLS access model.
1963
1964 Global Dynamic - Traditional TLS:
1965 adrp tmp, :tlsgd:imm
1966 add dest, tmp, #:tlsgd_lo12:imm
1967 bl __tls_get_addr
1968
1969 Global Dynamic - TLS Descriptors:
1970 adrp dest, :tlsdesc:imm
1971 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1972 add dest, dest, #:tlsdesc_lo12:imm
1973 blr tmp
1974 mrs tp, tpidr_el0
1975 add dest, dest, tp
1976
1977 Initial Exec:
1978 mrs tp, tpidr_el0
1979 adrp tmp, :gottprel:imm
1980 ldr dest, [tmp, #:gottprel_lo12:imm]
1981 add dest, dest, tp
1982
1983 Local Exec:
1984 mrs tp, tpidr_el0
1985 add t0, tp, #:tprel_hi12:imm, lsl #12
1986 add t0, t0, #:tprel_lo12_nc:imm
1987 */
1988
1989 static void
1990 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1991 enum aarch64_symbol_type type)
1992 {
1993 switch (type)
1994 {
1995 case SYMBOL_SMALL_ABSOLUTE:
1996 {
1997 /* In ILP32, the mode of dest can be either SImode or DImode. */
1998 rtx tmp_reg = dest;
1999 machine_mode mode = GET_MODE (dest);
2000
2001 gcc_assert (mode == Pmode || mode == ptr_mode);
2002
2003 if (can_create_pseudo_p ())
2004 tmp_reg = gen_reg_rtx (mode);
2005
2006 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2007 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2008 return;
2009 }
2010
2011 case SYMBOL_TINY_ABSOLUTE:
2012 emit_insn (gen_rtx_SET (dest, imm));
2013 return;
2014
2015 case SYMBOL_SMALL_GOT_28K:
2016 {
2017 machine_mode mode = GET_MODE (dest);
2018 rtx gp_rtx = pic_offset_table_rtx;
2019 rtx insn;
2020 rtx mem;
2021
2022 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2023 here before RTL expansion: tree IVOPTs will generate an RTL pattern to
2024 decide rtx costs, in which case pic_offset_table_rtx is not
2025 initialized. In that case there is no need to generate the first adrp
2026 instruction, as the final cost for global variable access is
2027 one instruction. */
2028 if (gp_rtx != NULL)
2029 {
2030 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2031 use the page base as the GOT base, the first page may be wasted; in
2032 the worst case there is only 28K of space for the GOT).
2033
2034 The generated instruction sequence for accessing a global variable
2035 is:
2036
2037 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2038
2039 Only one instruction is needed, but we must initialize
2040 pic_offset_table_rtx properly. We generate an initialization insn
2041 for every global access and rely on CSE to remove the redundant ones.
2042
2043 The final instruction sequence will look like the following
2044 for multiple global variable accesses.
2045
2046 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2047
2048 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2049 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2050 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2051 ... */
2052
2053 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2054 crtl->uses_pic_offset_table = 1;
2055 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2056
2057 if (mode != GET_MODE (gp_rtx))
2058 gp_rtx = gen_lowpart (mode, gp_rtx);
2059
2060 }
2061
2062 if (mode == ptr_mode)
2063 {
2064 if (mode == DImode)
2065 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2066 else
2067 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2068
2069 mem = XVECEXP (SET_SRC (insn), 0, 0);
2070 }
2071 else
2072 {
2073 gcc_assert (mode == Pmode);
2074
2075 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2076 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2077 }
2078
2079 /* The operand is expected to be a MEM. Whenever the related insn
2080 pattern changes, the code above that calculates MEM should be
2081 updated. */
2082 gcc_assert (GET_CODE (mem) == MEM);
2083 MEM_READONLY_P (mem) = 1;
2084 MEM_NOTRAP_P (mem) = 1;
2085 emit_insn (insn);
2086 return;
2087 }
2088
2089 case SYMBOL_SMALL_GOT_4G:
2090 {
2091 /* In ILP32, the mode of dest can be either SImode or DImode,
2092 while the got entry is always of SImode size. The mode of
2093 dest depends on how dest is used: if dest is assigned to a
2094 pointer (e.g. stored in memory), it has SImode; it may have
2095 DImode if dest is dereferenced to access the memory.
2096 This is why we have to handle three different ldr_got_small
2097 patterns here (two patterns for ILP32). */
2098
2099 rtx insn;
2100 rtx mem;
2101 rtx tmp_reg = dest;
2102 machine_mode mode = GET_MODE (dest);
2103
2104 if (can_create_pseudo_p ())
2105 tmp_reg = gen_reg_rtx (mode);
2106
2107 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2108 if (mode == ptr_mode)
2109 {
2110 if (mode == DImode)
2111 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2112 else
2113 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2114
2115 mem = XVECEXP (SET_SRC (insn), 0, 0);
2116 }
2117 else
2118 {
2119 gcc_assert (mode == Pmode);
2120
2121 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2122 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2123 }
2124
2125 gcc_assert (GET_CODE (mem) == MEM);
2126 MEM_READONLY_P (mem) = 1;
2127 MEM_NOTRAP_P (mem) = 1;
2128 emit_insn (insn);
2129 return;
2130 }
2131
2132 case SYMBOL_SMALL_TLSGD:
2133 {
2134 rtx_insn *insns;
2135 machine_mode mode = GET_MODE (dest);
2136 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2137
2138 start_sequence ();
2139 if (TARGET_ILP32)
2140 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2141 else
2142 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2143 insns = get_insns ();
2144 end_sequence ();
2145
2146 RTL_CONST_CALL_P (insns) = 1;
2147 emit_libcall_block (insns, dest, result, imm);
2148 return;
2149 }
2150
2151 case SYMBOL_SMALL_TLSDESC:
2152 {
2153 machine_mode mode = GET_MODE (dest);
2154 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2155 rtx tp;
2156
2157 gcc_assert (mode == Pmode || mode == ptr_mode);
2158
2159 /* In ILP32, the got entry is always of SImode size. Unlike
2160 small GOT, the dest is fixed at reg 0. */
2161 if (TARGET_ILP32)
2162 emit_insn (gen_tlsdesc_small_si (imm));
2163 else
2164 emit_insn (gen_tlsdesc_small_di (imm));
2165 tp = aarch64_load_tp (NULL);
2166
2167 if (mode != Pmode)
2168 tp = gen_lowpart (mode, tp);
2169
2170 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2171 if (REG_P (dest))
2172 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2173 return;
2174 }
2175
2176 case SYMBOL_SMALL_TLSIE:
2177 {
2178 /* In ILP32, the mode of dest can be either SImode or DImode,
2179 while the got entry is always of SImode size. The mode of
2180 dest depends on how dest is used: if dest is assigned to a
2181 pointer (e.g. stored in memory), it has SImode; it may have
2182 DImode if dest is dereferenced to access the memory.
2183 This is why we have to handle three different tlsie_small
2184 patterns here (two patterns for ILP32). */
2185 machine_mode mode = GET_MODE (dest);
2186 rtx tmp_reg = gen_reg_rtx (mode);
2187 rtx tp = aarch64_load_tp (NULL);
2188
2189 if (mode == ptr_mode)
2190 {
2191 if (mode == DImode)
2192 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2193 else
2194 {
2195 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2196 tp = gen_lowpart (mode, tp);
2197 }
2198 }
2199 else
2200 {
2201 gcc_assert (mode == Pmode);
2202 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2203 }
2204
2205 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2206 if (REG_P (dest))
2207 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2208 return;
2209 }
2210
2211 case SYMBOL_TLSLE12:
2212 case SYMBOL_TLSLE24:
2213 case SYMBOL_TLSLE32:
2214 case SYMBOL_TLSLE48:
2215 {
2216 machine_mode mode = GET_MODE (dest);
2217 rtx tp = aarch64_load_tp (NULL);
2218
2219 if (mode != Pmode)
2220 tp = gen_lowpart (mode, tp);
2221
2222 switch (type)
2223 {
2224 case SYMBOL_TLSLE12:
2225 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2226 (dest, tp, imm));
2227 break;
2228 case SYMBOL_TLSLE24:
2229 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2230 (dest, tp, imm));
2231 break;
2232 case SYMBOL_TLSLE32:
2233 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2234 (dest, imm));
2235 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2236 (dest, dest, tp));
2237 break;
2238 case SYMBOL_TLSLE48:
2239 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2240 (dest, imm));
2241 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2242 (dest, dest, tp));
2243 break;
2244 default:
2245 gcc_unreachable ();
2246 }
2247
2248 if (REG_P (dest))
2249 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2250 return;
2251 }
2252
2253 case SYMBOL_TINY_GOT:
2254 emit_insn (gen_ldr_got_tiny (dest, imm));
2255 return;
2256
2257 case SYMBOL_TINY_TLSIE:
2258 {
2259 machine_mode mode = GET_MODE (dest);
2260 rtx tp = aarch64_load_tp (NULL);
2261
2262 if (mode == ptr_mode)
2263 {
2264 if (mode == DImode)
2265 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2266 else
2267 {
2268 tp = gen_lowpart (mode, tp);
2269 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2270 }
2271 }
2272 else
2273 {
2274 gcc_assert (mode == Pmode);
2275 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2276 }
2277
2278 if (REG_P (dest))
2279 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2280 return;
2281 }
2282
2283 default:
2284 gcc_unreachable ();
2285 }
2286 }
2287
2288 /* Emit a move from SRC to DEST. Assume that the move expanders can
2289 handle all moves if !can_create_pseudo_p (). The distinction is
2290 important because, unlike emit_move_insn, the move expanders know
2291 how to force Pmode objects into the constant pool even when the
2292 constant pool address is not itself legitimate. */
2293 static rtx
2294 aarch64_emit_move (rtx dest, rtx src)
2295 {
2296 return (can_create_pseudo_p ()
2297 ? emit_move_insn (dest, src)
2298 : emit_move_insn_1 (dest, src));
2299 }
2300
2301 /* Apply UNOPTAB to OP and store the result in DEST. */
2302
2303 static void
2304 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2305 {
2306 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2307 if (dest != tmp)
2308 emit_move_insn (dest, tmp);
2309 }
2310
2311 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2312
2313 static void
2314 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2315 {
2316 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2317 OPTAB_DIRECT);
2318 if (dest != tmp)
2319 emit_move_insn (dest, tmp);
2320 }
2321
2322 /* Split a 128-bit move operation into two 64-bit move operations,
2323 taking care to handle partial overlap of register to register
2324 copies. Special cases are needed when moving between GP regs and
2325 FP regs. SRC can be a register, constant or memory; DST a register
2326 or memory. If either operand is memory it must not have any side
2327 effects. */
2328 void
2329 aarch64_split_128bit_move (rtx dst, rtx src)
2330 {
2331 rtx dst_lo, dst_hi;
2332 rtx src_lo, src_hi;
2333
2334 machine_mode mode = GET_MODE (dst);
2335
2336 gcc_assert (mode == TImode || mode == TFmode);
2337 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2338 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2339
2340 if (REG_P (dst) && REG_P (src))
2341 {
2342 int src_regno = REGNO (src);
2343 int dst_regno = REGNO (dst);
2344
2345 /* Handle FP <-> GP regs. */
2346 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2347 {
2348 src_lo = gen_lowpart (word_mode, src);
2349 src_hi = gen_highpart (word_mode, src);
2350
2351 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2352 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2353 return;
2354 }
2355 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2356 {
2357 dst_lo = gen_lowpart (word_mode, dst);
2358 dst_hi = gen_highpart (word_mode, dst);
2359
2360 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2361 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2362 return;
2363 }
2364 }
2365
2366 dst_lo = gen_lowpart (word_mode, dst);
2367 dst_hi = gen_highpart (word_mode, dst);
2368 src_lo = gen_lowpart (word_mode, src);
2369 src_hi = gen_highpart_mode (word_mode, mode, src);
2370
2371 /* At most one pairing may overlap. */
2372 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2373 {
2374 aarch64_emit_move (dst_hi, src_hi);
2375 aarch64_emit_move (dst_lo, src_lo);
2376 }
2377 else
2378 {
2379 aarch64_emit_move (dst_lo, src_lo);
2380 aarch64_emit_move (dst_hi, src_hi);
2381 }
2382 }
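
/* Illustrative example (not part of the original sources): if the low
   half of DST is the same GP register as the high half of SRC (say a
   TImode copy from {x0,x1} to {x1,x2}), the code above moves the high
   halves first (x2 = x1) so that x1 is not overwritten before it is
   read; with no overlap the low halves are moved first.  */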
2383
2384 bool
2385 aarch64_split_128bit_move_p (rtx dst, rtx src)
2386 {
2387 return (! REG_P (src)
2388 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2389 }
2390
2391 /* Split a complex SIMD combine. */
2392
2393 void
2394 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2395 {
2396 machine_mode src_mode = GET_MODE (src1);
2397 machine_mode dst_mode = GET_MODE (dst);
2398
2399 gcc_assert (VECTOR_MODE_P (dst_mode));
2400 gcc_assert (register_operand (dst, dst_mode)
2401 && register_operand (src1, src_mode)
2402 && register_operand (src2, src_mode));
2403
2404 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2405 return;
2406 }
2407
2408 /* Split a complex SIMD move. */
2409
2410 void
2411 aarch64_split_simd_move (rtx dst, rtx src)
2412 {
2413 machine_mode src_mode = GET_MODE (src);
2414 machine_mode dst_mode = GET_MODE (dst);
2415
2416 gcc_assert (VECTOR_MODE_P (dst_mode));
2417
2418 if (REG_P (dst) && REG_P (src))
2419 {
2420 gcc_assert (VECTOR_MODE_P (src_mode));
2421 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2422 }
2423 }
2424
2425 bool
2426 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2427 machine_mode ymode, rtx y)
2428 {
2429 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2430 gcc_assert (r != NULL);
2431 return rtx_equal_p (x, r);
2432 }
2433
2434
2435 static rtx
2436 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2437 {
2438 if (can_create_pseudo_p ())
2439 return force_reg (mode, value);
2440 else
2441 {
2442 gcc_assert (x);
2443 aarch64_emit_move (x, value);
2444 return x;
2445 }
2446 }
2447
2448 /* Return true if we can move VALUE into a register using a single
2449 CNT[BHWD] instruction. */
2450
2451 static bool
2452 aarch64_sve_cnt_immediate_p (poly_int64 value)
2453 {
2454 HOST_WIDE_INT factor = value.coeffs[0];
2455 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2456 return (value.coeffs[1] == factor
2457 && IN_RANGE (factor, 2, 16 * 16)
2458 && (factor & 1) == 0
2459 && factor <= 16 * (factor & -factor));
2460 }
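
/* Illustrative examples (not part of the original sources) for the
   predicate above: poly_int64 (2, 2) and poly_int64 (32, 32) are
   accepted, poly_int64 (3, 3) is rejected (odd factor), and
   poly_int64 (34, 34) is rejected because 34 exceeds 16 times its
   lowest set bit (2).  */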
2461
2462 /* Likewise for rtx X. */
2463
2464 bool
2465 aarch64_sve_cnt_immediate_p (rtx x)
2466 {
2467 poly_int64 value;
2468 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2469 }
2470
2471 /* Return the asm string for an instruction with a CNT-like vector size
2472 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2473 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2474 first part of the operands template (the part that comes before the
2475 vector size itself). FACTOR is the number of quadwords.
2476 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2477 If it is zero, we can use any element size. */
2478
2479 static char *
2480 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2481 unsigned int factor,
2482 unsigned int nelts_per_vq)
2483 {
2484 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2485
2486 if (nelts_per_vq == 0)
2487 /* There is some overlap in the ranges of the four CNT instructions.
2488 Here we always use the smallest possible element size, so that the
2489 multiplier is 1 wherever possible. */
2490 nelts_per_vq = factor & -factor;
2491 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2492 gcc_assert (IN_RANGE (shift, 1, 4));
2493 char suffix = "dwhb"[shift - 1];
2494
2495 factor >>= shift;
2496 unsigned int written;
2497 if (factor == 1)
2498 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2499 prefix, suffix, operands);
2500 else
2501 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2502 prefix, suffix, operands, factor);
2503 gcc_assert (written < sizeof (buffer));
2504 return buffer;
2505 }
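
/* Illustrative examples (not part of the original sources) of the
   strings built by the function above, using "cnt" as PREFIX and
   "%x0" as OPERANDS:
     FACTOR 16, NELTS_PER_VQ 0  ->  "cntb\t%x0"
     FACTOR 48, NELTS_PER_VQ 0  ->  "cntb\t%x0, all, mul #3"
     FACTOR 8,  NELTS_PER_VQ 2  ->  "cntd\t%x0, all, mul #4".  */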
2506
2507 /* Return the asm string for an instruction with a CNT-like vector size
2508 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2509 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2510 first part of the operands template (the part that comes before the
2511 vector size itself). X is the value of the vector size operand,
2512 as a polynomial integer rtx. */
2513
2514 char *
2515 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2516 rtx x)
2517 {
2518 poly_int64 value = rtx_to_poly_int64 (x);
2519 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2520 return aarch64_output_sve_cnt_immediate (prefix, operands,
2521 value.coeffs[1], 0);
2522 }
2523
2524 /* Return true if we can add VALUE to a register using a single ADDVL
2525 or ADDPL instruction. */
2526
2527 static bool
2528 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2529 {
2530 HOST_WIDE_INT factor = value.coeffs[0];
2531 if (factor == 0 || value.coeffs[1] != factor)
2532 return false;
2533 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2534 and a value of 16 is one vector width. */
2535 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2536 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2537 }
2538
2539 /* Likewise for rtx X. */
2540
2541 bool
2542 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2543 {
2544 poly_int64 value;
2545 return (poly_int_rtx_p (x, &value)
2546 && aarch64_sve_addvl_addpl_immediate_p (value));
2547 }
2548
2549 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2550 and storing the result in operand 0. */
2551
2552 char *
2553 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2554 {
2555 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2556 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2557 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2558
2559 /* Use INC or DEC if possible. */
2560 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2561 {
2562 if (aarch64_sve_cnt_immediate_p (offset_value))
2563 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2564 offset_value.coeffs[1], 0);
2565 if (aarch64_sve_cnt_immediate_p (-offset_value))
2566 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2567 -offset_value.coeffs[1], 0);
2568 }
2569
2570 int factor = offset_value.coeffs[1];
2571 if ((factor & 15) == 0)
2572 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2573 else
2574 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2575 return buffer;
2576 }
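
/* Illustrative examples (not part of the original sources) for the
   function above, where poly_int64 (n, n) represents an offset of
   n bytes for every 128-bit block of SVE vector length (see the
   comment in aarch64_add_offset below):
     offset poly_int64 (16, 16)                       ->  "addvl\t%x0, %x1, #1"
     offset poly_int64 (6, 6)                         ->  "addpl\t%x0, %x1, #3"
     offset poly_int64 (2, 2), DEST == BASE, GP reg   ->  "incd\t%x0".  */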
2577
2578 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2579 instruction. If it is, store the number of elements in each vector
2580 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2581 factor in *FACTOR_OUT (if nonnull). */
2582
2583 bool
2584 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2585 unsigned int *nelts_per_vq_out)
2586 {
2587 rtx elt;
2588 poly_int64 value;
2589
2590 if (!const_vec_duplicate_p (x, &elt)
2591 || !poly_int_rtx_p (elt, &value))
2592 return false;
2593
2594 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2595 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2596 /* There's no vector INCB. */
2597 return false;
2598
2599 HOST_WIDE_INT factor = value.coeffs[0];
2600 if (value.coeffs[1] != factor)
2601 return false;
2602
2603 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2604 if ((factor % nelts_per_vq) != 0
2605 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2606 return false;
2607
2608 if (factor_out)
2609 *factor_out = factor;
2610 if (nelts_per_vq_out)
2611 *nelts_per_vq_out = nelts_per_vq;
2612 return true;
2613 }
2614
2615 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2616 instruction. */
2617
2618 bool
2619 aarch64_sve_inc_dec_immediate_p (rtx x)
2620 {
2621 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2622 }
2623
2624 /* Return the asm template for an SVE vector INC or DEC instruction.
2625 OPERANDS gives the operands before the vector count and X is the
2626 value of the vector count operand itself. */
2627
2628 char *
2629 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2630 {
2631 int factor;
2632 unsigned int nelts_per_vq;
2633 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2634 gcc_unreachable ();
2635 if (factor < 0)
2636 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2637 nelts_per_vq);
2638 else
2639 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2640 nelts_per_vq);
2641 }
2642
2643 static int
2644 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2645 scalar_int_mode mode)
2646 {
2647 int i;
2648 unsigned HOST_WIDE_INT val, val2, mask;
2649 int one_match, zero_match;
2650 int num_insns;
2651
2652 val = INTVAL (imm);
2653
2654 if (aarch64_move_imm (val, mode))
2655 {
2656 if (generate)
2657 emit_insn (gen_rtx_SET (dest, imm));
2658 return 1;
2659 }
2660
2661 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2662 (with XXXX non-zero). In that case check to see if the move can be done in
2663 a smaller mode. */
2664 val2 = val & 0xffffffff;
2665 if (mode == DImode
2666 && aarch64_move_imm (val2, SImode)
2667 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2668 {
2669 if (generate)
2670 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2671
2672 /* Check if we have to emit a second instruction by checking to see
2673 if any of the upper 32 bits of the original DI mode value is set. */
2674 if (val == val2)
2675 return 1;
2676
2677 i = (val >> 48) ? 48 : 32;
2678
2679 if (generate)
2680 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2681 GEN_INT ((val >> i) & 0xffff)));
2682
2683 return 2;
2684 }
2685
2686 if ((val >> 32) == 0 || mode == SImode)
2687 {
2688 if (generate)
2689 {
2690 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2691 if (mode == SImode)
2692 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2693 GEN_INT ((val >> 16) & 0xffff)));
2694 else
2695 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2696 GEN_INT ((val >> 16) & 0xffff)));
2697 }
2698 return 2;
2699 }
2700
2701 /* Remaining cases are all for DImode. */
2702
2703 mask = 0xffff;
2704 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2705 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2706 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2707 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2708
2709 if (zero_match != 2 && one_match != 2)
2710 {
2711 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2712 For a 64-bit bitmask try whether changing 16 bits to all ones or
2713 zeroes creates a valid bitmask. To check any repeated bitmask,
2714 try using 16 bits from the other 32-bit half of val. */
2715
2716 for (i = 0; i < 64; i += 16, mask <<= 16)
2717 {
2718 val2 = val & ~mask;
2719 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2720 break;
2721 val2 = val | mask;
2722 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2723 break;
2724 val2 = val2 & ~mask;
2725 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2726 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2727 break;
2728 }
2729 if (i != 64)
2730 {
2731 if (generate)
2732 {
2733 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2734 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2735 GEN_INT ((val >> i) & 0xffff)));
2736 }
2737 return 2;
2738 }
2739 }
2740
2741 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2742 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2743 otherwise skip zero bits. */
2744
2745 num_insns = 1;
2746 mask = 0xffff;
2747 val2 = one_match > zero_match ? ~val : val;
2748 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2749
2750 if (generate)
2751 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2752 ? (val | ~(mask << i))
2753 : (val & (mask << i)))));
2754 for (i += 16; i < 64; i += 16)
2755 {
2756 if ((val2 & (mask << i)) == 0)
2757 continue;
2758 if (generate)
2759 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2760 GEN_INT ((val >> i) & 0xffff)));
2761 num_insns ++;
2762 }
2763
2764 return num_insns;
2765 }
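
/* Illustrative example (not part of the original sources): for the
   DImode constant 0x0000cafe00001234 the function above returns 2 and,
   assuming DEST is register x0, emits roughly

	mov	x0, #0x1234
	movk	x0, #0xcafe, lsl #32

   since the low 32 bits form a valid move immediate and only one other
   16-bit chunk is nonzero.  */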
2766
2767 /* Return whether imm is a 128-bit immediate which is simple enough to
2768 expand inline. */
2769 bool
2770 aarch64_mov128_immediate (rtx imm)
2771 {
2772 if (GET_CODE (imm) == CONST_INT)
2773 return true;
2774
2775 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2776
2777 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2778 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2779
2780 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2781 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2782 }
2783
2784
2785 /* Return the number of temporary registers that aarch64_add_offset_1
2786 would need to add OFFSET to a register. */
2787
2788 static unsigned int
2789 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2790 {
2791 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2792 }
2793
2794 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2795 a non-polynomial OFFSET. MODE is the mode of the addition.
2796 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2797 be set and CFA adjustments added to the generated instructions.
2798
2799 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2800 temporary if register allocation is already complete. This temporary
2801 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2802 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2803 the immediate again.
2804
2805 Since this function may be used to adjust the stack pointer, we must
2806 ensure that it cannot cause transient stack deallocation (for example
2807 by first incrementing SP and then decrementing when adjusting by a
2808 large immediate). */
2809
2810 static void
2811 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2812 rtx src, HOST_WIDE_INT offset, rtx temp1,
2813 bool frame_related_p, bool emit_move_imm)
2814 {
2815 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2816 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2817
2818 HOST_WIDE_INT moffset = abs_hwi (offset);
2819 rtx_insn *insn;
2820
2821 if (!moffset)
2822 {
2823 if (!rtx_equal_p (dest, src))
2824 {
2825 insn = emit_insn (gen_rtx_SET (dest, src));
2826 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2827 }
2828 return;
2829 }
2830
2831 /* Single instruction adjustment. */
2832 if (aarch64_uimm12_shift (moffset))
2833 {
2834 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2835 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2836 return;
2837 }
2838
2839 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2840 and either:
2841
2842 a) the offset cannot be loaded by a 16-bit move or
2843 b) there is no spare register into which we can move it. */
2844 if (moffset < 0x1000000
2845 && ((!temp1 && !can_create_pseudo_p ())
2846 || !aarch64_move_imm (moffset, mode)))
2847 {
2848 HOST_WIDE_INT low_off = moffset & 0xfff;
2849
2850 low_off = offset < 0 ? -low_off : low_off;
2851 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2852 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2853 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2854 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2855 return;
2856 }
2857
2858 /* Emit a move immediate if required and an addition/subtraction. */
2859 if (emit_move_imm)
2860 {
2861 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2862 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2863 }
2864 insn = emit_insn (offset < 0
2865 ? gen_sub3_insn (dest, src, temp1)
2866 : gen_add3_insn (dest, src, temp1));
2867 if (frame_related_p)
2868 {
2869 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2870 rtx adj = plus_constant (mode, src, offset);
2871 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2872 }
2873 }
2874
2875 /* Return the number of temporary registers that aarch64_add_offset
2876 would need to move OFFSET into a register or add OFFSET to a register;
2877 ADD_P is true if we want the latter rather than the former. */
2878
2879 static unsigned int
2880 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2881 {
2882 /* This follows the same structure as aarch64_add_offset. */
2883 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2884 return 0;
2885
2886 unsigned int count = 0;
2887 HOST_WIDE_INT factor = offset.coeffs[1];
2888 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2889 poly_int64 poly_offset (factor, factor);
2890 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2891 /* Need one register for the ADDVL/ADDPL result. */
2892 count += 1;
2893 else if (factor != 0)
2894 {
2895 factor = abs (factor);
2896 if (factor > 16 * (factor & -factor))
2897 /* Need one register for the CNT result and one for the multiplication
2898 factor. If necessary, the second temporary can be reused for the
2899 constant part of the offset. */
2900 return 2;
2901 /* Need one register for the CNT result (which might then
2902 be shifted). */
2903 count += 1;
2904 }
2905 return count + aarch64_add_offset_1_temporaries (constant);
2906 }
2907
2908 /* If X can be represented as a poly_int64, return the number
2909 of temporaries that are required to add it to a register.
2910 Return -1 otherwise. */
2911
2912 int
2913 aarch64_add_offset_temporaries (rtx x)
2914 {
2915 poly_int64 offset;
2916 if (!poly_int_rtx_p (x, &offset))
2917 return -1;
2918 return aarch64_offset_temporaries (true, offset);
2919 }
2920
2921 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2922 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2923 be set and CFA adjustments added to the generated instructions.
2924
2925 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2926 temporary if register allocation is already complete. This temporary
2927 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2928 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2929 false to avoid emitting the immediate again.
2930
2931 TEMP2, if nonnull, is a second temporary register that doesn't
2932 overlap either DEST or SRC.
2933
2934 Since this function may be used to adjust the stack pointer, we must
2935 ensure that it cannot cause transient stack deallocation (for example
2936 by first incrementing SP and then decrementing when adjusting by a
2937 large immediate). */
2938
2939 static void
2940 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2941 poly_int64 offset, rtx temp1, rtx temp2,
2942 bool frame_related_p, bool emit_move_imm = true)
2943 {
2944 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2945 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2946 gcc_assert (temp1 == NULL_RTX
2947 || !frame_related_p
2948 || !reg_overlap_mentioned_p (temp1, dest));
2949 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2950
2951 /* Try using ADDVL or ADDPL to add the whole value. */
2952 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2953 {
2954 rtx offset_rtx = gen_int_mode (offset, mode);
2955 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2956 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2957 return;
2958 }
2959
2960 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2961 SVE vector register, over and above the minimum size of 128 bits.
2962 This is equivalent to half the value returned by CNTD with a
2963 vector shape of ALL. */
2964 HOST_WIDE_INT factor = offset.coeffs[1];
2965 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2966
2967 /* Try using ADDVL or ADDPL to add the VG-based part. */
2968 poly_int64 poly_offset (factor, factor);
2969 if (src != const0_rtx
2970 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2971 {
2972 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2973 if (frame_related_p)
2974 {
2975 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2976 RTX_FRAME_RELATED_P (insn) = true;
2977 src = dest;
2978 }
2979 else
2980 {
2981 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2982 src = aarch64_force_temporary (mode, temp1, addr);
2983 temp1 = temp2;
2984 temp2 = NULL_RTX;
2985 }
2986 }
2987 /* Otherwise use a CNT-based sequence. */
2988 else if (factor != 0)
2989 {
2990 /* Use a subtraction if we have a negative factor. */
2991 rtx_code code = PLUS;
2992 if (factor < 0)
2993 {
2994 factor = -factor;
2995 code = MINUS;
2996 }
2997
2998 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2999 into the multiplication. */
3000 rtx val;
3001 int shift = 0;
3002 if (factor & 1)
3003 /* Use a right shift by 1. */
3004 shift = -1;
3005 else
3006 factor /= 2;
3007 HOST_WIDE_INT low_bit = factor & -factor;
3008 if (factor <= 16 * low_bit)
3009 {
3010 if (factor > 16 * 8)
3011 {
3012 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3013 the value with the minimum multiplier and shift it into
3014 position. */
3015 int extra_shift = exact_log2 (low_bit);
3016 shift += extra_shift;
3017 factor >>= extra_shift;
3018 }
3019 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3020 }
3021 else
3022 {
3023 /* Use CNTD, then multiply it by FACTOR. */
3024 val = gen_int_mode (poly_int64 (2, 2), mode);
3025 val = aarch64_force_temporary (mode, temp1, val);
3026
3027 /* Go back to using a negative multiplication factor if we have
3028 no register from which to subtract. */
3029 if (code == MINUS && src == const0_rtx)
3030 {
3031 factor = -factor;
3032 code = PLUS;
3033 }
3034 rtx coeff1 = gen_int_mode (factor, mode);
3035 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3036 val = gen_rtx_MULT (mode, val, coeff1);
3037 }
3038
3039 if (shift > 0)
3040 {
3041 /* Multiply by 1 << SHIFT. */
3042 val = aarch64_force_temporary (mode, temp1, val);
3043 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3044 }
3045 else if (shift == -1)
3046 {
3047 /* Divide by 2. */
3048 val = aarch64_force_temporary (mode, temp1, val);
3049 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3050 }
3051
3052 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3053 if (src != const0_rtx)
3054 {
3055 val = aarch64_force_temporary (mode, temp1, val);
3056 val = gen_rtx_fmt_ee (code, mode, src, val);
3057 }
3058 else if (code == MINUS)
3059 {
3060 val = aarch64_force_temporary (mode, temp1, val);
3061 val = gen_rtx_NEG (mode, val);
3062 }
3063
3064 if (constant == 0 || frame_related_p)
3065 {
3066 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3067 if (frame_related_p)
3068 {
3069 RTX_FRAME_RELATED_P (insn) = true;
3070 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3071 gen_rtx_SET (dest, plus_constant (Pmode, src,
3072 poly_offset)));
3073 }
3074 src = dest;
3075 if (constant == 0)
3076 return;
3077 }
3078 else
3079 {
3080 src = aarch64_force_temporary (mode, temp1, val);
3081 temp1 = temp2;
3082 temp2 = NULL_RTX;
3083 }
3084
3085 emit_move_imm = true;
3086 }
3087
3088 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3089 frame_related_p, emit_move_imm);
3090 }
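
/* Illustrative example (not part of the original sources): adding the
   offset poly_int64 (20, 16), i.e. one full SVE vector plus 4 bytes,
   is decomposed by the function above into FACTOR = 16 and
   CONSTANT = 4; when adjusting the stack pointer this gives roughly

	addvl	sp, sp, #1
	add	sp, sp, #4.  */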
3091
3092 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3093 than a poly_int64. */
3094
3095 void
3096 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3097 rtx offset_rtx, rtx temp1, rtx temp2)
3098 {
3099 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3100 temp1, temp2, false);
3101 }
3102
3103 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3104 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3105 if TEMP1 already contains abs (DELTA). */
3106
3107 static inline void
3108 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3109 {
3110 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3111 temp1, temp2, true, emit_move_imm);
3112 }
3113
3114 /* Subtract DELTA from the stack pointer, marking the instructions
3115 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3116 if nonnull. */
3117
3118 static inline void
3119 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3120 bool emit_move_imm = true)
3121 {
3122 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3123 temp1, temp2, frame_related_p, emit_move_imm);
3124 }
3125
3126 /* Set DEST to (vec_series BASE STEP). */
3127
3128 static void
3129 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3130 {
3131 machine_mode mode = GET_MODE (dest);
3132 scalar_mode inner = GET_MODE_INNER (mode);
3133
3134 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3135 if (!aarch64_sve_index_immediate_p (base))
3136 base = force_reg (inner, base);
3137 if (!aarch64_sve_index_immediate_p (step))
3138 step = force_reg (inner, step);
3139
3140 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3141 }
3142
3143 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3144 integer of mode SRC_MODE. Return true on success. */
3145
3146 static bool
3147 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3148 rtx src)
3149 {
3150 /* If the constant is smaller than 128 bits, we can do the move
3151 using a vector of SRC_MODEs. */
3152 if (src_mode != TImode)
3153 {
3154 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3155 GET_MODE_SIZE (src_mode));
3156 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3157 emit_move_insn (gen_lowpart (dup_mode, dest),
3158 gen_const_vec_duplicate (dup_mode, src));
3159 return true;
3160 }
3161
3162 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3163 src = force_const_mem (src_mode, src);
3164 if (!src)
3165 return false;
3166
3167 /* Make sure that the address is legitimate. */
3168 if (!aarch64_sve_ld1r_operand_p (src))
3169 {
3170 rtx addr = force_reg (Pmode, XEXP (src, 0));
3171 src = replace_equiv_address (src, addr);
3172 }
3173
3174 machine_mode mode = GET_MODE (dest);
3175 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3176 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3177 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3178 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3179 emit_insn (gen_rtx_SET (dest, src));
3180 return true;
3181 }
3182
3183 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3184 isn't a simple duplicate or series. */
3185
3186 static void
3187 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3188 {
3189 machine_mode mode = GET_MODE (src);
3190 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3191 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3192 gcc_assert (npatterns > 1);
3193
3194 if (nelts_per_pattern == 1)
3195 {
3196 /* The constant is a repeating sequence of at least two elements,
3197 where the repeating elements occupy no more than 128 bits.
3198 Get an integer representation of the replicated value. */
3199 scalar_int_mode int_mode;
3200 if (BYTES_BIG_ENDIAN)
3201 /* For now, always use LD1RQ to load the value on big-endian
3202 targets, since the handling of smaller integers includes a
3203 subreg that is semantically an element reverse. */
3204 int_mode = TImode;
3205 else
3206 {
3207 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3208 gcc_assert (int_bits <= 128);
3209 int_mode = int_mode_for_size (int_bits, 0).require ();
3210 }
3211 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3212 if (int_value
3213 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3214 return;
3215 }
3216
3217 /* Expand each pattern individually. */
3218 rtx_vector_builder builder;
3219 auto_vec<rtx, 16> vectors (npatterns);
3220 for (unsigned int i = 0; i < npatterns; ++i)
3221 {
3222 builder.new_vector (mode, 1, nelts_per_pattern);
3223 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3224 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3225 vectors.quick_push (force_reg (mode, builder.build ()));
3226 }
3227
3228 /* Use permutes to interleave the separate vectors. */
3229 while (npatterns > 1)
3230 {
3231 npatterns /= 2;
3232 for (unsigned int i = 0; i < npatterns; ++i)
3233 {
3234 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3235 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3236 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3237 vectors[i] = tmp;
3238 }
3239 }
3240 gcc_assert (vectors[0] == dest);
3241 }
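
/* Illustrative note (not part of the original sources): in the
   interleaving loop above, a constant with four patterns is first
   loaded as four single-pattern vectors V0..V3 and then combined as
   ZIP1 (V0, V2) -> T0, ZIP1 (V1, V3) -> T1 and finally
   ZIP1 (T0, T1) -> DEST, which restores the original element order.  */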
3242
3243 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3244 is a pattern that can be used to set DEST to a replicated scalar
3245 element. */
3246
3247 void
3248 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3249 rtx (*gen_vec_duplicate) (rtx, rtx))
3250 {
3251 machine_mode mode = GET_MODE (dest);
3252
3253 /* Check on what type of symbol it is. */
3254 scalar_int_mode int_mode;
3255 if ((GET_CODE (imm) == SYMBOL_REF
3256 || GET_CODE (imm) == LABEL_REF
3257 || GET_CODE (imm) == CONST
3258 || GET_CODE (imm) == CONST_POLY_INT)
3259 && is_a <scalar_int_mode> (mode, &int_mode))
3260 {
3261 rtx mem;
3262 poly_int64 offset;
3263 HOST_WIDE_INT const_offset;
3264 enum aarch64_symbol_type sty;
3265
3266 /* If we have (const (plus symbol offset)), separate out the offset
3267 before we start classifying the symbol. */
3268 rtx base = strip_offset (imm, &offset);
3269
3270 /* We must always add an offset involving VL separately, rather than
3271 folding it into the relocation. */
3272 if (!offset.is_constant (&const_offset))
3273 {
3274 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3275 emit_insn (gen_rtx_SET (dest, imm));
3276 else
3277 {
3278 /* Do arithmetic on 32-bit values if the result is smaller
3279 than that. */
3280 if (partial_subreg_p (int_mode, SImode))
3281 {
3282 /* It is invalid to do symbol calculations in modes
3283 narrower than SImode. */
3284 gcc_assert (base == const0_rtx);
3285 dest = gen_lowpart (SImode, dest);
3286 int_mode = SImode;
3287 }
3288 if (base != const0_rtx)
3289 {
3290 base = aarch64_force_temporary (int_mode, dest, base);
3291 aarch64_add_offset (int_mode, dest, base, offset,
3292 NULL_RTX, NULL_RTX, false);
3293 }
3294 else
3295 aarch64_add_offset (int_mode, dest, base, offset,
3296 dest, NULL_RTX, false);
3297 }
3298 return;
3299 }
3300
3301 sty = aarch64_classify_symbol (base, const_offset);
3302 switch (sty)
3303 {
3304 case SYMBOL_FORCE_TO_MEM:
3305 if (const_offset != 0
3306 && targetm.cannot_force_const_mem (int_mode, imm))
3307 {
3308 gcc_assert (can_create_pseudo_p ());
3309 base = aarch64_force_temporary (int_mode, dest, base);
3310 aarch64_add_offset (int_mode, dest, base, const_offset,
3311 NULL_RTX, NULL_RTX, false);
3312 return;
3313 }
3314
3315 mem = force_const_mem (ptr_mode, imm);
3316 gcc_assert (mem);
3317
3318 /* If we aren't generating PC relative literals, then
3319 we need to expand the literal pool access carefully.
3320 This is something that needs to be done in a number
3321 of places, so could well live as a separate function. */
3322 if (!aarch64_pcrelative_literal_loads)
3323 {
3324 gcc_assert (can_create_pseudo_p ());
3325 base = gen_reg_rtx (ptr_mode);
3326 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3327 if (ptr_mode != Pmode)
3328 base = convert_memory_address (Pmode, base);
3329 mem = gen_rtx_MEM (ptr_mode, base);
3330 }
3331
3332 if (int_mode != ptr_mode)
3333 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3334
3335 emit_insn (gen_rtx_SET (dest, mem));
3336
3337 return;
3338
3339 case SYMBOL_SMALL_TLSGD:
3340 case SYMBOL_SMALL_TLSDESC:
3341 case SYMBOL_SMALL_TLSIE:
3342 case SYMBOL_SMALL_GOT_28K:
3343 case SYMBOL_SMALL_GOT_4G:
3344 case SYMBOL_TINY_GOT:
3345 case SYMBOL_TINY_TLSIE:
3346 if (const_offset != 0)
3347 {
3348 gcc_assert (can_create_pseudo_p ());
3349 base = aarch64_force_temporary (int_mode, dest, base);
3350 aarch64_add_offset (int_mode, dest, base, const_offset,
3351 NULL_RTX, NULL_RTX, false);
3352 return;
3353 }
3354 /* FALLTHRU */
3355
3356 case SYMBOL_SMALL_ABSOLUTE:
3357 case SYMBOL_TINY_ABSOLUTE:
3358 case SYMBOL_TLSLE12:
3359 case SYMBOL_TLSLE24:
3360 case SYMBOL_TLSLE32:
3361 case SYMBOL_TLSLE48:
3362 aarch64_load_symref_appropriately (dest, imm, sty);
3363 return;
3364
3365 default:
3366 gcc_unreachable ();
3367 }
3368 }
3369
3370 if (!CONST_INT_P (imm))
3371 {
3372 rtx base, step, value;
3373 if (GET_CODE (imm) == HIGH
3374 || aarch64_simd_valid_immediate (imm, NULL))
3375 emit_insn (gen_rtx_SET (dest, imm));
3376 else if (const_vec_series_p (imm, &base, &step))
3377 aarch64_expand_vec_series (dest, base, step);
3378 else if (const_vec_duplicate_p (imm, &value))
3379 {
3380 /* If the constant is out of range of an SVE vector move,
3381 load it from memory if we can, otherwise move it into
3382 a register and use a DUP. */
3383 scalar_mode inner_mode = GET_MODE_INNER (mode);
3384 rtx op = force_const_mem (inner_mode, value);
3385 if (!op)
3386 op = force_reg (inner_mode, value);
3387 else if (!aarch64_sve_ld1r_operand_p (op))
3388 {
3389 rtx addr = force_reg (Pmode, XEXP (op, 0));
3390 op = replace_equiv_address (op, addr);
3391 }
3392 emit_insn (gen_vec_duplicate (dest, op));
3393 }
3394 else if (GET_CODE (imm) == CONST_VECTOR
3395 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3396 aarch64_expand_sve_const_vector (dest, imm);
3397 else
3398 {
3399 rtx mem = force_const_mem (mode, imm);
3400 gcc_assert (mem);
3401 emit_move_insn (dest, mem);
3402 }
3403
3404 return;
3405 }
3406
3407 aarch64_internal_mov_immediate (dest, imm, true,
3408 as_a <scalar_int_mode> (mode));
3409 }
3410
3411 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3412 that is known to contain PTRUE. */
3413
3414 void
3415 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3416 {
3417 expand_operand ops[3];
3418 machine_mode mode = GET_MODE (dest);
3419 create_output_operand (&ops[0], dest, mode);
3420 create_input_operand (&ops[1], pred, GET_MODE (pred));
3421 create_input_operand (&ops[2], src, mode);
3422 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3423 }
3424
3425 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3426 operand is in memory. In this case we need to use the predicated LD1
3427 and ST1 instead of LDR and STR, both for correctness on big-endian
3428 targets and because LD1 and ST1 support a wider range of addressing modes.
3429 PRED_MODE is the mode of the predicate.
3430
3431 See the comment at the head of aarch64-sve.md for details about the
3432 big-endian handling. */
3433
3434 void
3435 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3436 {
3437 machine_mode mode = GET_MODE (dest);
3438 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3439 if (!register_operand (src, mode)
3440 && !register_operand (dest, mode))
3441 {
3442 rtx tmp = gen_reg_rtx (mode);
3443 if (MEM_P (src))
3444 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3445 else
3446 emit_move_insn (tmp, src);
3447 src = tmp;
3448 }
3449 aarch64_emit_sve_pred_move (dest, ptrue, src);
3450 }
3451
3452 /* Called only on big-endian targets. See whether an SVE vector move
3453 from SRC to DEST is effectively a REV[BHW] instruction, because at
3454 least one operand is a subreg of an SVE vector that has wider or
3455 narrower elements. Return true and emit the instruction if so.
3456
3457 For example:
3458
3459 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3460
3461 represents a VIEW_CONVERT between the following vectors, viewed
3462 in memory order:
3463
3464 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3465 R1: { [0], [1], [2], [3], ... }
3466
3467 The high part of lane X in R2 should therefore correspond to lane X*2
3468 of R1, but the register representations are:
3469
3470          msb                                   lsb
3471      R2: ...... [1].high  [1].low   [0].high  [0].low
3472      R1: ...... [3]       [2]       [1]       [0]
3473
3474 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3475 We therefore need a reverse operation to swap the high and low values
3476 around.
3477
3478 This is purely an optimization. Without it we would spill the
3479 subreg operand to the stack in one mode and reload it in the
3480 other mode, which has the same effect as the REV. */
3481
3482 bool
3483 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3484 {
3485 gcc_assert (BYTES_BIG_ENDIAN);
3486 if (GET_CODE (dest) == SUBREG)
3487 dest = SUBREG_REG (dest);
3488 if (GET_CODE (src) == SUBREG)
3489 src = SUBREG_REG (src);
3490
3491 /* The optimization handles two single SVE REGs with different element
3492 sizes. */
3493 if (!REG_P (dest)
3494 || !REG_P (src)
3495 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3496 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3497 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3498 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3499 return false;
3500
3501 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3502 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3503 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3504 UNSPEC_REV_SUBREG);
3505 emit_insn (gen_rtx_SET (dest, unspec));
3506 return true;
3507 }
3508
3509 /* Return a copy of X with mode MODE, without changing its other
3510 attributes. Unlike gen_lowpart, this doesn't care whether the
3511 mode change is valid. */
3512
3513 static rtx
3514 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3515 {
3516 if (GET_MODE (x) == mode)
3517 return x;
3518
3519 x = shallow_copy_rtx (x);
3520 set_mode_and_regno (x, mode, REGNO (x));
3521 return x;
3522 }
3523
3524 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3525 operands. */
3526
3527 void
3528 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3529 {
3530 /* Decide which REV operation we need. The mode with narrower elements
3531 determines the mode of the operands and the mode with the wider
3532 elements determines the reverse width. */
3533 machine_mode mode_with_wider_elts = GET_MODE (dest);
3534 machine_mode mode_with_narrower_elts = GET_MODE (src);
3535 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3536 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3537 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3538
3539 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3540 unsigned int unspec;
3541 if (wider_bytes == 8)
3542 unspec = UNSPEC_REV64;
3543 else if (wider_bytes == 4)
3544 unspec = UNSPEC_REV32;
3545 else if (wider_bytes == 2)
3546 unspec = UNSPEC_REV16;
3547 else
3548 gcc_unreachable ();
3549 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3550
3551 /* Emit:
3552
3553 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3554 UNSPEC_MERGE_PTRUE))
3555
3556 with the appropriate modes. */
3557 ptrue = gen_lowpart (pred_mode, ptrue);
3558 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3559 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3560 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3561 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3562 UNSPEC_MERGE_PTRUE);
3563 emit_insn (gen_rtx_SET (dest, src));
3564 }
3565
3566 static bool
3567 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3568 tree exp ATTRIBUTE_UNUSED)
3569 {
3570 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3571 return false;
3572
3573 return true;
3574 }
3575
3576 /* Implement TARGET_PASS_BY_REFERENCE. */
3577
3578 static bool
3579 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3580 machine_mode mode,
3581 const_tree type,
3582 bool named ATTRIBUTE_UNUSED)
3583 {
3584 HOST_WIDE_INT size;
3585 machine_mode dummymode;
3586 int nregs;
3587
3588 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3589 if (mode == BLKmode && type)
3590 size = int_size_in_bytes (type);
3591 else
3592 /* No frontends can create types with variable-sized modes, so we
3593 shouldn't be asked to pass or return them. */
3594 size = GET_MODE_SIZE (mode).to_constant ();
3595
3596 /* Aggregates are passed by reference based on their size. */
3597 if (type && AGGREGATE_TYPE_P (type))
3598 {
3599 size = int_size_in_bytes (type);
3600 }
3601
3602 /* Variable-sized arguments are always passed by reference. */
3603 if (size < 0)
3604 return true;
3605
3606 /* Can this be a candidate to be passed in fp/simd register(s)? */
3607 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3608 &dummymode, &nregs,
3609 NULL))
3610 return false;
3611
3612 /* Arguments which are variable sized or larger than 2 registers are
3613 passed by reference unless they are a homogeneous floating-point
3614 aggregate. */
3615 return size > 2 * UNITS_PER_WORD;
3616 }
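
/* Illustrative examples (hypothetical C types, not taken from the code):
   - struct { char c[32]; } is 32 bytes and not an HFA, so it is passed
     by reference.
   - struct { double d[4]; } is an HFA of four doubles, so it is a
     candidate for the SIMD/FP registers and is not passed by reference.
   - __int128 and struct { long a, b; } are 16 bytes, which fits in two
     general registers, so they are passed by value. */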
3617
3618 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3619 static bool
3620 aarch64_return_in_msb (const_tree valtype)
3621 {
3622 machine_mode dummy_mode;
3623 int dummy_int;
3624
3625 /* Never happens in little-endian mode. */
3626 if (!BYTES_BIG_ENDIAN)
3627 return false;
3628
3629 /* Only composite types smaller than or equal to 16 bytes can
3630 be potentially returned in registers. */
3631 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3632 || int_size_in_bytes (valtype) <= 0
3633 || int_size_in_bytes (valtype) > 16)
3634 return false;
3635
3636 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3637 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3638 is always passed/returned in the least significant bits of fp/simd
3639 register(s). */
3640 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3641 &dummy_mode, &dummy_int, NULL))
3642 return false;
3643
3644 return true;
3645 }
3646
3647 /* Implement TARGET_FUNCTION_VALUE.
3648 Define how to find the value returned by a function. */
3649
3650 static rtx
3651 aarch64_function_value (const_tree type, const_tree func,
3652 bool outgoing ATTRIBUTE_UNUSED)
3653 {
3654 machine_mode mode;
3655 int unsignedp;
3656 int count;
3657 machine_mode ag_mode;
3658
3659 mode = TYPE_MODE (type);
3660 if (INTEGRAL_TYPE_P (type))
3661 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3662
3663 if (aarch64_return_in_msb (type))
3664 {
3665 HOST_WIDE_INT size = int_size_in_bytes (type);
3666
3667 if (size % UNITS_PER_WORD != 0)
3668 {
3669 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3670 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3671 }
3672 }
3673
3674 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3675 &ag_mode, &count, NULL))
3676 {
3677 if (!aarch64_composite_type_p (type, mode))
3678 {
3679 gcc_assert (count == 1 && mode == ag_mode);
3680 return gen_rtx_REG (mode, V0_REGNUM);
3681 }
3682 else
3683 {
3684 int i;
3685 rtx par;
3686
3687 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3688 for (i = 0; i < count; i++)
3689 {
3690 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3691 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3692 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3693 XVECEXP (par, 0, i) = tmp;
3694 }
3695 return par;
3696 }
3697 }
3698 else
3699 return gen_rtx_REG (mode, R0_REGNUM);
3700 }
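
/* Illustrative example (hypothetical type): returning
     struct { float x, y; }
   gives an HFA with ag_mode == SFmode and count == 2, so the code above
   builds a PARALLEL of s0 and s1 with byte offsets 0 and 4. A plain
   integer return instead uses a single REG based at R0_REGNUM. */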
3701
3702 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3703 Return true if REGNO is the number of a hard register in which the values
3704 of a called function may come back. */
3705
3706 static bool
3707 aarch64_function_value_regno_p (const unsigned int regno)
3708 {
3709 /* A maximum of 16 bytes can be returned in the general registers. Examples
3710 of 16-byte return values are: 128-bit integers and 16-byte small
3711 structures (excluding homogeneous floating-point aggregates). */
3712 if (regno == R0_REGNUM || regno == R1_REGNUM)
3713 return true;
3714
3715 /* Up to four fp/simd registers can return a function value, e.g. a
3716 homogeneous floating-point aggregate having four members. */
3717 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3718 return TARGET_FLOAT;
3719
3720 return false;
3721 }
3722
3723 /* Implement TARGET_RETURN_IN_MEMORY.
3724
3725 If the type T of the result of a function is such that
3726 void func (T arg)
3727 would require that arg be passed as a value in a register (or set of
3728 registers) according to the parameter passing rules, then the result
3729 is returned in the same registers as would be used for such an
3730 argument. */
3731
3732 static bool
3733 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3734 {
3735 HOST_WIDE_INT size;
3736 machine_mode ag_mode;
3737 int count;
3738
3739 if (!AGGREGATE_TYPE_P (type)
3740 && TREE_CODE (type) != COMPLEX_TYPE
3741 && TREE_CODE (type) != VECTOR_TYPE)
3742 /* Simple scalar types are always returned in registers. */
3743 return false;
3744
3745 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3746 type,
3747 &ag_mode,
3748 &count,
3749 NULL))
3750 return false;
3751
3752 /* Types larger than 2 registers are returned in memory. */
3753 size = int_size_in_bytes (type);
3754 return (size < 0 || size > 2 * UNITS_PER_WORD);
3755 }
3756
3757 static bool
3758 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3759 const_tree type, int *nregs)
3760 {
3761 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3762 return aarch64_vfp_is_call_or_return_candidate (mode,
3763 type,
3764 &pcum->aapcs_vfp_rmode,
3765 nregs,
3766 NULL);
3767 }
3768
3769 /* Given MODE and TYPE of a function argument, return the alignment in
3770 bits. The idea is to suppress any stronger alignment requested by
3771 the user and opt for the natural alignment (specified in AAPCS64 \S
3772 4.1). ABI_BREAK is set to true if the alignment was incorrectly
3773 calculated in versions of GCC prior to GCC-9. This is a helper
3774 function for local use only. */
3775
3776 static unsigned int
3777 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
3778 bool *abi_break)
3779 {
3780 *abi_break = false;
3781 if (!type)
3782 return GET_MODE_ALIGNMENT (mode);
3783
3784 if (integer_zerop (TYPE_SIZE (type)))
3785 return 0;
3786
3787 gcc_assert (TYPE_MODE (type) == mode);
3788
3789 if (!AGGREGATE_TYPE_P (type))
3790 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3791
3792 if (TREE_CODE (type) == ARRAY_TYPE)
3793 return TYPE_ALIGN (TREE_TYPE (type));
3794
3795 unsigned int alignment = 0;
3796 unsigned int bitfield_alignment = 0;
3797 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3798 if (TREE_CODE (field) == FIELD_DECL)
3799 {
3800 alignment = std::max (alignment, DECL_ALIGN (field));
3801 if (DECL_BIT_FIELD_TYPE (field))
3802 bitfield_alignment
3803 = std::max (bitfield_alignment,
3804 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
3805 }
3806
3807 if (bitfield_alignment > alignment)
3808 {
3809 *abi_break = true;
3810 return bitfield_alignment;
3811 }
3812
3813 return alignment;
3814 }
3815
3816 /* Layout a function argument according to the AAPCS64 rules. The rule
3817 numbers refer to the rule numbers in the AAPCS64. */
3818
3819 static void
3820 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3821 const_tree type,
3822 bool named ATTRIBUTE_UNUSED)
3823 {
3824 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3825 int ncrn, nvrn, nregs;
3826 bool allocate_ncrn, allocate_nvrn;
3827 HOST_WIDE_INT size;
3828 bool abi_break;
3829
3830 /* We need to do this once per argument. */
3831 if (pcum->aapcs_arg_processed)
3832 return;
3833
3834 pcum->aapcs_arg_processed = true;
3835
3836 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3837 if (type)
3838 size = int_size_in_bytes (type);
3839 else
3840 /* No frontends can create types with variable-sized modes, so we
3841 shouldn't be asked to pass or return them. */
3842 size = GET_MODE_SIZE (mode).to_constant ();
3843 size = ROUND_UP (size, UNITS_PER_WORD);
3844
3845 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3846 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3847 mode,
3848 type,
3849 &nregs);
3850
3851 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3852 The following code thus handles passing by SIMD/FP registers first. */
3853
3854 nvrn = pcum->aapcs_nvrn;
3855
3856 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3857 and homogeneous short-vector aggregates (HVA). */
3858 if (allocate_nvrn)
3859 {
3860 if (!TARGET_FLOAT)
3861 aarch64_err_no_fpadvsimd (mode);
3862
3863 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3864 {
3865 pcum->aapcs_nextnvrn = nvrn + nregs;
3866 if (!aarch64_composite_type_p (type, mode))
3867 {
3868 gcc_assert (nregs == 1);
3869 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3870 }
3871 else
3872 {
3873 rtx par;
3874 int i;
3875 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3876 for (i = 0; i < nregs; i++)
3877 {
3878 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3879 V0_REGNUM + nvrn + i);
3880 rtx offset = gen_int_mode
3881 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3882 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3883 XVECEXP (par, 0, i) = tmp;
3884 }
3885 pcum->aapcs_reg = par;
3886 }
3887 return;
3888 }
3889 else
3890 {
3891 /* C.3 NSRN is set to 8. */
3892 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3893 goto on_stack;
3894 }
3895 }
3896
3897 ncrn = pcum->aapcs_ncrn;
3898 nregs = size / UNITS_PER_WORD;
3899
3900 /* C6 - C9, though the sign and zero extension semantics are
3901 handled elsewhere. This is the case where the argument fits
3902 entirely in general registers. */
3903 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3904 {
3905 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3906
3907 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
3908 rounded up to the next even number. */
3909 if (nregs == 2
3910 && ncrn % 2
3911 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3912 comparison is there because for > 16 * BITS_PER_UNIT
3913 alignment nregs should be > 2 and therefore it should be
3914 passed by reference rather than value. */
3915 && (aarch64_function_arg_alignment (mode, type, &abi_break)
3916 == 16 * BITS_PER_UNIT))
3917 {
3918 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3919 inform (input_location, "parameter passing for argument of type "
3920 "%qT changed in GCC 9.1", type);
3921 ++ncrn;
3922 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3923 }
3924
3925 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3926 A reg is still generated for it, but the caller should be smart
3927 enough not to use it. */
3928 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3929 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3930 else
3931 {
3932 rtx par;
3933 int i;
3934
3935 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3936 for (i = 0; i < nregs; i++)
3937 {
3938 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3939 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3940 GEN_INT (i * UNITS_PER_WORD));
3941 XVECEXP (par, 0, i) = tmp;
3942 }
3943 pcum->aapcs_reg = par;
3944 }
3945
3946 pcum->aapcs_nextncrn = ncrn + nregs;
3947 return;
3948 }
3949
3950 /* C.11 */
3951 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3952
3953 /* The argument is passed on stack; record the needed number of words for
3954 this argument and align the total size if necessary. */
3955 on_stack:
3956 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3957
3958 if (aarch64_function_arg_alignment (mode, type, &abi_break)
3959 == 16 * BITS_PER_UNIT)
3960 {
3961 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
3962 if (pcum->aapcs_stack_size != new_size)
3963 {
3964 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3965 inform (input_location, "parameter passing for argument of type "
3966 "%qT changed in GCC 9.1", type);
3967 pcum->aapcs_stack_size = new_size;
3968 }
3969 }
3970 return;
3971 }
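
/* Worked example (hypothetical prototype, for illustration only):
     void f (int a, double b, __int128 c);
   a is allocated to w0 and b to d0; c needs two general registers and
   has 16-byte alignment, so rule C.8 above rounds its NGRN up from 1 to
   2 and it is passed in the even/odd pair x2/x3 rather than x1/x2. */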
3972
3973 /* Implement TARGET_FUNCTION_ARG. */
3974
3975 static rtx
3976 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3977 const_tree type, bool named)
3978 {
3979 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3980 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3981
3982 if (mode == VOIDmode)
3983 return NULL_RTX;
3984
3985 aarch64_layout_arg (pcum_v, mode, type, named);
3986 return pcum->aapcs_reg;
3987 }
3988
3989 void
3990 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3991 const_tree fntype ATTRIBUTE_UNUSED,
3992 rtx libname ATTRIBUTE_UNUSED,
3993 const_tree fndecl ATTRIBUTE_UNUSED,
3994 unsigned n_named ATTRIBUTE_UNUSED)
3995 {
3996 pcum->aapcs_ncrn = 0;
3997 pcum->aapcs_nvrn = 0;
3998 pcum->aapcs_nextncrn = 0;
3999 pcum->aapcs_nextnvrn = 0;
4000 pcum->pcs_variant = ARM_PCS_AAPCS64;
4001 pcum->aapcs_reg = NULL_RTX;
4002 pcum->aapcs_arg_processed = false;
4003 pcum->aapcs_stack_words = 0;
4004 pcum->aapcs_stack_size = 0;
4005
4006 if (!TARGET_FLOAT
4007 && fndecl && TREE_PUBLIC (fndecl)
4008 && fntype && fntype != error_mark_node)
4009 {
4010 const_tree type = TREE_TYPE (fntype);
4011 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4012 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4013 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4014 &mode, &nregs, NULL))
4015 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4016 }
4017 return;
4018 }
4019
4020 static void
4021 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4022 machine_mode mode,
4023 const_tree type,
4024 bool named)
4025 {
4026 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4027 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4028 {
4029 aarch64_layout_arg (pcum_v, mode, type, named);
4030 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4031 != (pcum->aapcs_stack_words != 0));
4032 pcum->aapcs_arg_processed = false;
4033 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4034 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4035 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4036 pcum->aapcs_stack_words = 0;
4037 pcum->aapcs_reg = NULL_RTX;
4038 }
4039 }
4040
4041 bool
4042 aarch64_function_arg_regno_p (unsigned regno)
4043 {
4044 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4045 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4046 }
4047
4048 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4049 PARM_BOUNDARY bits of alignment, but will be given anything up
4050 to STACK_BOUNDARY bits if the type requires it. This makes sure
4051 that both before and after the layout of each argument, the Next
4052 Stacked Argument Address (NSAA) will have a minimum alignment of
4053 8 bytes. */
4054
4055 static unsigned int
4056 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4057 {
4058 bool abi_break;
4059 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4060 &abi_break);
4061 if (abi_break && warn_psabi)
4062 inform (input_location, "parameter passing for argument of type "
4063 "%qT changed in GCC 9.1", type);
4064
4065 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4066 }
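
/* For illustration: with PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128
   on this target, a char argument still gets a 64-bit boundary, while a
   type declared with 256-bit user alignment is clamped to a 128-bit
   boundary. */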
4067
4068 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4069
4070 static fixed_size_mode
4071 aarch64_get_reg_raw_mode (int regno)
4072 {
4073 if (TARGET_SVE && FP_REGNUM_P (regno))
4074 /* Don't use the SVE part of the register for __builtin_apply and
4075 __builtin_return. The SVE registers aren't used by the normal PCS,
4076 so using them there would be a waste of time. The PCS extensions
4077 for SVE types are fundamentally incompatible with the
4078 __builtin_return/__builtin_apply interface. */
4079 return as_a <fixed_size_mode> (V16QImode);
4080 return default_get_reg_raw_mode (regno);
4081 }
4082
4083 /* Implement TARGET_FUNCTION_ARG_PADDING.
4084
4085 Small aggregate types are placed in the lowest memory address.
4086
4087 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4088
4089 static pad_direction
4090 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4091 {
4092 /* On little-endian targets, the least significant byte of every stack
4093 argument is passed at the lowest byte address of the stack slot. */
4094 if (!BYTES_BIG_ENDIAN)
4095 return PAD_UPWARD;
4096
4097 /* Otherwise, integral, floating-point and pointer types are padded downward:
4098 the least significant byte of a stack argument is passed at the highest
4099 byte address of the stack slot. */
4100 if (type
4101 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4102 || POINTER_TYPE_P (type))
4103 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4104 return PAD_DOWNWARD;
4105
4106 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4107 return PAD_UPWARD;
4108 }
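
/* For example, on a big-endian target a char argument passed on the
   stack is padded downward, so its value sits in the highest-addressed
   byte of the slot, whereas a 3-byte struct is padded upward and starts
   at the lowest address of the slot. */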
4109
4110 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4111
4112 It specifies padding for the last (possibly the only)
4113 element of a block move between registers and memory. Assuming
4114 the block is in memory, padding upward means that the last
4115 element is padded after its highest significant byte, while in
4116 downward padding, the last element is padded at its least
4117 significant byte side.
4118
4119 Small aggregates and small complex types are always padded
4120 upwards.
4121
4122 We don't need to worry about homogeneous floating-point or
4123 short-vector aggregates; their move is not affected by the
4124 padding direction determined here. Regardless of endianness,
4125 each element of such an aggregate is put in the least
4126 significant bits of a fp/simd register.
4127
4128 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4129 register has useful data, and return the opposite if the most
4130 significant byte does. */
4131
4132 bool
4133 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4134 bool first ATTRIBUTE_UNUSED)
4135 {
4136
4137 /* Small composite types are always padded upward. */
4138 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4139 {
4140 HOST_WIDE_INT size;
4141 if (type)
4142 size = int_size_in_bytes (type);
4143 else
4144 /* No frontends can create types with variable-sized modes, so we
4145 shouldn't be asked to pass or return them. */
4146 size = GET_MODE_SIZE (mode).to_constant ();
4147 if (size < 2 * UNITS_PER_WORD)
4148 return true;
4149 }
4150
4151 /* Otherwise, use the default padding. */
4152 return !BYTES_BIG_ENDIAN;
4153 }
4154
4155 static scalar_int_mode
4156 aarch64_libgcc_cmp_return_mode (void)
4157 {
4158 return SImode;
4159 }
4160
4161 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4162
4163 /* We use the 12-bit shifted immediate arithmetic instructions so values
4164 must be a multiple of (1 << 12), i.e. 4096. */
4165 #define ARITH_FACTOR 4096
4166
4167 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4168 #error Cannot use simple address calculation for stack probing
4169 #endif
4170
4171 /* The pair of scratch registers used for stack probing. */
4172 #define PROBE_STACK_FIRST_REG R9_REGNUM
4173 #define PROBE_STACK_SECOND_REG R10_REGNUM
4174
4175 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4176 inclusive. These are offsets from the current stack pointer. */
4177
4178 static void
4179 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4180 {
4181 HOST_WIDE_INT size;
4182 if (!poly_size.is_constant (&size))
4183 {
4184 sorry ("stack probes for SVE frames");
4185 return;
4186 }
4187
4188 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4189
4190 /* See the same assertion on PROBE_INTERVAL above. */
4191 gcc_assert ((first % ARITH_FACTOR) == 0);
4192
4193 /* See if we have a constant small number of probes to generate. If so,
4194 that's the easy case. */
4195 if (size <= PROBE_INTERVAL)
4196 {
4197 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4198
4199 emit_set_insn (reg1,
4200 plus_constant (Pmode,
4201 stack_pointer_rtx, -(first + base)));
4202 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4203 }
4204
4205 /* The run-time loop is made up of 8 insns in the generic case while the
4206 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
4207 else if (size <= 4 * PROBE_INTERVAL)
4208 {
4209 HOST_WIDE_INT i, rem;
4210
4211 emit_set_insn (reg1,
4212 plus_constant (Pmode,
4213 stack_pointer_rtx,
4214 -(first + PROBE_INTERVAL)));
4215 emit_stack_probe (reg1);
4216
4217 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4218 it exceeds SIZE. If only two probes are needed, this will not
4219 generate any code. Then probe at FIRST + SIZE. */
4220 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4221 {
4222 emit_set_insn (reg1,
4223 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4224 emit_stack_probe (reg1);
4225 }
4226
4227 rem = size - (i - PROBE_INTERVAL);
4228 if (rem > 256)
4229 {
4230 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4231
4232 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4233 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4234 }
4235 else
4236 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4237 }
4238
4239 /* Otherwise, do the same as above, but in a loop. Note that we must be
4240 extra careful with variables wrapping around because we might be at
4241 the very top (or the very bottom) of the address space and we have
4242 to be able to handle this case properly; in particular, we use an
4243 equality test for the loop condition. */
4244 else
4245 {
4246 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4247
4248 /* Step 1: round SIZE to the previous multiple of the interval. */
4249
4250 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4251
4252
4253 /* Step 2: compute initial and final value of the loop counter. */
4254
4255 /* TEST_ADDR = SP + FIRST. */
4256 emit_set_insn (reg1,
4257 plus_constant (Pmode, stack_pointer_rtx, -first));
4258
4259 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4260 HOST_WIDE_INT adjustment = - (first + rounded_size);
4261 if (! aarch64_uimm12_shift (adjustment))
4262 {
4263 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4264 true, Pmode);
4265 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4266 }
4267 else
4268 emit_set_insn (reg2,
4269 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4270
4271 /* Step 3: the loop
4272
4273 do
4274 {
4275 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4276 probe at TEST_ADDR
4277 }
4278 while (TEST_ADDR != LAST_ADDR)
4279
4280 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4281 until it is equal to ROUNDED_SIZE. */
4282
4283 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4284
4285
4286 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4287 that SIZE is equal to ROUNDED_SIZE. */
4288
4289 if (size != rounded_size)
4290 {
4291 HOST_WIDE_INT rem = size - rounded_size;
4292
4293 if (rem > 256)
4294 {
4295 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4296
4297 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4298 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4299 }
4300 else
4301 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4302 }
4303 }
4304
4305 /* Make sure nothing is scheduled before we are done. */
4306 emit_insn (gen_blockage ());
4307 }
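
/* Worked example (assuming the default 4096-byte PROBE_INTERVAL and
   FIRST == 0): for SIZE == 12288 the SIZE <= 4 * PROBE_INTERVAL case
   applies, and the code above probes at sp-4096, then at sp-8192 from
   the unrolled loop, and finally, because the 4096-byte remainder
   exceeds 256, at sp-12288 via one more subtract-and-probe. */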
4308
4309 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4310 absolute addresses. */
4311
4312 const char *
4313 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4314 {
4315 static int labelno = 0;
4316 char loop_lab[32];
4317 rtx xops[2];
4318
4319 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4320
4321 /* Loop. */
4322 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4323
4324 HOST_WIDE_INT stack_clash_probe_interval
4325 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4326
4327 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4328 xops[0] = reg1;
4329 HOST_WIDE_INT interval;
4330 if (flag_stack_clash_protection)
4331 interval = stack_clash_probe_interval;
4332 else
4333 interval = PROBE_INTERVAL;
4334
4335 gcc_assert (aarch64_uimm12_shift (interval));
4336 xops[1] = GEN_INT (interval);
4337
4338 output_asm_insn ("sub\t%0, %0, %1", xops);
4339
4340 /* If doing stack clash protection then we probe up by the ABI-specified
4341 amount. We do this because we're dropping full pages at a time in the
4342 loop. But if we're doing non-stack-clash probing, probe at offset 0. */
4343 if (flag_stack_clash_protection)
4344 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4345 else
4346 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4347
4348 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4349 by this amount for each iteration. */
4350 output_asm_insn ("str\txzr, [%0, %1]", xops);
4351
4352 /* Test if TEST_ADDR == LAST_ADDR. */
4353 xops[1] = reg2;
4354 output_asm_insn ("cmp\t%0, %1", xops);
4355
4356 /* Branch. */
4357 fputs ("\tb.ne\t", asm_out_file);
4358 assemble_name_raw (asm_out_file, loop_lab);
4359 fputc ('\n', asm_out_file);
4360
4361 return "";
4362 }
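
/* Roughly, in the non-stack-clash case and with the x9/x10 scratch
   registers chosen by aarch64_emit_probe_stack_range, the loop emitted
   above looks like:

       .LPSRL0:
               sub     x9, x9, 4096
               str     xzr, [x9, 0]
               cmp     x9, x10
               b.ne    .LPSRL0

   (illustrative only; the interval and the probe offset differ when
   -fstack-clash-protection is enabled). */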
4363
4364 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4365 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4366 of GUARD_SIZE. When a probe is emitted it is done at most
4367 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4368 at most MIN_PROBE_THRESHOLD. By the end of this function
4369 BASE = BASE - ADJUSTMENT. */
4370
4371 const char *
4372 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4373 rtx min_probe_threshold, rtx guard_size)
4374 {
4375 /* This function is not allowed to use any instruction generation function
4376 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4377 so instead emit the code you want using output_asm_insn. */
4378 gcc_assert (flag_stack_clash_protection);
4379 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4380 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4381
4382 /* The minimum required allocation before the residual requires probing. */
4383 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4384
4385 /* Clamp the value down to the nearest value that can be used with a cmp. */
4386 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4387 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4388
4389 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4390 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4391
4392 static int labelno = 0;
4393 char loop_start_lab[32];
4394 char loop_end_lab[32];
4395 rtx xops[2];
4396
4397 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4398 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4399
4400 /* Emit loop start label. */
4401 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4402
4403 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4404 xops[0] = adjustment;
4405 xops[1] = probe_offset_value_rtx;
4406 output_asm_insn ("cmp\t%0, %1", xops);
4407
4408 /* Branch to end if not enough adjustment to probe. */
4409 fputs ("\tb.lt\t", asm_out_file);
4410 assemble_name_raw (asm_out_file, loop_end_lab);
4411 fputc ('\n', asm_out_file);
4412
4413 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4414 xops[0] = base;
4415 xops[1] = probe_offset_value_rtx;
4416 output_asm_insn ("sub\t%0, %0, %1", xops);
4417
4418 /* Probe at BASE. */
4419 xops[1] = const0_rtx;
4420 output_asm_insn ("str\txzr, [%0, %1]", xops);
4421
4422 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4423 xops[0] = adjustment;
4424 xops[1] = probe_offset_value_rtx;
4425 output_asm_insn ("sub\t%0, %0, %1", xops);
4426
4427 /* Branch to start if still more bytes to allocate. */
4428 fputs ("\tb\t", asm_out_file);
4429 assemble_name_raw (asm_out_file, loop_start_lab);
4430 fputc ('\n', asm_out_file);
4431
4432 /* Not enough adjustment left to need a probe; exit the loop. */
4433 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4434
4435 /* BASE = BASE - ADJUSTMENT. */
4436 xops[0] = base;
4437 xops[1] = adjustment;
4438 output_asm_insn ("sub\t%0, %0, %1", xops);
4439 return "";
4440 }
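
/* Roughly, with BASE in x10, ADJUSTMENT in x11 and a clamped residual
   guard of 2048 bytes (all hypothetical values), the code above emits:

       .SVLPSPL0:
               cmp     x11, 2048
               b.lt    .SVLPEND0
               sub     x10, x10, 2048
               str     xzr, [x10, 0]
               sub     x11, x11, 2048
               b       .SVLPSPL0
       .SVLPEND0:
               sub     x10, x10, x11

   i.e. drop and probe in guard-sized steps until less than one step
   remains, then drop by the remainder without a probe. */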
4441
4442 /* Determine whether a frame chain needs to be generated. */
4443 static bool
4444 aarch64_needs_frame_chain (void)
4445 {
4446 /* Force a frame chain for EH returns so the return address is at FP+8. */
4447 if (frame_pointer_needed || crtl->calls_eh_return)
4448 return true;
4449
4450 /* A leaf function cannot have calls or write LR. */
4451 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4452
4453 /* Don't use a frame chain in leaf functions if leaf frame pointers
4454 are disabled. */
4455 if (flag_omit_leaf_frame_pointer && is_leaf)
4456 return false;
4457
4458 return aarch64_use_frame_pointer;
4459 }
4460
4461 /* Mark the registers that need to be saved by the callee and calculate
4462 the size of the callee-saved registers area and frame record (both FP
4463 and LR may be omitted). */
4464 static void
4465 aarch64_layout_frame (void)
4466 {
4467 HOST_WIDE_INT offset = 0;
4468 int regno, last_fp_reg = INVALID_REGNUM;
4469 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4470
4471 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4472
4473 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4474 the mid-end is doing. */
4475 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4476
4477 #define SLOT_NOT_REQUIRED (-2)
4478 #define SLOT_REQUIRED (-1)
4479
4480 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4481 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4482
4483 /* If this is a non-leaf simd function with calls we assume that
4484 at least one of those calls is to a non-simd function and thus
4485 we must save V8 to V23 in the prologue. */
4486
4487 if (simd_function && !crtl->is_leaf)
4488 {
4489 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4490 if (FP_SIMD_SAVED_REGNUM_P (regno))
4491 df_set_regs_ever_live (regno, true);
4492 }
4493
4494 /* First mark all the registers that really need to be saved... */
4495 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4496 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4497
4498 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4499 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4500
4501 /* ... that includes the eh data registers (if needed)... */
4502 if (crtl->calls_eh_return)
4503 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4504 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4505 = SLOT_REQUIRED;
4506
4507 /* ... and any callee saved register that dataflow says is live. */
4508 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4509 if (df_regs_ever_live_p (regno)
4510 && (regno == R30_REGNUM
4511 || !call_used_regs[regno]))
4512 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4513
4514 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4515 if (df_regs_ever_live_p (regno)
4516 && (!call_used_regs[regno]
4517 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4518 {
4519 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4520 last_fp_reg = regno;
4521 }
4522
4523 if (cfun->machine->frame.emit_frame_chain)
4524 {
4525 /* FP and LR are placed in the linkage record. */
4526 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4527 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4528 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4529 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4530 offset = 2 * UNITS_PER_WORD;
4531 }
4532
4533 /* With stack-clash, LR must be saved in non-leaf functions. */
4534 gcc_assert (crtl->is_leaf
4535 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4536 != SLOT_NOT_REQUIRED));
4537
4538 /* Now assign stack slots for them. */
4539 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4540 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4541 {
4542 cfun->machine->frame.reg_offset[regno] = offset;
4543 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4544 cfun->machine->frame.wb_candidate1 = regno;
4545 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4546 cfun->machine->frame.wb_candidate2 = regno;
4547 offset += UNITS_PER_WORD;
4548 }
4549
4550 HOST_WIDE_INT max_int_offset = offset;
4551 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4552 bool has_align_gap = offset != max_int_offset;
4553
4554 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4555 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4556 {
4557 /* If there is an alignment gap between integer and fp callee-saves,
4558 allocate the last fp register to it if possible. */
4559 if (regno == last_fp_reg
4560 && has_align_gap
4561 && !simd_function
4562 && (offset & 8) == 0)
4563 {
4564 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4565 break;
4566 }
4567
4568 cfun->machine->frame.reg_offset[regno] = offset;
4569 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4570 cfun->machine->frame.wb_candidate1 = regno;
4571 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4572 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4573 cfun->machine->frame.wb_candidate2 = regno;
4574 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4575 }
4576
4577 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4578
4579 cfun->machine->frame.saved_regs_size = offset;
4580
4581 HOST_WIDE_INT varargs_and_saved_regs_size
4582 = offset + cfun->machine->frame.saved_varargs_size;
4583
4584 cfun->machine->frame.hard_fp_offset
4585 = aligned_upper_bound (varargs_and_saved_regs_size
4586 + get_frame_size (),
4587 STACK_BOUNDARY / BITS_PER_UNIT);
4588
4589 /* Both these values are already aligned. */
4590 gcc_assert (multiple_p (crtl->outgoing_args_size,
4591 STACK_BOUNDARY / BITS_PER_UNIT));
4592 cfun->machine->frame.frame_size
4593 = (cfun->machine->frame.hard_fp_offset
4594 + crtl->outgoing_args_size);
4595
4596 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4597
4598 cfun->machine->frame.initial_adjust = 0;
4599 cfun->machine->frame.final_adjust = 0;
4600 cfun->machine->frame.callee_adjust = 0;
4601 cfun->machine->frame.callee_offset = 0;
4602
4603 HOST_WIDE_INT max_push_offset = 0;
4604 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4605 max_push_offset = 512;
4606 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4607 max_push_offset = 256;
4608
4609 HOST_WIDE_INT const_size, const_fp_offset;
4610 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4611 && const_size < max_push_offset
4612 && known_eq (crtl->outgoing_args_size, 0))
4613 {
4614 /* Simple, small frame with no outgoing arguments:
4615 stp reg1, reg2, [sp, -frame_size]!
4616 stp reg3, reg4, [sp, 16] */
4617 cfun->machine->frame.callee_adjust = const_size;
4618 }
4619 else if (known_lt (crtl->outgoing_args_size
4620 + cfun->machine->frame.saved_regs_size, 512)
4621 && !(cfun->calls_alloca
4622 && known_lt (cfun->machine->frame.hard_fp_offset,
4623 max_push_offset)))
4624 {
4625 /* Frame with small outgoing arguments:
4626 sub sp, sp, frame_size
4627 stp reg1, reg2, [sp, outgoing_args_size]
4628 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4629 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4630 cfun->machine->frame.callee_offset
4631 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4632 }
4633 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4634 && const_fp_offset < max_push_offset)
4635 {
4636 /* Frame with large outgoing arguments but a small local area:
4637 stp reg1, reg2, [sp, -hard_fp_offset]!
4638 stp reg3, reg4, [sp, 16]
4639 sub sp, sp, outgoing_args_size */
4640 cfun->machine->frame.callee_adjust = const_fp_offset;
4641 cfun->machine->frame.final_adjust
4642 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4643 }
4644 else
4645 {
4646 /* Frame with large local area and outgoing arguments using frame pointer:
4647 sub sp, sp, hard_fp_offset
4648 stp x29, x30, [sp, 0]
4649 add x29, sp, 0
4650 stp reg3, reg4, [sp, 16]
4651 sub sp, sp, outgoing_args_size */
4652 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4653 cfun->machine->frame.final_adjust
4654 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4655 }
4656
4657 cfun->machine->frame.laid_out = true;
4658 }
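
/* Worked example (hypothetical function): a non-SVE function that needs
   a frame chain, saves only x29/x30, has 32 bytes of locals and no
   outgoing arguments ends up with saved_regs_size == 16,
   hard_fp_offset == 48 and frame_size == 48, so the first case above
   applies and callee_adjust == 48, giving a prologue that starts with
       stp x29, x30, [sp, -48]! */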
4659
4660 /* Return true if the register REGNO is saved on entry to
4661 the current function. */
4662
4663 static bool
4664 aarch64_register_saved_on_entry (int regno)
4665 {
4666 return cfun->machine->frame.reg_offset[regno] >= 0;
4667 }
4668
4669 /* Return the next register from REGNO up to LIMIT that the callee
4670 needs to save. */
4671
4672 static unsigned
4673 aarch64_next_callee_save (unsigned regno, unsigned limit)
4674 {
4675 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4676 regno++;
4677 return regno;
4678 }
4679
4680 /* Push the register number REGNO of mode MODE to the stack with write-back
4681 adjusting the stack by ADJUSTMENT. */
4682
4683 static void
4684 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4685 HOST_WIDE_INT adjustment)
4686 {
4687 rtx base_rtx = stack_pointer_rtx;
4688 rtx insn, reg, mem;
4689
4690 reg = gen_rtx_REG (mode, regno);
4691 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4692 plus_constant (Pmode, base_rtx, -adjustment));
4693 mem = gen_frame_mem (mode, mem);
4694
4695 insn = emit_move_insn (mem, reg);
4696 RTX_FRAME_RELATED_P (insn) = 1;
4697 }
4698
4699 /* Generate and return an instruction to store the pair of registers
4700 REG and REG2 of mode MODE to location BASE with write-back adjusting
4701 the stack location BASE by ADJUSTMENT. */
4702
4703 static rtx
4704 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4705 HOST_WIDE_INT adjustment)
4706 {
4707 switch (mode)
4708 {
4709 case E_DImode:
4710 return gen_storewb_pairdi_di (base, base, reg, reg2,
4711 GEN_INT (-adjustment),
4712 GEN_INT (UNITS_PER_WORD - adjustment));
4713 case E_DFmode:
4714 return gen_storewb_pairdf_di (base, base, reg, reg2,
4715 GEN_INT (-adjustment),
4716 GEN_INT (UNITS_PER_WORD - adjustment));
4717 case E_TFmode:
4718 return gen_storewb_pairtf_di (base, base, reg, reg2,
4719 GEN_INT (-adjustment),
4720 GEN_INT (UNITS_PER_VREG - adjustment));
4721 default:
4722 gcc_unreachable ();
4723 }
4724 }
4725
4726 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4727 stack pointer by ADJUSTMENT. */
4728
4729 static void
4730 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4731 {
4732 rtx_insn *insn;
4733 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4734
4735 if (regno2 == INVALID_REGNUM)
4736 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4737
4738 rtx reg1 = gen_rtx_REG (mode, regno1);
4739 rtx reg2 = gen_rtx_REG (mode, regno2);
4740
4741 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4742 reg2, adjustment));
4743 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4744 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4745 RTX_FRAME_RELATED_P (insn) = 1;
4746 }
4747
4748 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4749 adjusting it by ADJUSTMENT afterwards. */
4750
4751 static rtx
4752 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4753 HOST_WIDE_INT adjustment)
4754 {
4755 switch (mode)
4756 {
4757 case E_DImode:
4758 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4759 GEN_INT (UNITS_PER_WORD));
4760 case E_DFmode:
4761 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4762 GEN_INT (UNITS_PER_WORD));
4763 case E_TFmode:
4764 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4765 GEN_INT (UNITS_PER_VREG));
4766 default:
4767 gcc_unreachable ();
4768 }
4769 }
4770
4771 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4772 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4773 into CFI_OPS. */
4774
4775 static void
4776 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4777 rtx *cfi_ops)
4778 {
4779 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4780 rtx reg1 = gen_rtx_REG (mode, regno1);
4781
4782 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4783
4784 if (regno2 == INVALID_REGNUM)
4785 {
4786 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4787 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4788 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4789 }
4790 else
4791 {
4792 rtx reg2 = gen_rtx_REG (mode, regno2);
4793 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4794 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4795 reg2, adjustment));
4796 }
4797 }
4798
4799 /* Generate and return a store pair instruction of mode MODE to store
4800 register REG1 to MEM1 and register REG2 to MEM2. */
4801
4802 static rtx
4803 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4804 rtx reg2)
4805 {
4806 switch (mode)
4807 {
4808 case E_DImode:
4809 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4810
4811 case E_DFmode:
4812 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4813
4814 case E_TFmode:
4815 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4816
4817 default:
4818 gcc_unreachable ();
4819 }
4820 }
4821
4822 /* Generate and return a load pair instruction of mode MODE to load register
4823 REG1 from MEM1 and register REG2 from MEM2. */
4824
4825 static rtx
4826 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4827 rtx mem2)
4828 {
4829 switch (mode)
4830 {
4831 case E_DImode:
4832 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4833
4834 case E_DFmode:
4835 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4836
4837 case E_TFmode:
4838 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4839
4840 default:
4841 gcc_unreachable ();
4842 }
4843 }
4844
4845 /* Return TRUE if return address signing should be enabled for the current
4846 function, otherwise return FALSE. */
4847
4848 bool
4849 aarch64_return_address_signing_enabled (void)
4850 {
4851 /* This function should only be called after the frame is laid out. */
4852 gcc_assert (cfun->machine->frame.laid_out);
4853
4854 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4855 if its LR is pushed onto the stack. */
4856 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4857 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4858 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4859 }
4860
4861 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4862 bool
4863 aarch64_bti_enabled (void)
4864 {
4865 return (aarch64_enable_bti == 1);
4866 }
4867
4868 /* Emit code to save the callee-saved registers from register number START
4869 to LIMIT to the stack at the location starting at offset START_OFFSET,
4870 skipping any write-back candidates if SKIP_WB is true. */
4871
4872 static void
4873 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4874 unsigned start, unsigned limit, bool skip_wb)
4875 {
4876 rtx_insn *insn;
4877 unsigned regno;
4878 unsigned regno2;
4879
4880 for (regno = aarch64_next_callee_save (start, limit);
4881 regno <= limit;
4882 regno = aarch64_next_callee_save (regno + 1, limit))
4883 {
4884 rtx reg, mem;
4885 poly_int64 offset;
4886 int offset_diff;
4887
4888 if (skip_wb
4889 && (regno == cfun->machine->frame.wb_candidate1
4890 || regno == cfun->machine->frame.wb_candidate2))
4891 continue;
4892
4893 if (cfun->machine->reg_is_wrapped_separately[regno])
4894 continue;
4895
4896 reg = gen_rtx_REG (mode, regno);
4897 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4898 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4899 offset));
4900
4901 regno2 = aarch64_next_callee_save (regno + 1, limit);
4902 offset_diff = cfun->machine->frame.reg_offset[regno2]
4903 - cfun->machine->frame.reg_offset[regno];
4904
4905 if (regno2 <= limit
4906 && !cfun->machine->reg_is_wrapped_separately[regno2]
4907 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4908 {
4909 rtx reg2 = gen_rtx_REG (mode, regno2);
4910 rtx mem2;
4911
4912 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4913 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4914 offset));
4915 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4916 reg2));
4917
4918 /* The first part of a frame-related parallel insn is
4919 always assumed to be relevant to the frame
4920 calculations; subsequent parts are only
4921 frame-related if explicitly marked. */
4922 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4923 regno = regno2;
4924 }
4925 else
4926 insn = emit_move_insn (mem, reg);
4927
4928 RTX_FRAME_RELATED_P (insn) = 1;
4929 }
4930 }
4931
4932 /* Emit code to restore the callee-saved registers of mode MODE from register
4933 number START up to and including LIMIT. Restore from the stack offset
4934 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4935 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4936
4937 static void
4938 aarch64_restore_callee_saves (machine_mode mode,
4939 poly_int64 start_offset, unsigned start,
4940 unsigned limit, bool skip_wb, rtx *cfi_ops)
4941 {
4942 rtx base_rtx = stack_pointer_rtx;
4943 unsigned regno;
4944 unsigned regno2;
4945 poly_int64 offset;
4946
4947 for (regno = aarch64_next_callee_save (start, limit);
4948 regno <= limit;
4949 regno = aarch64_next_callee_save (regno + 1, limit))
4950 {
4951 if (cfun->machine->reg_is_wrapped_separately[regno])
4952 continue;
4953
4954 rtx reg, mem;
4955 int offset_diff;
4956
4957 if (skip_wb
4958 && (regno == cfun->machine->frame.wb_candidate1
4959 || regno == cfun->machine->frame.wb_candidate2))
4960 continue;
4961
4962 reg = gen_rtx_REG (mode, regno);
4963 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4964 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4965
4966 regno2 = aarch64_next_callee_save (regno + 1, limit);
4967 offset_diff = cfun->machine->frame.reg_offset[regno2]
4968 - cfun->machine->frame.reg_offset[regno];
4969
4970 if (regno2 <= limit
4971 && !cfun->machine->reg_is_wrapped_separately[regno2]
4972 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4973 {
4974 rtx reg2 = gen_rtx_REG (mode, regno2);
4975 rtx mem2;
4976
4977 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4978 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4979 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4980
4981 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4982 regno = regno2;
4983 }
4984 else
4985 emit_move_insn (reg, mem);
4986 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4987 }
4988 }
4989
4990 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4991 of MODE. */
4992
4993 static inline bool
4994 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4995 {
4996 HOST_WIDE_INT multiple;
4997 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4998 && IN_RANGE (multiple, -8, 7));
4999 }
5000
5001 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5002 of MODE. */
5003
5004 static inline bool
5005 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5006 {
5007 HOST_WIDE_INT multiple;
5008 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5009 && IN_RANGE (multiple, 0, 63));
5010 }
5011
5012 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5013 of MODE. */
5014
5015 bool
5016 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5017 {
5018 HOST_WIDE_INT multiple;
5019 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5020 && IN_RANGE (multiple, -64, 63));
5021 }
5022
5023 /* Return true if OFFSET is a signed 9-bit value. */
5024
5025 bool
5026 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5027 poly_int64 offset)
5028 {
5029 HOST_WIDE_INT const_offset;
5030 return (offset.is_constant (&const_offset)
5031 && IN_RANGE (const_offset, -256, 255));
5032 }
5033
5034 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5035 of MODE. */
5036
5037 static inline bool
5038 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5039 {
5040 HOST_WIDE_INT multiple;
5041 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5042 && IN_RANGE (multiple, -256, 255));
5043 }
5044
5045 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5046 of MODE. */
5047
5048 static inline bool
5049 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5050 {
5051 HOST_WIDE_INT multiple;
5052 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5053 && IN_RANGE (multiple, 0, 4095));
5054 }
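
/* For example, with DImode (8-byte) accesses the predicates above accept
   byte offsets of -64..56 for the signed 4-bit form, 0..504 for the
   unsigned 6-bit form, -512..504 for the signed 7-bit form, -2048..2040
   for the signed 9-bit scaled form and 0..32760 for the unsigned 12-bit
   form, all in multiples of 8; the unscaled 9-bit form accepts any byte
   offset in [-256, 255]. */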
5055
5056 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5057
5058 static sbitmap
5059 aarch64_get_separate_components (void)
5060 {
5061 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5062 bitmap_clear (components);
5063
5064 /* The registers we need saved to the frame. */
5065 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5066 if (aarch64_register_saved_on_entry (regno))
5067 {
5068 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5069 if (!frame_pointer_needed)
5070 offset += cfun->machine->frame.frame_size
5071 - cfun->machine->frame.hard_fp_offset;
5072 /* Check that we can access the stack slot of the register with one
5073 direct load with no adjustments needed. */
5074 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5075 bitmap_set_bit (components, regno);
5076 }
5077
5078 /* Don't mess with the hard frame pointer. */
5079 if (frame_pointer_needed)
5080 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5081
5082 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5083 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5084 /* If registers have been chosen to be stored/restored with
5085 writeback don't interfere with them to avoid having to output explicit
5086 stack adjustment instructions. */
5087 if (reg2 != INVALID_REGNUM)
5088 bitmap_clear_bit (components, reg2);
5089 if (reg1 != INVALID_REGNUM)
5090 bitmap_clear_bit (components, reg1);
5091
5092 bitmap_clear_bit (components, LR_REGNUM);
5093 bitmap_clear_bit (components, SP_REGNUM);
5094
5095 return components;
5096 }
5097
5098 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5099
5100 static sbitmap
5101 aarch64_components_for_bb (basic_block bb)
5102 {
5103 bitmap in = DF_LIVE_IN (bb);
5104 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5105 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5106 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5107
5108 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5109 bitmap_clear (components);
5110
5111 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5112 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5113 if ((!call_used_regs[regno]
5114 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5115 && (bitmap_bit_p (in, regno)
5116 || bitmap_bit_p (gen, regno)
5117 || bitmap_bit_p (kill, regno)))
5118 {
5119 unsigned regno2, offset, offset2;
5120 bitmap_set_bit (components, regno);
5121
5122 /* If there is a callee-save at an adjacent offset, add it too,
5123 to increase the use of LDP/STP. */
5124 offset = cfun->machine->frame.reg_offset[regno];
5125 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5126
5127 if (regno2 <= LAST_SAVED_REGNUM)
5128 {
5129 offset2 = cfun->machine->frame.reg_offset[regno2];
5130 if ((offset & ~8) == (offset2 & ~8))
5131 bitmap_set_bit (components, regno2);
5132 }
5133 }
5134
5135 return components;
5136 }
5137
5138 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5139 Nothing to do for aarch64. */
5140
5141 static void
5142 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5143 {
5144 }
5145
5146 /* Return the next set bit in BMP from START onwards. Return the total number
5147 of bits in BMP if no set bit is found at or after START. */
5148
5149 static unsigned int
5150 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5151 {
5152 unsigned int nbits = SBITMAP_SIZE (bmp);
5153 if (start == nbits)
5154 return start;
5155
5156 gcc_assert (start < nbits);
5157 for (unsigned int i = start; i < nbits; i++)
5158 if (bitmap_bit_p (bmp, i))
5159 return i;
5160
5161 return nbits;
5162 }
5163
5164 /* Do the work for aarch64_emit_prologue_components and
5165 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5166 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5167 for these components or the epilogue sequence. That is, it determines
5168 whether we should emit stores or loads and what kind of CFA notes to attach
5169 to the insns. Otherwise the logic for the two sequences is very
5170 similar. */
5171
5172 static void
5173 aarch64_process_components (sbitmap components, bool prologue_p)
5174 {
5175 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5176 ? HARD_FRAME_POINTER_REGNUM
5177 : STACK_POINTER_REGNUM);
5178
5179 unsigned last_regno = SBITMAP_SIZE (components);
5180 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5181 rtx_insn *insn = NULL;
5182
5183 while (regno != last_regno)
5184 {
5185 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5186 so DFmode for the vector registers is enough. For simd functions
5187 we want to save the low 128 bits. */
5188 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5189
5190 rtx reg = gen_rtx_REG (mode, regno);
5191 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5192 if (!frame_pointer_needed)
5193 offset += cfun->machine->frame.frame_size
5194 - cfun->machine->frame.hard_fp_offset;
5195 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5196 rtx mem = gen_frame_mem (mode, addr);
5197
5198 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5199 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5200 /* No more registers to handle after REGNO.
5201 Emit a single save/restore and exit. */
5202 if (regno2 == last_regno)
5203 {
5204 insn = emit_insn (set);
5205 RTX_FRAME_RELATED_P (insn) = 1;
5206 if (prologue_p)
5207 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5208 else
5209 add_reg_note (insn, REG_CFA_RESTORE, reg);
5210 break;
5211 }
5212
5213 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5214 /* The next register is not of the same class or its offset is not
5215 mergeable with the current one into a pair. */
5216 if (!satisfies_constraint_Ump (mem)
5217 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5218 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5219 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5220 GET_MODE_SIZE (mode)))
5221 {
5222 insn = emit_insn (set);
5223 RTX_FRAME_RELATED_P (insn) = 1;
5224 if (prologue_p)
5225 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5226 else
5227 add_reg_note (insn, REG_CFA_RESTORE, reg);
5228
5229 regno = regno2;
5230 continue;
5231 }
5232
5233 /* REGNO2 can be saved/restored in a pair with REGNO. */
5234 rtx reg2 = gen_rtx_REG (mode, regno2);
5235 if (!frame_pointer_needed)
5236 offset2 += cfun->machine->frame.frame_size
5237 - cfun->machine->frame.hard_fp_offset;
5238 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5239 rtx mem2 = gen_frame_mem (mode, addr2);
5240 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5241 : gen_rtx_SET (reg2, mem2);
5242
5243 if (prologue_p)
5244 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5245 else
5246 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5247
5248 RTX_FRAME_RELATED_P (insn) = 1;
5249 if (prologue_p)
5250 {
5251 add_reg_note (insn, REG_CFA_OFFSET, set);
5252 add_reg_note (insn, REG_CFA_OFFSET, set2);
5253 }
5254 else
5255 {
5256 add_reg_note (insn, REG_CFA_RESTORE, reg);
5257 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5258 }
5259
5260 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5261 }
5262 }
5263
5264 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5265
5266 static void
5267 aarch64_emit_prologue_components (sbitmap components)
5268 {
5269 aarch64_process_components (components, true);
5270 }
5271
5272 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5273
5274 static void
5275 aarch64_emit_epilogue_components (sbitmap components)
5276 {
5277 aarch64_process_components (components, false);
5278 }
5279
5280 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5281
5282 static void
5283 aarch64_set_handled_components (sbitmap components)
5284 {
5285 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5286 if (bitmap_bit_p (components, regno))
5287 cfun->machine->reg_is_wrapped_separately[regno] = true;
5288 }
5289
5290 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
5291 determine the probe offset for alloca. */
5292
5293 static HOST_WIDE_INT
5294 aarch64_stack_clash_protection_alloca_probe_range (void)
5295 {
5296 return STACK_CLASH_CALLER_GUARD;
5297 }
5298
5299
5300 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5301 registers. If POLY_SIZE is not large enough to require a probe, this function
5302 will only adjust the stack. When allocating the stack space,
5303 FRAME_RELATED_P is used to indicate whether the allocation is frame related.
5304 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5305 arguments. If we are, then we ensure that any allocation larger than the
5306 ABI-defined buffer is probed, so that the invariant of having a 1KB buffer is
5307 maintained.
5308
5309 We emit barriers after each stack adjustment to prevent optimizations from
5310 breaking the invariant that we never drop the stack more than a page. This
5311 invariant is needed to make it easier to correctly handle asynchronous
5312 events, e.g. if we were to allow the stack to be dropped by more than a page
5313 with multiple probes still outstanding, and we take a signal somewhere in between,
5314 then the signal handler doesn't know the state of the stack and can make no
5315 assumptions about which pages have been probed. */
5316
5317 static void
5318 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5319 poly_int64 poly_size,
5320 bool frame_related_p,
5321 bool final_adjustment_p)
5322 {
5323 HOST_WIDE_INT guard_size
5324 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5325 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5326 /* When doing the final adjustment for the outgoing argument size we can't
5327 assume that LR was saved at position 0. So subtract its offset from the
5328 ABI safe buffer so that we don't accidentally allow an adjustment that
5329 would result in an allocation larger than the ABI buffer without
5330 probing. */
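/* For example, with the default 1KB caller guard and LR saved at, say,
   offset 16, the threshold computed below would be 1024 - 16 = 1008 bytes:
   any final adjustment of 1008 bytes or more must then be probed.  */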
5331 HOST_WIDE_INT min_probe_threshold
5332 = final_adjustment_p
5333 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5334 : guard_size - guard_used_by_caller;
5335
5336 poly_int64 frame_size = cfun->machine->frame.frame_size;
5337
5338 /* We should always have a positive probe threshold. */
5339 gcc_assert (min_probe_threshold > 0);
5340
5341 if (flag_stack_clash_protection && !final_adjustment_p)
5342 {
5343 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5344 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5345
5346 if (known_eq (frame_size, 0))
5347 {
5348 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5349 }
5350 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5351 && known_lt (final_adjust, guard_used_by_caller))
5352 {
5353 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5354 }
5355 }
5356
5357 /* If SIZE is not large enough to require probing, just adjust the stack and
5358 exit. */
5359 if (known_lt (poly_size, min_probe_threshold)
5360 || !flag_stack_clash_protection)
5361 {
5362 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5363 return;
5364 }
5365
5366 HOST_WIDE_INT size;
5367 /* Handle the SVE non-constant case first. */
5368 if (!poly_size.is_constant (&size))
5369 {
5370 if (dump_file)
5371 {
5372 fprintf (dump_file, "Stack clash SVE prologue: ");
5373 print_dec (poly_size, dump_file);
5374 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5375 }
5376
5377 /* First calculate the amount of bytes we're actually spilling. */
5378 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5379 poly_size, temp1, temp2, false, true);
5380
5381 rtx_insn *insn = get_last_insn ();
5382
5383 if (frame_related_p)
5384 {
5385 /* This is done to provide unwinding information for the stack
5386 adjustments we're about to do. However, to prevent the optimizers
5387 from removing the R11 move and leaving the CFA note (which would be
5388 very wrong) we tie the old and new stack pointer together.
5389 The tie will expand to nothing but the optimizers will not touch
5390 the instruction. */
5391 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5392 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5393 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5394
5395 /* We want the CFA independent of the stack pointer for the
5396 duration of the loop. */
5397 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5398 RTX_FRAME_RELATED_P (insn) = 1;
5399 }
5400
5401 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5402 rtx guard_const = gen_int_mode (guard_size, Pmode);
5403
5404 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5405 stack_pointer_rtx, temp1,
5406 probe_const, guard_const));
5407
5408 /* Now reset the CFA register if needed. */
5409 if (frame_related_p)
5410 {
5411 add_reg_note (insn, REG_CFA_DEF_CFA,
5412 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5413 gen_int_mode (poly_size, Pmode)));
5414 RTX_FRAME_RELATED_P (insn) = 1;
5415 }
5416
5417 return;
5418 }
5419
5420 if (dump_file)
5421 fprintf (dump_file,
5422 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5423 " bytes, probing will be required.\n", size);
5424
5425 /* Round size down to a multiple of guard_size, and calculate the
5426 residual as the difference between the original size and the rounded
5427 size. */
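/* For example, with a 64KB guard a frame of 150000 bytes gives
   rounded_size = 131072 (two fully probed pages) and residual = 18928.  */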
5428 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5429 HOST_WIDE_INT residual = size - rounded_size;
5430
5431 /* We can handle a small number of allocations/probes inline. Otherwise
5432 punt to a loop. */
5433 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5434 {
5435 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5436 {
5437 aarch64_sub_sp (NULL, temp2, guard_size, true);
5438 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5439 guard_used_by_caller));
5440 emit_insn (gen_blockage ());
5441 }
5442 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5443 }
5444 else
5445 {
5446 /* Compute the ending address. */
5447 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5448 temp1, NULL, false, true);
5449 rtx_insn *insn = get_last_insn ();
5450
5451 /* For the initial allocation, we don't have a frame pointer
5452 set up, so we always need CFI notes. If we're doing the
5453 final allocation, then we may have a frame pointer, in which
5454 case it is the CFA, otherwise we need CFI notes.
5455
5456 We can determine which allocation we are doing by looking at
5457 the value of FRAME_RELATED_P since the final allocations are not
5458 frame related. */
5459 if (frame_related_p)
5460 {
5461 /* We want the CFA independent of the stack pointer for the
5462 duration of the loop. */
5463 add_reg_note (insn, REG_CFA_DEF_CFA,
5464 plus_constant (Pmode, temp1, rounded_size));
5465 RTX_FRAME_RELATED_P (insn) = 1;
5466 }
5467
5468 /* This allocates and probes the stack. Note that this re-uses some of
5469 the existing Ada stack protection code. However, we are guaranteed not
5470 to enter the non-loop or residual branches of that code.
5471
5472 The non-loop part won't be entered because if our allocation amount
5473 doesn't require a loop, the case above would handle it.
5474
5475 The residual amount won't be entered because TEMP1 is a multiple of
5476 the allocation size. The residual will always be 0. As such, the only
5477 part we are actually using from that code is the loop setup. The
5478 actual probing is done in aarch64_output_probe_stack_range. */
5479 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5480 stack_pointer_rtx, temp1));
5481
5482 /* Now reset the CFA register if needed. */
5483 if (frame_related_p)
5484 {
5485 add_reg_note (insn, REG_CFA_DEF_CFA,
5486 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5487 RTX_FRAME_RELATED_P (insn) = 1;
5488 }
5489
5490 emit_insn (gen_blockage ());
5491 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5492 }
5493
5494 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5495 be probed. This maintains the requirement that each page is probed at
5496 least once. For initial probing we probe only if the allocation is
5497 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5498 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5499 GUARD_SIZE. This ensures that for any allocation large enough to
5500 trigger a probe here, we'll have at least one; and if an allocation is not
5501 large enough for this code to emit anything for it, the page will already
5502 have been probed by the saving of FP/LR, either by this function or by any
5503 callees. If we don't have any callees then we won't have more stack
5504 adjustments and so are still safe. */
5505 if (residual)
5506 {
5507 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5508 /* If we're doing final adjustments, and we've done any full page
5509 allocations then any residual needs to be probed. */
5510 if (final_adjustment_p && rounded_size != 0)
5511 min_probe_threshold = 0;
5512 /* If doing a small final adjustment, we always probe at offset 0.
5513 This is done to avoid issues when LR is not at position 0 or when
5514 the final adjustment is smaller than the probing offset. */
5515 else if (final_adjustment_p && rounded_size == 0)
5516 residual_probe_offset = 0;
5517
5518 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5519 if (residual >= min_probe_threshold)
5520 {
5521 if (dump_file)
5522 fprintf (dump_file,
5523 "Stack clash AArch64 prologue residuals: "
5524 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5525 "\n", residual);
5526
5527 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5528 residual_probe_offset));
5529 emit_insn (gen_blockage ());
5530 }
5531 }
5532 }
5533
5534 /* Return 1 if the register is used by the epilogue. We need to say the
5535 return register is used, but only after epilogue generation is complete.
5536 Note that in the case of sibcalls, the values "used by the epilogue" are
5537 considered live at the start of the called function.
5538
5539 For SIMD functions we need to return 1 for FP registers that are saved and
5540 restored by a function but are not zero in call_used_regs. If we do not do
5541 this, optimizations may remove the restore of the register. */
5542
5543 int
5544 aarch64_epilogue_uses (int regno)
5545 {
5546 if (epilogue_completed)
5547 {
5548 if (regno == LR_REGNUM)
5549 return 1;
5550 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5551 return 1;
5552 }
5553 return 0;
5554 }
5555
5556 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5557 is saved at BASE + OFFSET. */
5558
5559 static void
5560 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5561 rtx base, poly_int64 offset)
5562 {
5563 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5564 add_reg_note (insn, REG_CFA_EXPRESSION,
5565 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5566 }
5567
5568 /* AArch64 stack frames generated by this compiler look like:
5569
5570 +-------------------------------+
5571 | |
5572 | incoming stack arguments |
5573 | |
5574 +-------------------------------+
5575 | | <-- incoming stack pointer (aligned)
5576 | callee-allocated save area |
5577 | for register varargs |
5578 | |
5579 +-------------------------------+
5580 | local variables | <-- frame_pointer_rtx
5581 | |
5582 +-------------------------------+
5583 | padding | \
5584 +-------------------------------+ |
5585 | callee-saved registers | | frame.saved_regs_size
5586 +-------------------------------+ |
5587 | LR' | |
5588 +-------------------------------+ |
5589 | FP' | / <- hard_frame_pointer_rtx (aligned)
5590 +-------------------------------+
5591 | dynamic allocation |
5592 +-------------------------------+
5593 | padding |
5594 +-------------------------------+
5595 | outgoing stack arguments | <-- arg_pointer
5596 | |
5597 +-------------------------------+
5598 | | <-- stack_pointer_rtx (aligned)
5599
5600 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5601 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5602 unchanged.
5603
5604 By default for stack-clash we assume the guard is at least 64KB, but this
5605 value is configurable to either 4KB or 64KB. We also force the guard size to
5606 be the same as the probing interval and both values are kept in sync.
5607
5608 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5609 on the guard size) of stack space without probing.
5610
5611 When probing is needed, we emit a probe at the start of the prologue
5612 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5613
5614 We have to track how much space has been allocated, and the only stores
5615 to the stack we track as implicit probes are the FP/LR stores.
5616
5617 For outgoing arguments we probe if the size is larger than 1KB, such that
5618 the ABI specified buffer is maintained for the next callee.
5619
5620 The following registers are reserved during frame layout and should not be
5621 used for any other purpose:
5622
5623 - r11: Used by stack clash protection when SVE is enabled.
5624 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5625 - r14 and r15: Used for speculation tracking.
5626 - r16(IP0), r17(IP1): Used by indirect tailcalls.
5627 - r30(LR), r29(FP): Used by standard frame layout.
5628
5629 These registers must be avoided in frame layout related code unless the
5630 explicit intention is to interact with one of the features listed above. */
5631
5632 /* Generate the prologue instructions for entry into a function.
5633 Establish the stack frame by decreasing the stack pointer with a
5634 properly calculated size and, if necessary, create a frame record
5635 filled with the values of LR and previous frame pointer. The
5636 current FP is also set up if it is in use. */
5637
5638 void
5639 aarch64_expand_prologue (void)
5640 {
5641 poly_int64 frame_size = cfun->machine->frame.frame_size;
5642 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5643 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5644 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5645 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5646 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5647 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5648 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5649 rtx_insn *insn;
5650
5651 /* Sign return address for functions. */
5652 if (aarch64_return_address_signing_enabled ())
5653 {
5654 insn = emit_insn (gen_pacisp ());
5655 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5656 RTX_FRAME_RELATED_P (insn) = 1;
5657 }
5658
5659 if (flag_stack_usage_info)
5660 current_function_static_stack_size = constant_lower_bound (frame_size);
5661
5662 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5663 {
5664 if (crtl->is_leaf && !cfun->calls_alloca)
5665 {
5666 if (maybe_gt (frame_size, PROBE_INTERVAL)
5667 && maybe_gt (frame_size, get_stack_check_protect ()))
5668 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5669 (frame_size
5670 - get_stack_check_protect ()));
5671 }
5672 else if (maybe_gt (frame_size, 0))
5673 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5674 }
5675
5676 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5677 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5678
5679 /* In theory we should never have both an initial adjustment
5680 and a callee save adjustment. Verify that is the case since the
5681 code below does not handle it for -fstack-clash-protection. */
5682 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5683
5684 /* Will only probe if the initial adjustment is larger than the guard
5685 less the amount of the guard reserved for use by the caller's
5686 outgoing args. */
5687 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5688 true, false);
5689
5690 if (callee_adjust != 0)
5691 aarch64_push_regs (reg1, reg2, callee_adjust);
5692
5693 if (emit_frame_chain)
5694 {
5695 poly_int64 reg_offset = callee_adjust;
5696 if (callee_adjust == 0)
5697 {
5698 reg1 = R29_REGNUM;
5699 reg2 = R30_REGNUM;
5700 reg_offset = callee_offset;
5701 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5702 }
5703 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5704 stack_pointer_rtx, callee_offset,
5705 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5706 if (frame_pointer_needed && !frame_size.is_constant ())
5707 {
5708 /* Variable-sized frames need to describe the save slot
5709 address using DW_CFA_expression rather than DW_CFA_offset.
5710 This means that, without taking further action, the
5711 locations of the registers that we've already saved would
5712 remain based on the stack pointer even after we redefine
5713 the CFA based on the frame pointer. We therefore need new
5714 DW_CFA_expressions to re-express the save slots with addresses
5715 based on the frame pointer. */
5716 rtx_insn *insn = get_last_insn ();
5717 gcc_assert (RTX_FRAME_RELATED_P (insn));
5718
5719 /* Add an explicit CFA definition if this was previously
5720 implicit. */
5721 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5722 {
5723 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5724 callee_offset);
5725 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5726 gen_rtx_SET (hard_frame_pointer_rtx, src));
5727 }
5728
5729 /* Change the save slot expressions for the registers that
5730 we've already saved. */
5731 reg_offset -= callee_offset;
5732 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5733 reg_offset + UNITS_PER_WORD);
5734 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5735 reg_offset);
5736 }
5737 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5738 }
5739
5740 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5741 callee_adjust != 0 || emit_frame_chain);
5742 if (aarch64_simd_decl_p (cfun->decl))
5743 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5744 callee_adjust != 0 || emit_frame_chain);
5745 else
5746 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5747 callee_adjust != 0 || emit_frame_chain);
5748
5749 /* We may need to probe the final adjustment if it is larger than the guard
5750 that is assumed by the callee. */
5751 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5752 !frame_pointer_needed, true);
5753 }
5754
5755 /* Return TRUE if we can use a simple_return insn.
5756
5757 This function checks whether the callee-saved stack is empty, which
5758 means no restore actions are needed. The pro_and_epilogue pass will use
5759 this to check whether the shrink-wrapping optimization is feasible. */
5760
5761 bool
5762 aarch64_use_return_insn_p (void)
5763 {
5764 if (!reload_completed)
5765 return false;
5766
5767 if (crtl->profile)
5768 return false;
5769
5770 return known_eq (cfun->machine->frame.frame_size, 0);
5771 }
5772
5773 /* Return false for non-leaf SIMD functions in order to avoid
5774 shrink-wrapping them. Doing this will lose the necessary
5775 save/restore of FP registers. */
5776
5777 bool
5778 aarch64_use_simple_return_insn_p (void)
5779 {
5780 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5781 return false;
5782
5783 return true;
5784 }
5785
5786 /* Generate the epilogue instructions for returning from a function.
5787 This is almost exactly the reverse of the prologue sequence, except
5788 that we need to insert barriers to avoid scheduling loads that read
5789 from a deallocated stack, and we optimize the unwind records by
5790 emitting them all together if possible. */
5791 void
5792 aarch64_expand_epilogue (bool for_sibcall)
5793 {
5794 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5795 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5796 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5797 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5798 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5799 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5800 rtx cfi_ops = NULL;
5801 rtx_insn *insn;
5802 /* A stack clash protection prologue may not have left EP0_REGNUM or
5803 EP1_REGNUM in a usable state. The same is true for allocations
5804 with an SVE component, since we then need both temporary registers
5805 for each allocation. For stack clash we are in a usable state if
5806 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5807 HOST_WIDE_INT guard_size
5808 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5809 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5810
5811 /* We can re-use the registers when the allocation amount is smaller than
5812 guard_size - guard_used_by_caller because we won't be doing any probes
5813 then. In such situations the register should remain live with the correct
5814 value. */
5815 bool can_inherit_p = (initial_adjust.is_constant ()
5816 && final_adjust.is_constant ())
5817 && (!flag_stack_clash_protection
5818 || known_lt (initial_adjust,
5819 guard_size - guard_used_by_caller));
5820
5821 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
5822 bool need_barrier_p
5823 = maybe_ne (get_frame_size ()
5824 + cfun->machine->frame.saved_varargs_size, 0);
5825
5826 /* Emit a barrier to prevent loads from a deallocated stack. */
5827 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5828 || cfun->calls_alloca
5829 || crtl->calls_eh_return)
5830 {
5831 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5832 need_barrier_p = false;
5833 }
5834
5835 /* Restore the stack pointer from the frame pointer if it may not
5836 be the same as the stack pointer. */
5837 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5838 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5839 if (frame_pointer_needed
5840 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5841 /* If writeback is used when restoring callee-saves, the CFA
5842 is restored on the instruction doing the writeback. */
5843 aarch64_add_offset (Pmode, stack_pointer_rtx,
5844 hard_frame_pointer_rtx, -callee_offset,
5845 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5846 else
5847 /* The case where we need to re-use the register here is very rare, so
5848 avoid the complicated condition and just always emit a move if the
5849 immediate doesn't fit. */
5850 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5851
5852 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5853 callee_adjust != 0, &cfi_ops);
5854 if (aarch64_simd_decl_p (cfun->decl))
5855 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5856 callee_adjust != 0, &cfi_ops);
5857 else
5858 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5859 callee_adjust != 0, &cfi_ops);
5860
5861 if (need_barrier_p)
5862 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5863
5864 if (callee_adjust != 0)
5865 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5866
5867 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5868 {
5869 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5870 insn = get_last_insn ();
5871 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5872 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5873 RTX_FRAME_RELATED_P (insn) = 1;
5874 cfi_ops = NULL;
5875 }
5876
5877 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
5878 restrict the emit_move optimization to leaf functions. */
5879 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5880 (!can_inherit_p || !crtl->is_leaf
5881 || df_regs_ever_live_p (EP0_REGNUM)));
5882
5883 if (cfi_ops)
5884 {
5885 /* Emit delayed restores and reset the CFA to be SP. */
5886 insn = get_last_insn ();
5887 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5888 REG_NOTES (insn) = cfi_ops;
5889 RTX_FRAME_RELATED_P (insn) = 1;
5890 }
5891
5892 /* We prefer to emit the combined return/authenticate instruction RETAA,
5893 however there are three cases in which we must instead emit an explicit
5894 authentication instruction.
5895
5896 1) Sibcalls don't return in a normal way, so if we're about to call one
5897 we must authenticate.
5898
5899 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5900 generating code for !TARGET_ARMV8_3 we can't use it and must
5901 explicitly authenticate.
5902
5903 3) On an eh_return path we make extra stack adjustments to update the
5904 canonical frame address to be the exception handler's CFA. We want
5905 to authenticate using the CFA of the function which calls eh_return.
5906 */
5907 if (aarch64_return_address_signing_enabled ()
5908 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5909 {
5910 insn = emit_insn (gen_autisp ());
5911 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5912 RTX_FRAME_RELATED_P (insn) = 1;
5913 }
5914
5915 /* Stack adjustment for exception handler. */
5916 if (crtl->calls_eh_return)
5917 {
5918 /* We need to unwind the stack by the offset computed by
5919 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5920 to be SP; letting the CFA move during this adjustment
5921 is just as correct as retaining the CFA from the body
5922 of the function. Therefore, do nothing special. */
5923 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5924 }
5925
5926 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5927 if (!for_sibcall)
5928 emit_jump_insn (ret_rtx);
5929 }
5930
5931 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5932 normally or return to a previous frame after unwinding.
5933
5934 An EH return uses a single shared return sequence. The epilogue is
5935 exactly like a normal epilogue except that it has an extra input
5936 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5937 that must be applied after the frame has been destroyed. An extra label
5938 is inserted before the epilogue which initializes this register to zero,
5939 and this is the entry point for a normal return.
5940
5941 An actual EH return updates the return address, initializes the stack
5942 adjustment and jumps directly into the epilogue (bypassing the zeroing
5943 of the adjustment). Since the return address is typically saved on the
5944 stack when a function makes a call, the saved LR must be updated outside
5945 the epilogue.
5946
5947 This poses problems as the store is generated well before the epilogue,
5948 so the offset of LR is not known yet. Also optimizations will remove the
5949 store as it appears dead, even after the epilogue is generated (as the
5950 base or offset for loading LR is different in many cases).
5951
5952 To avoid these problems this implementation forces the frame pointer
5953 in eh_return functions so that the location of LR is fixed and known early.
5954 It also marks the store volatile, so no optimization is permitted to
5955 remove the store. */
5956 rtx
5957 aarch64_eh_return_handler_rtx (void)
5958 {
5959 rtx tmp = gen_frame_mem (Pmode,
5960 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5961
5962 /* Mark the store volatile, so no optimization is permitted to remove it. */
5963 MEM_VOLATILE_P (tmp) = true;
5964 return tmp;
5965 }
5966
5967 /* Output code to add DELTA to the first argument, and then jump
5968 to FUNCTION. Used for C++ multiple inheritance. */
5969 static void
5970 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5971 HOST_WIDE_INT delta,
5972 HOST_WIDE_INT vcall_offset,
5973 tree function)
5974 {
5975 /* The this pointer is always in x0. Note that this differs from
5976 Arm where the this pointer may be bumped to r1 if r0 is required
5977 to return a pointer to an aggregate. On AArch64 a result value
5978 pointer will be in x8. */
5979 int this_regno = R0_REGNUM;
5980 rtx this_rtx, temp0, temp1, addr, funexp;
5981 rtx_insn *insn;
5982
5983 reload_completed = 1;
5984 emit_note (NOTE_INSN_PROLOGUE_END);
5985
5986 this_rtx = gen_rtx_REG (Pmode, this_regno);
5987 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
5988 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
5989
5990 if (vcall_offset == 0)
5991 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5992 else
5993 {
5994 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5995
5996 addr = this_rtx;
5997 if (delta != 0)
5998 {
5999 if (delta >= -256 && delta < 256)
6000 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6001 plus_constant (Pmode, this_rtx, delta));
6002 else
6003 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6004 temp1, temp0, false);
6005 }
6006
6007 if (Pmode == ptr_mode)
6008 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6009 else
6010 aarch64_emit_move (temp0,
6011 gen_rtx_ZERO_EXTEND (Pmode,
6012 gen_rtx_MEM (ptr_mode, addr)));
6013
6014 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6015 addr = plus_constant (Pmode, temp0, vcall_offset);
6016 else
6017 {
6018 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6019 Pmode);
6020 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6021 }
6022
6023 if (Pmode == ptr_mode)
6024 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6025 else
6026 aarch64_emit_move (temp1,
6027 gen_rtx_SIGN_EXTEND (Pmode,
6028 gen_rtx_MEM (ptr_mode, addr)));
6029
6030 emit_insn (gen_add2_insn (this_rtx, temp1));
6031 }
6032
6033 /* Generate a tail call to the target function. */
6034 if (!TREE_USED (function))
6035 {
6036 assemble_external (function);
6037 TREE_USED (function) = 1;
6038 }
6039 funexp = XEXP (DECL_RTL (function), 0);
6040 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6041 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6042 SIBLING_CALL_P (insn) = 1;
6043
6044 insn = get_insns ();
6045 shorten_branches (insn);
6046 final_start_function (insn, file, 1);
6047 final (insn, file, 1);
6048 final_end_function ();
6049
6050 /* Stop pretending to be a post-reload pass. */
6051 reload_completed = 0;
6052 }
6053
6054 static bool
6055 aarch64_tls_referenced_p (rtx x)
6056 {
6057 if (!TARGET_HAVE_TLS)
6058 return false;
6059 subrtx_iterator::array_type array;
6060 FOR_EACH_SUBRTX (iter, array, x, ALL)
6061 {
6062 const_rtx x = *iter;
6063 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6064 return true;
6065 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6066 TLS offsets, not real symbol references. */
6067 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6068 iter.skip_subrtxes ();
6069 }
6070 return false;
6071 }
6072
6073
6074 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6075 a left shift of 0 or 12 bits. */
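/* For example, 0xabc (shift of 0) and 0xabc000 (shift of 12) are accepted,
   whereas 0xabc00 is not, since it would need a shift of 8.  */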
6076 bool
6077 aarch64_uimm12_shift (HOST_WIDE_INT val)
6078 {
6079 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6080 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6081 );
6082 }
6083
6084 /* Return the largest value no greater than VAL that will fit as a 12-bit
6085 unsigned immediate created with a left shift of 0 or 12. */
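/* For example, 0x123456 is clamped to 0x123000.  */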
6086 static HOST_WIDE_INT
6087 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6088 {
6089 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6090 handle correctly. */
6091 gcc_assert ((val & 0xffffff) == val);
6092
6093 if (((val & 0xfff) << 0) == val)
6094 return val;
6095
6096 return val & (0xfff << 12);
6097 }
6098
6099 /* Return true if val is an immediate that can be loaded into a
6100 register by a MOVZ instruction. */
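/* For example, 0x12340000 can be loaded with a single MOVZ (a 16-bit chunk
   shifted by 16), but 0x12345678 cannot, since its set bits span two 16-bit
   chunks.  */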
6101 static bool
6102 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6103 {
6104 if (GET_MODE_SIZE (mode) > 4)
6105 {
6106 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6107 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6108 return 1;
6109 }
6110 else
6111 {
6112 /* Ignore sign extension. */
6113 val &= (HOST_WIDE_INT) 0xffffffff;
6114 }
6115 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6116 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6117 }
6118
6119 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6120 64-bit (DImode) integer. */
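/* For example, an HImode value of 0x00ab is replicated to
   0x00ab00ab00ab00ab.  */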
6121
6122 static unsigned HOST_WIDE_INT
6123 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6124 {
6125 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6126 while (size < 64)
6127 {
6128 val &= (HOST_WIDE_INT_1U << size) - 1;
6129 val |= val << size;
6130 size *= 2;
6131 }
6132 return val;
6133 }
6134
6135 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6136
6137 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6138 {
6139 0x0000000100000001ull,
6140 0x0001000100010001ull,
6141 0x0101010101010101ull,
6142 0x1111111111111111ull,
6143 0x5555555555555555ull,
6144 };
6145
6146
6147 /* Return true if val is a valid bitmask immediate. */
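/* For example, 0xff00ff00ff00ff00 (a repeating 16-bit element containing a
   single run of ones) is a valid bitmask immediate, whereas
   0x1234567812345678 is not.  */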
6148
6149 bool
6150 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6151 {
6152 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6153 int bits;
6154
6155 /* Check for a single sequence of one bits and return quickly if so.
6156 The special cases of all ones and all zeroes return false. */
6157 val = aarch64_replicate_bitmask_imm (val_in, mode);
6158 tmp = val + (val & -val);
6159
6160 if (tmp == (tmp & -tmp))
6161 return (val + 1) > 1;
6162
6163 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6164 if (mode == SImode)
6165 val = (val << 32) | (val & 0xffffffff);
6166
6167 /* Invert if the immediate doesn't start with a zero bit - this means we
6168 only need to search for sequences of one bits. */
6169 if (val & 1)
6170 val = ~val;
6171
6172 /* Find the first set bit and set tmp to val with the first sequence of one
6173 bits removed. Return success if there is a single sequence of ones. */
6174 first_one = val & -val;
6175 tmp = val & (val + first_one);
6176
6177 if (tmp == 0)
6178 return true;
6179
6180 /* Find the next set bit and compute the difference in bit position. */
6181 next_one = tmp & -tmp;
6182 bits = clz_hwi (first_one) - clz_hwi (next_one);
6183 mask = val ^ tmp;
6184
6185 /* Check the bit position difference is a power of 2, and that the first
6186 sequence of one bits fits within 'bits' bits. */
6187 if ((mask >> bits) != 0 || bits != (bits & -bits))
6188 return false;
6189
6190 /* Check the sequence of one bits is repeated 64/bits times. */
6191 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6192 }
6193
6194 /* Create a mask of ones covering the lowest to highest bits set in VAL_IN.
6195 Assumed precondition: VAL_IN is not zero. */
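/* For example, VAL_IN = 0x00ff0f00 gives 0x00ffff00.  */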
6196
6197 unsigned HOST_WIDE_INT
6198 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6199 {
6200 int lowest_bit_set = ctz_hwi (val_in);
6201 int highest_bit_set = floor_log2 (val_in);
6202 gcc_assert (val_in != 0);
6203
6204 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6205 (HOST_WIDE_INT_1U << lowest_bit_set));
6206 }
6207
6208 /* Create a constant in which all bits outside the range from the lowest to
6209 the highest set bit of VAL_IN are set to 1. */
6210
6211 unsigned HOST_WIDE_INT
6212 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6213 {
6214 return val_in | ~aarch64_and_split_imm1 (val_in);
6215 }
6216
6217 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6218
6219 bool
6220 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6221 {
6222 scalar_int_mode int_mode;
6223 if (!is_a <scalar_int_mode> (mode, &int_mode))
6224 return false;
6225
6226 if (aarch64_bitmask_imm (val_in, int_mode))
6227 return false;
6228
6229 if (aarch64_move_imm (val_in, int_mode))
6230 return false;
6231
6232 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6233
6234 return aarch64_bitmask_imm (imm2, int_mode);
6235 }
6236
6237 /* Return true if val is an immediate that can be loaded into a
6238 register in a single instruction. */
6239 bool
6240 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6241 {
6242 scalar_int_mode int_mode;
6243 if (!is_a <scalar_int_mode> (mode, &int_mode))
6244 return false;
6245
6246 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6247 return 1;
6248 return aarch64_bitmask_imm (val, int_mode);
6249 }
6250
6251 static bool
6252 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6253 {
6254 rtx base, offset;
6255
6256 if (GET_CODE (x) == HIGH)
6257 return true;
6258
6259 /* There's no way to calculate VL-based values using relocations. */
6260 subrtx_iterator::array_type array;
6261 FOR_EACH_SUBRTX (iter, array, x, ALL)
6262 if (GET_CODE (*iter) == CONST_POLY_INT)
6263 return true;
6264
6265 split_const (x, &base, &offset);
6266 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6267 {
6268 if (aarch64_classify_symbol (base, INTVAL (offset))
6269 != SYMBOL_FORCE_TO_MEM)
6270 return true;
6271 else
6272 /* Avoid generating a 64-bit relocation in ILP32; leave
6273 to aarch64_expand_mov_immediate to handle it properly. */
6274 return mode != ptr_mode;
6275 }
6276
6277 return aarch64_tls_referenced_p (x);
6278 }
6279
6280 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6281 The expansion for a table switch is quite expensive due to the number
6282 of instructions, the table lookup and the hard-to-predict indirect jump.
6283 When optimizing for speed at -O3 or higher, use the per-core tuning if
6284 set; otherwise use tables for > 16 cases as a tradeoff between size and
6285 performance. When optimizing for size, use the default setting. */
6286
6287 static unsigned int
6288 aarch64_case_values_threshold (void)
6289 {
6290 /* Use the specified limit for the number of cases before using jump
6291 tables at higher optimization levels. */
6292 if (optimize > 2
6293 && selected_cpu->tune->max_case_values != 0)
6294 return selected_cpu->tune->max_case_values;
6295 else
6296 return optimize_size ? default_case_values_threshold () : 17;
6297 }
6298
6299 /* Return true if register REGNO is a valid index register.
6300 STRICT_P is true if REG_OK_STRICT is in effect. */
6301
6302 bool
6303 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6304 {
6305 if (!HARD_REGISTER_NUM_P (regno))
6306 {
6307 if (!strict_p)
6308 return true;
6309
6310 if (!reg_renumber)
6311 return false;
6312
6313 regno = reg_renumber[regno];
6314 }
6315 return GP_REGNUM_P (regno);
6316 }
6317
6318 /* Return true if register REGNO is a valid base register.
6319 STRICT_P is true if REG_OK_STRICT is in effect. */
6320
6321 bool
6322 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6323 {
6324 if (!HARD_REGISTER_NUM_P (regno))
6325 {
6326 if (!strict_p)
6327 return true;
6328
6329 if (!reg_renumber)
6330 return false;
6331
6332 regno = reg_renumber[regno];
6333 }
6334
6335 /* The fake registers will be eliminated to either the stack or
6336 hard frame pointer, both of which are usually valid base registers.
6337 Reload deals with the cases where the eliminated form isn't valid. */
6338 return (GP_REGNUM_P (regno)
6339 || regno == SP_REGNUM
6340 || regno == FRAME_POINTER_REGNUM
6341 || regno == ARG_POINTER_REGNUM);
6342 }
6343
6344 /* Return true if X is a valid base register.
6345 STRICT_P is true if REG_OK_STRICT is in effect. */
6346
6347 static bool
6348 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6349 {
6350 if (!strict_p
6351 && GET_CODE (x) == SUBREG
6352 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6353 x = SUBREG_REG (x);
6354
6355 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6356 }
6357
6358 /* Return true if address offset is a valid index. If it is, fill in INFO
6359 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6360
6361 static bool
6362 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6363 machine_mode mode, bool strict_p)
6364 {
6365 enum aarch64_address_type type;
6366 rtx index;
6367 int shift;
6368
6369 /* (reg:P) */
6370 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6371 && GET_MODE (x) == Pmode)
6372 {
6373 type = ADDRESS_REG_REG;
6374 index = x;
6375 shift = 0;
6376 }
6377 /* (sign_extend:DI (reg:SI)) */
6378 else if ((GET_CODE (x) == SIGN_EXTEND
6379 || GET_CODE (x) == ZERO_EXTEND)
6380 && GET_MODE (x) == DImode
6381 && GET_MODE (XEXP (x, 0)) == SImode)
6382 {
6383 type = (GET_CODE (x) == SIGN_EXTEND)
6384 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6385 index = XEXP (x, 0);
6386 shift = 0;
6387 }
6388 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6389 else if (GET_CODE (x) == MULT
6390 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6391 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6392 && GET_MODE (XEXP (x, 0)) == DImode
6393 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6394 && CONST_INT_P (XEXP (x, 1)))
6395 {
6396 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6397 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6398 index = XEXP (XEXP (x, 0), 0);
6399 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6400 }
6401 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6402 else if (GET_CODE (x) == ASHIFT
6403 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6404 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6405 && GET_MODE (XEXP (x, 0)) == DImode
6406 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6407 && CONST_INT_P (XEXP (x, 1)))
6408 {
6409 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6410 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6411 index = XEXP (XEXP (x, 0), 0);
6412 shift = INTVAL (XEXP (x, 1));
6413 }
6414 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6415 else if ((GET_CODE (x) == SIGN_EXTRACT
6416 || GET_CODE (x) == ZERO_EXTRACT)
6417 && GET_MODE (x) == DImode
6418 && GET_CODE (XEXP (x, 0)) == MULT
6419 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6420 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6421 {
6422 type = (GET_CODE (x) == SIGN_EXTRACT)
6423 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6424 index = XEXP (XEXP (x, 0), 0);
6425 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6426 if (INTVAL (XEXP (x, 1)) != 32 + shift
6427 || INTVAL (XEXP (x, 2)) != 0)
6428 shift = -1;
6429 }
6430 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6431 (const_int 0xffffffff<<shift)) */
6432 else if (GET_CODE (x) == AND
6433 && GET_MODE (x) == DImode
6434 && GET_CODE (XEXP (x, 0)) == MULT
6435 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6436 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6437 && CONST_INT_P (XEXP (x, 1)))
6438 {
6439 type = ADDRESS_REG_UXTW;
6440 index = XEXP (XEXP (x, 0), 0);
6441 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6442 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6443 shift = -1;
6444 }
6445 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6446 else if ((GET_CODE (x) == SIGN_EXTRACT
6447 || GET_CODE (x) == ZERO_EXTRACT)
6448 && GET_MODE (x) == DImode
6449 && GET_CODE (XEXP (x, 0)) == ASHIFT
6450 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6451 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6452 {
6453 type = (GET_CODE (x) == SIGN_EXTRACT)
6454 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6455 index = XEXP (XEXP (x, 0), 0);
6456 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6457 if (INTVAL (XEXP (x, 1)) != 32 + shift
6458 || INTVAL (XEXP (x, 2)) != 0)
6459 shift = -1;
6460 }
6461 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6462 (const_int 0xffffffff<<shift)) */
6463 else if (GET_CODE (x) == AND
6464 && GET_MODE (x) == DImode
6465 && GET_CODE (XEXP (x, 0)) == ASHIFT
6466 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6467 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6468 && CONST_INT_P (XEXP (x, 1)))
6469 {
6470 type = ADDRESS_REG_UXTW;
6471 index = XEXP (XEXP (x, 0), 0);
6472 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6473 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6474 shift = -1;
6475 }
6476 /* (mult:P (reg:P) (const_int scale)) */
6477 else if (GET_CODE (x) == MULT
6478 && GET_MODE (x) == Pmode
6479 && GET_MODE (XEXP (x, 0)) == Pmode
6480 && CONST_INT_P (XEXP (x, 1)))
6481 {
6482 type = ADDRESS_REG_REG;
6483 index = XEXP (x, 0);
6484 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6485 }
6486 /* (ashift:P (reg:P) (const_int shift)) */
6487 else if (GET_CODE (x) == ASHIFT
6488 && GET_MODE (x) == Pmode
6489 && GET_MODE (XEXP (x, 0)) == Pmode
6490 && CONST_INT_P (XEXP (x, 1)))
6491 {
6492 type = ADDRESS_REG_REG;
6493 index = XEXP (x, 0);
6494 shift = INTVAL (XEXP (x, 1));
6495 }
6496 else
6497 return false;
6498
6499 if (!strict_p
6500 && GET_CODE (index) == SUBREG
6501 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6502 index = SUBREG_REG (index);
6503
6504 if (aarch64_sve_data_mode_p (mode))
6505 {
6506 if (type != ADDRESS_REG_REG
6507 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6508 return false;
6509 }
6510 else
6511 {
6512 if (shift != 0
6513 && !(IN_RANGE (shift, 1, 3)
6514 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6515 return false;
6516 }
6517
6518 if (REG_P (index)
6519 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6520 {
6521 info->type = type;
6522 info->offset = index;
6523 info->shift = shift;
6524 return true;
6525 }
6526
6527 return false;
6528 }
6529
6530 /* Return true if MODE is one of the modes for which we
6531 support LDP/STP operations. */
6532
6533 static bool
6534 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6535 {
6536 return mode == SImode || mode == DImode
6537 || mode == SFmode || mode == DFmode
6538 || (aarch64_vector_mode_supported_p (mode)
6539 && (known_eq (GET_MODE_SIZE (mode), 8)
6540 || (known_eq (GET_MODE_SIZE (mode), 16)
6541 && (aarch64_tune_params.extra_tuning_flags
6542 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6543 }
6544
6545 /* Return true if REGNO is a virtual pointer register, or an eliminable
6546 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6547 include stack_pointer or hard_frame_pointer. */
6548 static bool
6549 virt_or_elim_regno_p (unsigned regno)
6550 {
6551 return ((regno >= FIRST_VIRTUAL_REGISTER
6552 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6553 || regno == FRAME_POINTER_REGNUM
6554 || regno == ARG_POINTER_REGNUM);
6555 }
6556
6557 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6558 If it is, fill in INFO appropriately. STRICT_P is true if
6559 REG_OK_STRICT is in effect. */
6560
6561 bool
6562 aarch64_classify_address (struct aarch64_address_info *info,
6563 rtx x, machine_mode mode, bool strict_p,
6564 aarch64_addr_query_type type)
6565 {
6566 enum rtx_code code = GET_CODE (x);
6567 rtx op0, op1;
6568 poly_int64 offset;
6569
6570 HOST_WIDE_INT const_size;
6571
6572 /* On BE, we use load/store pair for all large int mode load/stores.
6573 TI/TFmode may also use a load/store pair. */
6574 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6575 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6576 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6577 || type == ADDR_QUERY_LDP_STP_N
6578 || mode == TImode
6579 || mode == TFmode
6580 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6581
6582 /* If we are dealing with ADDR_QUERY_LDP_STP_N, that means the incoming mode
6583 corresponds to the actual size of the memory being loaded/stored and the
6584 mode used to check the addressing is half that size. */
6585 if (type == ADDR_QUERY_LDP_STP_N
6586 && known_eq (GET_MODE_SIZE (mode), 16))
6587 mode = DFmode;
6588
6589 bool allow_reg_index_p = (!load_store_pair_p
6590 && (known_lt (GET_MODE_SIZE (mode), 16)
6591 || vec_flags == VEC_ADVSIMD
6592 || vec_flags == VEC_SVE_DATA));
6593
6594 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6595 [Rn, #offset, MUL VL]. */
6596 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6597 && (code != REG && code != PLUS))
6598 return false;
6599
6600 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6601 REG addressing. */
6602 if (advsimd_struct_p
6603 && !BYTES_BIG_ENDIAN
6604 && (code != POST_INC && code != REG))
6605 return false;
6606
6607 gcc_checking_assert (GET_MODE (x) == VOIDmode
6608 || SCALAR_INT_MODE_P (GET_MODE (x)));
6609
6610 switch (code)
6611 {
6612 case REG:
6613 case SUBREG:
6614 info->type = ADDRESS_REG_IMM;
6615 info->base = x;
6616 info->offset = const0_rtx;
6617 info->const_offset = 0;
6618 return aarch64_base_register_rtx_p (x, strict_p);
6619
6620 case PLUS:
6621 op0 = XEXP (x, 0);
6622 op1 = XEXP (x, 1);
6623
6624 if (! strict_p
6625 && REG_P (op0)
6626 && virt_or_elim_regno_p (REGNO (op0))
6627 && poly_int_rtx_p (op1, &offset))
6628 {
6629 info->type = ADDRESS_REG_IMM;
6630 info->base = op0;
6631 info->offset = op1;
6632 info->const_offset = offset;
6633
6634 return true;
6635 }
6636
6637 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6638 && aarch64_base_register_rtx_p (op0, strict_p)
6639 && poly_int_rtx_p (op1, &offset))
6640 {
6641 info->type = ADDRESS_REG_IMM;
6642 info->base = op0;
6643 info->offset = op1;
6644 info->const_offset = offset;
6645
6646 /* TImode and TFmode values are allowed in both pairs of X
6647 registers and individual Q registers. The available
6648 address modes are:
6649 X,X: 7-bit signed scaled offset
6650 Q: 9-bit signed offset
6651 We conservatively require an offset representable in either mode.
6652 When performing the check for pairs of X registers i.e. LDP/STP
6653 pass down DImode since that is the natural size of the LDP/STP
6654 instruction memory accesses. */
6655 if (mode == TImode || mode == TFmode)
6656 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6657 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6658 || offset_12bit_unsigned_scaled_p (mode, offset)));
6659
6660 /* A 7-bit offset check because OImode will emit an ldp/stp
6661 instruction (only big endian will get here).
6662 For ldp/stp instructions, the offset is scaled for the size of a
6663 single element of the pair. */
6664 if (mode == OImode)
6665 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6666
6667 /* Three 9/12-bit offset checks because CImode will emit three
6668 ldr/str instructions (only big endian will get here). */
6669 if (mode == CImode)
6670 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6671 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6672 offset + 32)
6673 || offset_12bit_unsigned_scaled_p (V16QImode,
6674 offset + 32)));
6675
6676 /* Two 7-bit offset checks because XImode will emit two ldp/stp
6677 instructions (only big endian will get here). */
6678 if (mode == XImode)
6679 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6680 && aarch64_offset_7bit_signed_scaled_p (TImode,
6681 offset + 32));
6682
6683 /* Make "m" use the LD1 offset range for SVE data modes, so
6684 that pre-RTL optimizers like ivopts will work to that
6685 instead of the wider LDR/STR range. */
6686 if (vec_flags == VEC_SVE_DATA)
6687 return (type == ADDR_QUERY_M
6688 ? offset_4bit_signed_scaled_p (mode, offset)
6689 : offset_9bit_signed_scaled_p (mode, offset));
6690
6691 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6692 {
6693 poly_int64 end_offset = (offset
6694 + GET_MODE_SIZE (mode)
6695 - BYTES_PER_SVE_VECTOR);
6696 return (type == ADDR_QUERY_M
6697 ? offset_4bit_signed_scaled_p (mode, offset)
6698 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6699 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6700 end_offset)));
6701 }
6702
6703 if (vec_flags == VEC_SVE_PRED)
6704 return offset_9bit_signed_scaled_p (mode, offset);
6705
6706 if (load_store_pair_p)
6707 return ((known_eq (GET_MODE_SIZE (mode), 4)
6708 || known_eq (GET_MODE_SIZE (mode), 8)
6709 || known_eq (GET_MODE_SIZE (mode), 16))
6710 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6711 else
6712 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6713 || offset_12bit_unsigned_scaled_p (mode, offset));
6714 }
6715
6716 if (allow_reg_index_p)
6717 {
6718 /* Look for base + (scaled/extended) index register. */
6719 if (aarch64_base_register_rtx_p (op0, strict_p)
6720 && aarch64_classify_index (info, op1, mode, strict_p))
6721 {
6722 info->base = op0;
6723 return true;
6724 }
6725 if (aarch64_base_register_rtx_p (op1, strict_p)
6726 && aarch64_classify_index (info, op0, mode, strict_p))
6727 {
6728 info->base = op1;
6729 return true;
6730 }
6731 }
6732
6733 return false;
6734
6735 case POST_INC:
6736 case POST_DEC:
6737 case PRE_INC:
6738 case PRE_DEC:
6739 info->type = ADDRESS_REG_WB;
6740 info->base = XEXP (x, 0);
6741 info->offset = NULL_RTX;
6742 return aarch64_base_register_rtx_p (info->base, strict_p);
6743
6744 case POST_MODIFY:
6745 case PRE_MODIFY:
6746 info->type = ADDRESS_REG_WB;
6747 info->base = XEXP (x, 0);
6748 if (GET_CODE (XEXP (x, 1)) == PLUS
6749 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6750 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6751 && aarch64_base_register_rtx_p (info->base, strict_p))
6752 {
6753 info->offset = XEXP (XEXP (x, 1), 1);
6754 info->const_offset = offset;
6755
6756 /* TImode and TFmode values are allowed in both pairs of X
6757 registers and individual Q registers. The available
6758 address modes are:
6759 X,X: 7-bit signed scaled offset
6760 Q: 9-bit signed offset
6761 We conservatively require an offset representable in either mode.
6762 */
6763 if (mode == TImode || mode == TFmode)
6764 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6765 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6766
6767 if (load_store_pair_p)
6768 return ((known_eq (GET_MODE_SIZE (mode), 4)
6769 || known_eq (GET_MODE_SIZE (mode), 8)
6770 || known_eq (GET_MODE_SIZE (mode), 16))
6771 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6772 else
6773 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6774 }
6775 return false;
6776
6777 case CONST:
6778 case SYMBOL_REF:
6779 case LABEL_REF:
6780 /* load literal: pc-relative constant pool entry. Only supported
6781 for SI mode or larger. */
6782 info->type = ADDRESS_SYMBOLIC;
6783
6784 if (!load_store_pair_p
6785 && GET_MODE_SIZE (mode).is_constant (&const_size)
6786 && const_size >= 4)
6787 {
6788 rtx sym, addend;
6789
6790 split_const (x, &sym, &addend);
6791 return ((GET_CODE (sym) == LABEL_REF
6792 || (GET_CODE (sym) == SYMBOL_REF
6793 && CONSTANT_POOL_ADDRESS_P (sym)
6794 && aarch64_pcrelative_literal_loads)));
6795 }
6796 return false;
6797
6798 case LO_SUM:
6799 info->type = ADDRESS_LO_SUM;
6800 info->base = XEXP (x, 0);
6801 info->offset = XEXP (x, 1);
6802 if (allow_reg_index_p
6803 && aarch64_base_register_rtx_p (info->base, strict_p))
6804 {
6805 rtx sym, offs;
6806 split_const (info->offset, &sym, &offs);
6807 if (GET_CODE (sym) == SYMBOL_REF
6808 && (aarch64_classify_symbol (sym, INTVAL (offs))
6809 == SYMBOL_SMALL_ABSOLUTE))
6810 {
6811 /* The symbol and offset must be aligned to the access size. */
6812 unsigned int align;
6813
6814 if (CONSTANT_POOL_ADDRESS_P (sym))
6815 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6816 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6817 {
6818 tree exp = SYMBOL_REF_DECL (sym);
6819 align = TYPE_ALIGN (TREE_TYPE (exp));
6820 align = aarch64_constant_alignment (exp, align);
6821 }
6822 else if (SYMBOL_REF_DECL (sym))
6823 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6824 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6825 && SYMBOL_REF_BLOCK (sym) != NULL)
6826 align = SYMBOL_REF_BLOCK (sym)->alignment;
6827 else
6828 align = BITS_PER_UNIT;
6829
6830 poly_int64 ref_size = GET_MODE_SIZE (mode);
6831 if (known_eq (ref_size, 0))
6832 ref_size = GET_MODE_SIZE (DImode);
6833
6834 return (multiple_p (INTVAL (offs), ref_size)
6835 && multiple_p (align / BITS_PER_UNIT, ref_size));
6836 }
6837 }
6838 return false;
6839
6840 default:
6841 return false;
6842 }
6843 }
6844
6845 /* Return true if the address X is valid for a PRFM instruction.
6846 STRICT_P is true if we should do strict checking with
6847 aarch64_classify_address. */
6848
6849 bool
6850 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6851 {
6852 struct aarch64_address_info addr;
6853
6854 /* PRFM accepts the same addresses as DImode... */
6855 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6856 if (!res)
6857 return false;
6858
6859 /* ... except writeback forms. */
6860 return addr.type != ADDRESS_REG_WB;
6861 }
6862
6863 bool
6864 aarch64_symbolic_address_p (rtx x)
6865 {
6866 rtx offset;
6867
6868 split_const (x, &x, &offset);
6869 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6870 }
6871
6872 /* Classify the base of symbolic expression X. */
6873
6874 enum aarch64_symbol_type
6875 aarch64_classify_symbolic_expression (rtx x)
6876 {
6877 rtx offset;
6878
6879 split_const (x, &x, &offset);
6880 return aarch64_classify_symbol (x, INTVAL (offset));
6881 }
6882
6883
6884 /* Return TRUE if X is a legitimate address for accessing memory in
6885 mode MODE. */
6886 static bool
6887 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6888 {
6889 struct aarch64_address_info addr;
6890
6891 return aarch64_classify_address (&addr, x, mode, strict_p);
6892 }
6893
6894 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6895 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6896 bool
6897 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6898 aarch64_addr_query_type type)
6899 {
6900 struct aarch64_address_info addr;
6901
6902 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6903 }
6904
6905 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6906
6907 static bool
6908 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6909 poly_int64 orig_offset,
6910 machine_mode mode)
6911 {
6912 HOST_WIDE_INT size;
6913 if (GET_MODE_SIZE (mode).is_constant (&size))
6914 {
6915 HOST_WIDE_INT const_offset, second_offset;
6916
6917 /* A general SVE offset is A * VQ + B. Remove the A component from
6918 coefficient 0 in order to get the constant B. */
6919 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6920
6921 /* Split an out-of-range address displacement into a base and
6922 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6923 range otherwise to increase opportunities for sharing the base
6924 address between accesses of different sizes. Unaligned accesses use
6925 the signed 9-bit range; TImode/TFmode use the intersection of signed
6926 scaled 7-bit and signed 9-bit offsets. */
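/* For example, with an aligned SImode offset of 0x12344 the code below
yields offset1 = 0x10000 and offset2 = 0x2344, the latter fitting the
scaled unsigned 12-bit LDR/STR range; an unaligned offset of 0x12345
instead yields offset1 = 0x12400 and offset2 = -0xbb, which fits the
signed 9-bit range. */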
6927 if (mode == TImode || mode == TFmode)
6928 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6929 else if ((const_offset & (size - 1)) != 0)
6930 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6931 else
6932 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6933
6934 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6935 return false;
6936
6937 /* Split the offset into second_offset and the rest. */
6938 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6939 *offset2 = gen_int_mode (second_offset, Pmode);
6940 return true;
6941 }
6942 else
6943 {
6944 /* Get the mode we should use as the basis of the range. For structure
6945 modes this is the mode of one vector. */
6946 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6947 machine_mode step_mode
6948 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6949
6950 /* Get the "mul vl" multiplier we'd like to use. */
6951 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6952 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6953 if (vec_flags & VEC_SVE_DATA)
6954 /* LDR supports a 9-bit range, but the move patterns for
6955 structure modes require all vectors to be in range of the
6956 same base. The simplest way of accommodating that while still
6957 promoting reuse of anchor points between different modes is
6958 to use an 8-bit range unconditionally. */
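/* For example, a "mul vl" multiplier of 130 is reduced here to -126,
so 256 vectors' worth of offset goes into offset1 (256 - 126 == 130). */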
6959 vnum = ((vnum + 128) & 255) - 128;
6960 else
6961 /* Predicates are only handled singly, so we might as well use
6962 the full range. */
6963 vnum = ((vnum + 256) & 511) - 256;
6964 if (vnum == 0)
6965 return false;
6966
6967 /* Convert the "mul vl" multiplier into a byte offset. */
6968 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6969 if (known_eq (second_offset, orig_offset))
6970 return false;
6971
6972 /* Split the offset into second_offset and the rest. */
6973 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6974 *offset2 = gen_int_mode (second_offset, Pmode);
6975 return true;
6976 }
6977 }
6978
6979 /* Return the binary representation of floating point constant VALUE in INTVAL.
6980 If the value cannot be converted, return false without setting INTVAL.
6981 The conversion is done in the given MODE. */
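/* For example, (double) 1.0 is returned as 0x3ff0000000000000 and
(float) -2.0f as 0xc0000000. */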
6982 bool
6983 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6984 {
6985
6986 /* We make a general exception for 0. */
6987 if (aarch64_float_const_zero_rtx_p (value))
6988 {
6989 *intval = 0;
6990 return true;
6991 }
6992
6993 scalar_float_mode mode;
6994 if (GET_CODE (value) != CONST_DOUBLE
6995 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6996 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6997 /* Only support up to DF mode. */
6998 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6999 return false;
7000
7001 unsigned HOST_WIDE_INT ival = 0;
7002
7003 long res[2];
7004 real_to_target (res,
7005 CONST_DOUBLE_REAL_VALUE (value),
7006 REAL_MODE_FORMAT (mode));
7007
7008 if (mode == DFmode)
7009 {
7010 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7011 ival = zext_hwi (res[order], 32);
7012 ival |= (zext_hwi (res[1 - order], 32) << 32);
7013 }
7014 else
7015 ival = zext_hwi (res[0], 32);
7016
7017 *intval = ival;
7018 return true;
7019 }
7020
7021 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7022 single MOV(+MOVK) followed by an FMOV. */
7023 bool
7024 aarch64_float_const_rtx_p (rtx x)
7025 {
7026 machine_mode mode = GET_MODE (x);
7027 if (mode == VOIDmode)
7028 return false;
7029
7030 /* Determine whether it's cheaper to materialize float constants as
7031 mov/movk pairs than as adrp/ldr constant-pool loads. */
7032 unsigned HOST_WIDE_INT ival;
7033
7034 if (GET_CODE (x) == CONST_DOUBLE
7035 && SCALAR_FLOAT_MODE_P (mode)
7036 && aarch64_reinterpret_float_as_int (x, &ival))
7037 {
7038 scalar_int_mode imode = (mode == HFmode
7039 ? SImode
7040 : int_mode_for_mode (mode).require ());
7041 int num_instr = aarch64_internal_mov_immediate
7042 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7043 return num_instr < 3;
7044 }
7045
7046 return false;
7047 }
7048
7049 /* Return TRUE if rtx X is the immediate constant 0.0. */
7050 bool
7051 aarch64_float_const_zero_rtx_p (rtx x)
7052 {
7053 if (GET_MODE (x) == VOIDmode)
7054 return false;
7055
7056 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7057 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7058 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7059 }
7060
7061 /* Return TRUE if rtx X is an immediate constant that fits in a single
7062 MOVI immediate operation. */
7063 bool
7064 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7065 {
7066 if (!TARGET_SIMD)
7067 return false;
7068
7069 machine_mode vmode;
7070 scalar_int_mode imode;
7071 unsigned HOST_WIDE_INT ival;
7072
7073 if (GET_CODE (x) == CONST_DOUBLE
7074 && SCALAR_FLOAT_MODE_P (mode))
7075 {
7076 if (!aarch64_reinterpret_float_as_int (x, &ival))
7077 return false;
7078
7079 /* We make a general exception for 0. */
7080 if (aarch64_float_const_zero_rtx_p (x))
7081 return true;
7082
7083 imode = int_mode_for_mode (mode).require ();
7084 }
7085 else if (GET_CODE (x) == CONST_INT
7086 && is_a <scalar_int_mode> (mode, &imode))
7087 ival = INTVAL (x);
7088 else
7089 return false;
7090
7091 /* Use a 64-bit vector mode for everything except DI/DF mode, where we
7092 use a 128-bit vector mode. */
7093 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7094
7095 vmode = aarch64_simd_container_mode (imode, width);
7096 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7097
7098 return aarch64_simd_valid_immediate (v_op, NULL);
7099 }
7100
7101
7102 /* Return the fixed registers used for condition codes. */
7103
7104 static bool
7105 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7106 {
7107 *p1 = CC_REGNUM;
7108 *p2 = INVALID_REGNUM;
7109 return true;
7110 }
7111
7112 /* This function is used by the call expanders of the machine description.
7113 RESULT is the register in which the result is returned. It's NULL for
7114 "call" and "sibcall".
7115 MEM is the location of the function call.
7116 SIBCALL indicates whether this is a normal call or a sibling call;
7117 a different pattern is generated accordingly. */
7118
7119 void
7120 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7121 {
7122 rtx call, callee, tmp;
7123 rtvec vec;
7124 machine_mode mode;
7125
7126 gcc_assert (MEM_P (mem));
7127 callee = XEXP (mem, 0);
7128 mode = GET_MODE (callee);
7129 gcc_assert (mode == Pmode);
7130
7131 /* Decide if we should generate indirect calls by loading the
7132 address of the callee into a register before performing
7133 the branch-and-link. */
7134 if (SYMBOL_REF_P (callee)
7135 ? (aarch64_is_long_call_p (callee)
7136 || aarch64_is_noplt_call_p (callee))
7137 : !REG_P (callee))
7138 XEXP (mem, 0) = force_reg (mode, callee);
7139
7140 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7141
7142 if (result != NULL_RTX)
7143 call = gen_rtx_SET (result, call);
7144
7145 if (sibcall)
7146 tmp = ret_rtx;
7147 else
7148 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7149
7150 vec = gen_rtvec (2, call, tmp);
7151 call = gen_rtx_PARALLEL (VOIDmode, vec);
7152
7153 aarch64_emit_call_insn (call);
7154 }
7155
7156 /* Emit call insn with PAT and do aarch64-specific handling. */
7157
7158 void
7159 aarch64_emit_call_insn (rtx pat)
7160 {
7161 rtx insn = emit_call_insn (pat);
7162
7163 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7164 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7165 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7166 }
7167
7168 machine_mode
7169 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7170 {
7171 machine_mode mode_x = GET_MODE (x);
7172 rtx_code code_x = GET_CODE (x);
7173
7174 /* Floating-point compares return CCFP for equality, ORDERED/UNORDERED
7175 and the UN* comparisons, and CCFPE for the remaining ordered ones. */
7176 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7177 {
7178 switch (code)
7179 {
7180 case EQ:
7181 case NE:
7182 case UNORDERED:
7183 case ORDERED:
7184 case UNLT:
7185 case UNLE:
7186 case UNGT:
7187 case UNGE:
7188 case UNEQ:
7189 return CCFPmode;
7190
7191 case LT:
7192 case LE:
7193 case GT:
7194 case GE:
7195 case LTGT:
7196 return CCFPEmode;
7197
7198 default:
7199 gcc_unreachable ();
7200 }
7201 }
7202
7203 /* Equality comparisons of short modes against zero can be performed
7204 using the TST instruction with the appropriate bitmask. */
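/* E.g. (eq (reg:QI) (const_int 0)) can be emitted as "tst wN, 255". */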
7205 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7206 && (code == EQ || code == NE)
7207 && (mode_x == HImode || mode_x == QImode))
7208 return CC_NZmode;
7209
7210 /* Similarly, comparisons of zero_extends from shorter modes can
7211 be performed using an ANDS with an immediate mask. */
7212 if (y == const0_rtx && code_x == ZERO_EXTEND
7213 && (mode_x == SImode || mode_x == DImode)
7214 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7215 && (code == EQ || code == NE))
7216 return CC_NZmode;
7217
7218 if ((mode_x == SImode || mode_x == DImode)
7219 && y == const0_rtx
7220 && (code == EQ || code == NE || code == LT || code == GE)
7221 && (code_x == PLUS || code_x == MINUS || code_x == AND
7222 || code_x == NEG
7223 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7224 && CONST_INT_P (XEXP (x, 2)))))
7225 return CC_NZmode;
7226
7227 /* A compare with a shifted operand. Because of canonicalization,
7228 the comparison will have to be swapped when we emit the assembly
7229 code. */
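/* For example, (compare (ashift x 2) y) has to be output as
"cmp y, x, lsl 2", so a GT test on the original comparison is printed
as LT; see the E_CC_SWPmode case in aarch64_get_condition_code_1. */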
7230 if ((mode_x == SImode || mode_x == DImode)
7231 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7232 && (code_x == ASHIFT || code_x == ASHIFTRT
7233 || code_x == LSHIFTRT
7234 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7235 return CC_SWPmode;
7236
7237 /* Similarly for a negated operand, but we can only do this for
7238 equalities. */
7239 if ((mode_x == SImode || mode_x == DImode)
7240 && (REG_P (y) || GET_CODE (y) == SUBREG)
7241 && (code == EQ || code == NE)
7242 && code_x == NEG)
7243 return CC_Zmode;
7244
7245 /* A test for unsigned overflow from an addition. */
7246 if ((mode_x == DImode || mode_x == TImode)
7247 && (code == LTU || code == GEU)
7248 && code_x == PLUS
7249 && rtx_equal_p (XEXP (x, 0), y))
7250 return CC_Cmode;
7251
7252 /* A test for unsigned overflow from an add with carry. */
7253 if ((mode_x == DImode || mode_x == TImode)
7254 && (code == LTU || code == GEU)
7255 && code_x == PLUS
7256 && CONST_SCALAR_INT_P (y)
7257 && (rtx_mode_t (y, mode_x)
7258 == (wi::shwi (1, mode_x)
7259 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7260 return CC_ADCmode;
7261
7262 /* A test for signed overflow. */
7263 if ((mode_x == DImode || mode_x == TImode)
7264 && code == NE
7265 && code_x == PLUS
7266 && GET_CODE (y) == SIGN_EXTEND)
7267 return CC_Vmode;
7268
7269 /* For everything else, return CCmode. */
7270 return CCmode;
7271 }
7272
7273 static int
7274 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7275
7276 int
7277 aarch64_get_condition_code (rtx x)
7278 {
7279 machine_mode mode = GET_MODE (XEXP (x, 0));
7280 enum rtx_code comp_code = GET_CODE (x);
7281
7282 if (GET_MODE_CLASS (mode) != MODE_CC)
7283 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7284 return aarch64_get_condition_code_1 (mode, comp_code);
7285 }
7286
7287 static int
7288 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7289 {
7290 switch (mode)
7291 {
7292 case E_CCFPmode:
7293 case E_CCFPEmode:
7294 switch (comp_code)
7295 {
7296 case GE: return AARCH64_GE;
7297 case GT: return AARCH64_GT;
7298 case LE: return AARCH64_LS;
7299 case LT: return AARCH64_MI;
7300 case NE: return AARCH64_NE;
7301 case EQ: return AARCH64_EQ;
7302 case ORDERED: return AARCH64_VC;
7303 case UNORDERED: return AARCH64_VS;
7304 case UNLT: return AARCH64_LT;
7305 case UNLE: return AARCH64_LE;
7306 case UNGT: return AARCH64_HI;
7307 case UNGE: return AARCH64_PL;
7308 default: return -1;
7309 }
7310 break;
7311
7312 case E_CCmode:
7313 switch (comp_code)
7314 {
7315 case NE: return AARCH64_NE;
7316 case EQ: return AARCH64_EQ;
7317 case GE: return AARCH64_GE;
7318 case GT: return AARCH64_GT;
7319 case LE: return AARCH64_LE;
7320 case LT: return AARCH64_LT;
7321 case GEU: return AARCH64_CS;
7322 case GTU: return AARCH64_HI;
7323 case LEU: return AARCH64_LS;
7324 case LTU: return AARCH64_CC;
7325 default: return -1;
7326 }
7327 break;
7328
7329 case E_CC_SWPmode:
7330 switch (comp_code)
7331 {
7332 case NE: return AARCH64_NE;
7333 case EQ: return AARCH64_EQ;
7334 case GE: return AARCH64_LE;
7335 case GT: return AARCH64_LT;
7336 case LE: return AARCH64_GE;
7337 case LT: return AARCH64_GT;
7338 case GEU: return AARCH64_LS;
7339 case GTU: return AARCH64_CC;
7340 case LEU: return AARCH64_CS;
7341 case LTU: return AARCH64_HI;
7342 default: return -1;
7343 }
7344 break;
7345
7346 case E_CC_NZmode:
7347 switch (comp_code)
7348 {
7349 case NE: return AARCH64_NE;
7350 case EQ: return AARCH64_EQ;
7351 case GE: return AARCH64_PL;
7352 case LT: return AARCH64_MI;
7353 default: return -1;
7354 }
7355 break;
7356
7357 case E_CC_Zmode:
7358 switch (comp_code)
7359 {
7360 case NE: return AARCH64_NE;
7361 case EQ: return AARCH64_EQ;
7362 default: return -1;
7363 }
7364 break;
7365
7366 case E_CC_Cmode:
7367 switch (comp_code)
7368 {
7369 case LTU: return AARCH64_CS;
7370 case GEU: return AARCH64_CC;
7371 default: return -1;
7372 }
7373 break;
7374
7375 case E_CC_ADCmode:
7376 switch (comp_code)
7377 {
7378 case GEU: return AARCH64_CS;
7379 case LTU: return AARCH64_CC;
7380 default: return -1;
7381 }
7382 break;
7383
7384 case E_CC_Vmode:
7385 switch (comp_code)
7386 {
7387 case NE: return AARCH64_VS;
7388 case EQ: return AARCH64_VC;
7389 default: return -1;
7390 }
7391 break;
7392
7393 default:
7394 return -1;
7395 }
7396
7397 return -1;
7398 }
7399
7400 bool
7401 aarch64_const_vec_all_same_in_range_p (rtx x,
7402 HOST_WIDE_INT minval,
7403 HOST_WIDE_INT maxval)
7404 {
7405 rtx elt;
7406 return (const_vec_duplicate_p (x, &elt)
7407 && CONST_INT_P (elt)
7408 && IN_RANGE (INTVAL (elt), minval, maxval));
7409 }
7410
7411 bool
7412 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7413 {
7414 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7415 }
7416
7417 /* Return true if VEC is a constant in which every element is in the range
7418 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7419
7420 static bool
7421 aarch64_const_vec_all_in_range_p (rtx vec,
7422 HOST_WIDE_INT minval,
7423 HOST_WIDE_INT maxval)
7424 {
7425 if (GET_CODE (vec) != CONST_VECTOR
7426 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7427 return false;
7428
7429 int nunits;
7430 if (!CONST_VECTOR_STEPPED_P (vec))
7431 nunits = const_vector_encoded_nelts (vec);
7432 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7433 return false;
7434
7435 for (int i = 0; i < nunits; i++)
7436 {
7437 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7438 if (!CONST_INT_P (vec_elem)
7439 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7440 return false;
7441 }
7442 return true;
7443 }
7444
7445 /* N Z C V. */
7446 #define AARCH64_CC_V 1
7447 #define AARCH64_CC_C (1 << 1)
7448 #define AARCH64_CC_Z (1 << 2)
7449 #define AARCH64_CC_N (1 << 3)
7450
7451 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7452 static const int aarch64_nzcv_codes[] =
7453 {
7454 0, /* EQ, Z == 1. */
7455 AARCH64_CC_Z, /* NE, Z == 0. */
7456 0, /* CS, C == 1. */
7457 AARCH64_CC_C, /* CC, C == 0. */
7458 0, /* MI, N == 1. */
7459 AARCH64_CC_N, /* PL, N == 0. */
7460 0, /* VS, V == 1. */
7461 AARCH64_CC_V, /* VC, V == 0. */
7462 0, /* HI, C == 1 && Z == 0. */
7463 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7464 AARCH64_CC_V, /* GE, N == V. */
7465 0, /* LT, N != V. */
7466 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7467 0, /* LE, !(Z == 0 && N == V). */
7468 0, /* AL, Any. */
7469 0 /* NV, Any. */
7470 };
7471
7472 /* Print floating-point vector immediate operand X to F, negating it
7473 first if NEGATE is true. Return true on success, false if it isn't
7474 a constant we can handle. */
7475
7476 static bool
7477 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7478 {
7479 rtx elt;
7480
7481 if (!const_vec_duplicate_p (x, &elt))
7482 return false;
7483
7484 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7485 if (negate)
7486 r = real_value_negate (&r);
7487
7488 /* We only handle the SVE single-bit immediates here. */
7489 if (real_equal (&r, &dconst0))
7490 asm_fprintf (f, "0.0");
7491 else if (real_equal (&r, &dconst1))
7492 asm_fprintf (f, "1.0");
7493 else if (real_equal (&r, &dconsthalf))
7494 asm_fprintf (f, "0.5");
7495 else
7496 return false;
7497
7498 return true;
7499 }
7500
7501 /* Return the equivalent letter for size. */
7502 static char
7503 sizetochar (int size)
7504 {
7505 switch (size)
7506 {
7507 case 64: return 'd';
7508 case 32: return 's';
7509 case 16: return 'h';
7510 case 8 : return 'b';
7511 default: gcc_unreachable ();
7512 }
7513 }
7514
7515 /* Print operand X to file F in a target specific manner according to CODE.
7516 The acceptable formatting commands given by CODE are:
7517 'c': An integer or symbol address without a preceding #
7518 sign.
7519 'C': Take the duplicated element in a vector constant
7520 and print it in hex.
7521 'D': Take the duplicated element in a vector constant
7522 and print it as an unsigned integer, in decimal.
7523 'e': Print the sign/zero-extend size as a character 8->b,
7524 16->h, 32->w.
7525 'p': Prints N such that 2^N == X (X must be power of 2 and
7526 const int).
7527 'P': Print the number of non-zero bits in X (a const_int).
7528 'H': Print the higher numbered register of a pair (TImode)
7529 of regs.
7530 'm': Print a condition (eq, ne, etc).
7531 'M': Same as 'm', but invert condition.
7532 'N': Take the duplicated element in a vector constant
7533 and print the negative of it in decimal.
7534 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7535 'S/T/U/V': Print a FP/SIMD register name for a register list.
7536 The register printed is the FP/SIMD register name
7537 of X + 0/1/2/3 for S/T/U/V.
7538 'R': Print a scalar FP/SIMD register name + 1.
7539 'X': Print bottom 16 bits of integer constant in hex.
7540 'w/x': Print a general register name or the zero register
7541 (32-bit or 64-bit).
7542 '0': Print a normal operand; if it's a general register,
7543 we assume DImode.
7544 'k': Print NZCV for conditional compare instructions.
7545 'A': Output address constant representing the first
7546 argument of X, specifying a relocation offset
7547 if appropriate.
7548 'L': Output constant address specified by X
7549 with a relocation offset if appropriate.
7550 'G': Prints address of X, specifying a PC relative
7551 relocation mode if appropriate.
7552 'y': Output address of LDP or STP - this is used for
7553 some LDP/STPs which don't use a PARALLEL in their
7554 pattern (so the mode needs to be adjusted).
7555 'z': Output address of a typical LDP or STP. */
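/* For example, for a DImode value held in register x5, "%w0" prints
"w5" and "%x0" prints "x5", while "%X0" applied to
(const_int 0x12345) prints "0x2345". */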
7556
7557 static void
7558 aarch64_print_operand (FILE *f, rtx x, int code)
7559 {
7560 rtx elt;
7561 switch (code)
7562 {
7563 case 'c':
7564 switch (GET_CODE (x))
7565 {
7566 case CONST_INT:
7567 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7568 break;
7569
7570 case SYMBOL_REF:
7571 output_addr_const (f, x);
7572 break;
7573
7574 case CONST:
7575 if (GET_CODE (XEXP (x, 0)) == PLUS
7576 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7577 {
7578 output_addr_const (f, x);
7579 break;
7580 }
7581 /* Fall through. */
7582
7583 default:
7584 output_operand_lossage ("unsupported operand for code '%c'", code);
7585 }
7586 break;
7587
7588 case 'e':
7589 {
7590 int n;
7591
7592 if (!CONST_INT_P (x)
7593 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7594 {
7595 output_operand_lossage ("invalid operand for '%%%c'", code);
7596 return;
7597 }
7598
7599 switch (n)
7600 {
7601 case 3:
7602 fputc ('b', f);
7603 break;
7604 case 4:
7605 fputc ('h', f);
7606 break;
7607 case 5:
7608 fputc ('w', f);
7609 break;
7610 default:
7611 output_operand_lossage ("invalid operand for '%%%c'", code);
7612 return;
7613 }
7614 }
7615 break;
7616
7617 case 'p':
7618 {
7619 int n;
7620
7621 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7622 {
7623 output_operand_lossage ("invalid operand for '%%%c'", code);
7624 return;
7625 }
7626
7627 asm_fprintf (f, "%d", n);
7628 }
7629 break;
7630
7631 case 'P':
7632 if (!CONST_INT_P (x))
7633 {
7634 output_operand_lossage ("invalid operand for '%%%c'", code);
7635 return;
7636 }
7637
7638 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7639 break;
7640
7641 case 'H':
7642 if (x == const0_rtx)
7643 {
7644 asm_fprintf (f, "xzr");
7645 break;
7646 }
7647
7648 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7649 {
7650 output_operand_lossage ("invalid operand for '%%%c'", code);
7651 return;
7652 }
7653
7654 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7655 break;
7656
7657 case 'M':
7658 case 'm':
7659 {
7660 int cond_code;
7661 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7662 if (x == const_true_rtx)
7663 {
7664 if (code == 'M')
7665 fputs ("nv", f);
7666 return;
7667 }
7668
7669 if (!COMPARISON_P (x))
7670 {
7671 output_operand_lossage ("invalid operand for '%%%c'", code);
7672 return;
7673 }
7674
7675 cond_code = aarch64_get_condition_code (x);
7676 gcc_assert (cond_code >= 0);
7677 if (code == 'M')
7678 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7679 fputs (aarch64_condition_codes[cond_code], f);
7680 }
7681 break;
7682
7683 case 'N':
7684 if (!const_vec_duplicate_p (x, &elt))
7685 {
7686 output_operand_lossage ("invalid vector constant");
7687 return;
7688 }
7689
7690 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7691 asm_fprintf (f, "%wd", -INTVAL (elt));
7692 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7693 && aarch64_print_vector_float_operand (f, x, true))
7694 ;
7695 else
7696 {
7697 output_operand_lossage ("invalid vector constant");
7698 return;
7699 }
7700 break;
7701
7702 case 'b':
7703 case 'h':
7704 case 's':
7705 case 'd':
7706 case 'q':
7707 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7708 {
7709 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7710 return;
7711 }
7712 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7713 break;
7714
7715 case 'S':
7716 case 'T':
7717 case 'U':
7718 case 'V':
7719 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7720 {
7721 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7722 return;
7723 }
7724 asm_fprintf (f, "%c%d",
7725 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7726 REGNO (x) - V0_REGNUM + (code - 'S'));
7727 break;
7728
7729 case 'R':
7730 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7731 {
7732 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7733 return;
7734 }
7735 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7736 break;
7737
7738 case 'X':
7739 if (!CONST_INT_P (x))
7740 {
7741 output_operand_lossage ("invalid operand for '%%%c'", code);
7742 return;
7743 }
7744 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7745 break;
7746
7747 case 'C':
7748 {
7749 /* Print a replicated constant in hex. */
7750 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7751 {
7752 output_operand_lossage ("invalid operand for '%%%c'", code);
7753 return;
7754 }
7755 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7756 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7757 }
7758 break;
7759
7760 case 'D':
7761 {
7762 /* Print a replicated constant in decimal, treating it as
7763 unsigned. */
7764 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7765 {
7766 output_operand_lossage ("invalid operand for '%%%c'", code);
7767 return;
7768 }
7769 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7770 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7771 }
7772 break;
7773
7774 case 'w':
7775 case 'x':
7776 if (x == const0_rtx
7777 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7778 {
7779 asm_fprintf (f, "%czr", code);
7780 break;
7781 }
7782
7783 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7784 {
7785 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7786 break;
7787 }
7788
7789 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7790 {
7791 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7792 break;
7793 }
7794
7795 /* Fall through */
7796
7797 case 0:
7798 if (x == NULL)
7799 {
7800 output_operand_lossage ("missing operand");
7801 return;
7802 }
7803
7804 switch (GET_CODE (x))
7805 {
7806 case REG:
7807 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7808 {
7809 if (REG_NREGS (x) == 1)
7810 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7811 else
7812 {
7813 char suffix
7814 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7815 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7816 REGNO (x) - V0_REGNUM, suffix,
7817 END_REGNO (x) - V0_REGNUM - 1, suffix);
7818 }
7819 }
7820 else
7821 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7822 break;
7823
7824 case MEM:
7825 output_address (GET_MODE (x), XEXP (x, 0));
7826 break;
7827
7828 case LABEL_REF:
7829 case SYMBOL_REF:
7830 output_addr_const (asm_out_file, x);
7831 break;
7832
7833 case CONST_INT:
7834 asm_fprintf (f, "%wd", INTVAL (x));
7835 break;
7836
7837 case CONST:
7838 if (!VECTOR_MODE_P (GET_MODE (x)))
7839 {
7840 output_addr_const (asm_out_file, x);
7841 break;
7842 }
7843 /* fall through */
7844
7845 case CONST_VECTOR:
7846 if (!const_vec_duplicate_p (x, &elt))
7847 {
7848 output_operand_lossage ("invalid vector constant");
7849 return;
7850 }
7851
7852 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7853 asm_fprintf (f, "%wd", INTVAL (elt));
7854 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7855 && aarch64_print_vector_float_operand (f, x, false))
7856 ;
7857 else
7858 {
7859 output_operand_lossage ("invalid vector constant");
7860 return;
7861 }
7862 break;
7863
7864 case CONST_DOUBLE:
7865 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7866 be getting CONST_DOUBLEs holding integers. */
7867 gcc_assert (GET_MODE (x) != VOIDmode);
7868 if (aarch64_float_const_zero_rtx_p (x))
7869 {
7870 fputc ('0', f);
7871 break;
7872 }
7873 else if (aarch64_float_const_representable_p (x))
7874 {
7875 #define buf_size 20
7876 char float_buf[buf_size] = {'\0'};
7877 real_to_decimal_for_mode (float_buf,
7878 CONST_DOUBLE_REAL_VALUE (x),
7879 buf_size, buf_size,
7880 1, GET_MODE (x));
7881 asm_fprintf (asm_out_file, "%s", float_buf);
7882 break;
7883 #undef buf_size
7884 }
7885 output_operand_lossage ("invalid constant");
7886 return;
7887 default:
7888 output_operand_lossage ("invalid operand");
7889 return;
7890 }
7891 break;
7892
7893 case 'A':
7894 if (GET_CODE (x) == HIGH)
7895 x = XEXP (x, 0);
7896
7897 switch (aarch64_classify_symbolic_expression (x))
7898 {
7899 case SYMBOL_SMALL_GOT_4G:
7900 asm_fprintf (asm_out_file, ":got:");
7901 break;
7902
7903 case SYMBOL_SMALL_TLSGD:
7904 asm_fprintf (asm_out_file, ":tlsgd:");
7905 break;
7906
7907 case SYMBOL_SMALL_TLSDESC:
7908 asm_fprintf (asm_out_file, ":tlsdesc:");
7909 break;
7910
7911 case SYMBOL_SMALL_TLSIE:
7912 asm_fprintf (asm_out_file, ":gottprel:");
7913 break;
7914
7915 case SYMBOL_TLSLE24:
7916 asm_fprintf (asm_out_file, ":tprel:");
7917 break;
7918
7919 case SYMBOL_TINY_GOT:
7920 gcc_unreachable ();
7921 break;
7922
7923 default:
7924 break;
7925 }
7926 output_addr_const (asm_out_file, x);
7927 break;
7928
7929 case 'L':
7930 switch (aarch64_classify_symbolic_expression (x))
7931 {
7932 case SYMBOL_SMALL_GOT_4G:
7933 asm_fprintf (asm_out_file, ":lo12:");
7934 break;
7935
7936 case SYMBOL_SMALL_TLSGD:
7937 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7938 break;
7939
7940 case SYMBOL_SMALL_TLSDESC:
7941 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7942 break;
7943
7944 case SYMBOL_SMALL_TLSIE:
7945 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7946 break;
7947
7948 case SYMBOL_TLSLE12:
7949 asm_fprintf (asm_out_file, ":tprel_lo12:");
7950 break;
7951
7952 case SYMBOL_TLSLE24:
7953 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7954 break;
7955
7956 case SYMBOL_TINY_GOT:
7957 asm_fprintf (asm_out_file, ":got:");
7958 break;
7959
7960 case SYMBOL_TINY_TLSIE:
7961 asm_fprintf (asm_out_file, ":gottprel:");
7962 break;
7963
7964 default:
7965 break;
7966 }
7967 output_addr_const (asm_out_file, x);
7968 break;
7969
7970 case 'G':
7971 switch (aarch64_classify_symbolic_expression (x))
7972 {
7973 case SYMBOL_TLSLE24:
7974 asm_fprintf (asm_out_file, ":tprel_hi12:");
7975 break;
7976 default:
7977 break;
7978 }
7979 output_addr_const (asm_out_file, x);
7980 break;
7981
7982 case 'k':
7983 {
7984 HOST_WIDE_INT cond_code;
7985
7986 if (!CONST_INT_P (x))
7987 {
7988 output_operand_lossage ("invalid operand for '%%%c'", code);
7989 return;
7990 }
7991
7992 cond_code = INTVAL (x);
7993 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7994 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7995 }
7996 break;
7997
7998 case 'y':
7999 case 'z':
8000 {
8001 machine_mode mode = GET_MODE (x);
8002
8003 if (GET_CODE (x) != MEM
8004 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8005 {
8006 output_operand_lossage ("invalid operand for '%%%c'", code);
8007 return;
8008 }
8009
8010 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8011 code == 'y'
8012 ? ADDR_QUERY_LDP_STP_N
8013 : ADDR_QUERY_LDP_STP))
8014 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8015 }
8016 break;
8017
8018 default:
8019 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8020 return;
8021 }
8022 }
8023
8024 /* Print address 'x' of a memory access with mode 'mode'.
8025 'type' is the aarch64_addr_query_type context required by
8026 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for LDP/STP operands. */
8027 static bool
8028 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8029 aarch64_addr_query_type type)
8030 {
8031 struct aarch64_address_info addr;
8032 unsigned int size;
8033
8034 /* Check all addresses are Pmode - including ILP32. */
8035 if (GET_MODE (x) != Pmode
8036 && (!CONST_INT_P (x)
8037 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8038 {
8039 output_operand_lossage ("invalid address mode");
8040 return false;
8041 }
8042
8043 if (aarch64_classify_address (&addr, x, mode, true, type))
8044 switch (addr.type)
8045 {
8046 case ADDRESS_REG_IMM:
8047 if (known_eq (addr.const_offset, 0))
8048 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8049 else if (aarch64_sve_data_mode_p (mode))
8050 {
8051 HOST_WIDE_INT vnum
8052 = exact_div (addr.const_offset,
8053 BYTES_PER_SVE_VECTOR).to_constant ();
8054 asm_fprintf (f, "[%s, #%wd, mul vl]",
8055 reg_names[REGNO (addr.base)], vnum);
8056 }
8057 else if (aarch64_sve_pred_mode_p (mode))
8058 {
8059 HOST_WIDE_INT vnum
8060 = exact_div (addr.const_offset,
8061 BYTES_PER_SVE_PRED).to_constant ();
8062 asm_fprintf (f, "[%s, #%wd, mul vl]",
8063 reg_names[REGNO (addr.base)], vnum);
8064 }
8065 else
8066 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8067 INTVAL (addr.offset));
8068 return true;
8069
8070 case ADDRESS_REG_REG:
8071 if (addr.shift == 0)
8072 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8073 reg_names [REGNO (addr.offset)]);
8074 else
8075 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8076 reg_names [REGNO (addr.offset)], addr.shift);
8077 return true;
8078
8079 case ADDRESS_REG_UXTW:
8080 if (addr.shift == 0)
8081 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8082 REGNO (addr.offset) - R0_REGNUM);
8083 else
8084 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8085 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8086 return true;
8087
8088 case ADDRESS_REG_SXTW:
8089 if (addr.shift == 0)
8090 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8091 REGNO (addr.offset) - R0_REGNUM);
8092 else
8093 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8094 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8095 return true;
8096
8097 case ADDRESS_REG_WB:
8098 /* Writeback is only supported for fixed-width modes. */
8099 size = GET_MODE_SIZE (mode).to_constant ();
8100 switch (GET_CODE (x))
8101 {
8102 case PRE_INC:
8103 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8104 return true;
8105 case POST_INC:
8106 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8107 return true;
8108 case PRE_DEC:
8109 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8110 return true;
8111 case POST_DEC:
8112 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8113 return true;
8114 case PRE_MODIFY:
8115 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8116 INTVAL (addr.offset));
8117 return true;
8118 case POST_MODIFY:
8119 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8120 INTVAL (addr.offset));
8121 return true;
8122 default:
8123 break;
8124 }
8125 break;
8126
8127 case ADDRESS_LO_SUM:
8128 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8129 output_addr_const (f, addr.offset);
8130 asm_fprintf (f, "]");
8131 return true;
8132
8133 case ADDRESS_SYMBOLIC:
8134 output_addr_const (f, x);
8135 return true;
8136 }
8137
8138 return false;
8139 }
8140
8141 /* Print address 'x' of a memory access with mode 'mode'. */
8142 static void
8143 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8144 {
8145 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8146 output_addr_const (f, x);
8147 }
8148
8149 bool
8150 aarch64_label_mentioned_p (rtx x)
8151 {
8152 const char *fmt;
8153 int i;
8154
8155 if (GET_CODE (x) == LABEL_REF)
8156 return true;
8157
8158 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8159 referencing instruction, but they are constant offsets, not
8160 symbols. */
8161 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8162 return false;
8163
8164 fmt = GET_RTX_FORMAT (GET_CODE (x));
8165 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8166 {
8167 if (fmt[i] == 'E')
8168 {
8169 int j;
8170
8171 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8172 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8173 return 1;
8174 }
8175 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8176 return 1;
8177 }
8178
8179 return 0;
8180 }
8181
8182 /* Implement REGNO_REG_CLASS. */
8183
8184 enum reg_class
8185 aarch64_regno_regclass (unsigned regno)
8186 {
8187 if (GP_REGNUM_P (regno))
8188 return GENERAL_REGS;
8189
8190 if (regno == SP_REGNUM)
8191 return STACK_REG;
8192
8193 if (regno == FRAME_POINTER_REGNUM
8194 || regno == ARG_POINTER_REGNUM)
8195 return POINTER_REGS;
8196
8197 if (FP_REGNUM_P (regno))
8198 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
8199
8200 if (PR_REGNUM_P (regno))
8201 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8202
8203 return NO_REGS;
8204 }
8205
8206 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8207 If OFFSET is out of range, return an offset of an anchor point
8208 that is in range. Return 0 otherwise. */
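/* For example, an aligned SImode offset of 0x10080 yields the anchor
0x10000, leaving a residual of 0x80 that fits the scaled unsigned
12-bit range; an unaligned SImode offset of 0x103 yields 0x200,
leaving -0xfd, which fits the signed 9-bit range. */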
8209
8210 static HOST_WIDE_INT
8211 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8212 machine_mode mode)
8213 {
8214 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8215 if (size > 16)
8216 return (offset + 0x400) & ~0x7f0;
8217
8218 /* For offsets that aren't a multiple of the access size, the limit is
8219 -256...255. */
8220 if (offset & (size - 1))
8221 {
8222 /* BLKmode typically uses LDP of X-registers. */
8223 if (mode == BLKmode)
8224 return (offset + 512) & ~0x3ff;
8225 return (offset + 0x100) & ~0x1ff;
8226 }
8227
8228 /* Small negative offsets are supported. */
8229 if (IN_RANGE (offset, -256, 0))
8230 return 0;
8231
8232 if (mode == TImode || mode == TFmode)
8233 return (offset + 0x100) & ~0x1ff;
8234
8235 /* Otherwise use an unsigned 12-bit offset scaled by the access size. */
8236 return offset & (~0xfff * size);
8237 }
8238
8239 static rtx
8240 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8241 {
8242 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8243 where mask is selected by alignment and size of the offset.
8244 We try to pick as large a range for the offset as possible to
8245 maximize the chance of a CSE. However, for aligned addresses
8246 we limit the range to 4k so that structures with different sized
8247 elements are likely to use the same base. We need to be careful
8248 not to split a CONST for some forms of address expression, otherwise
8249 it will generate sub-optimal code. */
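/* For example, an SImode access at x0 + 0x13004 is best emitted as
add x1, x0, #0x10, lsl #12
ldr w2, [x1, #0x3004]
so that nearby accesses can reuse the anchor in x1 (the register
numbers here are purely illustrative). */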
8250
8251 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8252 {
8253 rtx base = XEXP (x, 0);
8254 rtx offset_rtx = XEXP (x, 1);
8255 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8256
8257 if (GET_CODE (base) == PLUS)
8258 {
8259 rtx op0 = XEXP (base, 0);
8260 rtx op1 = XEXP (base, 1);
8261
8262 /* Force any scaling into a temp for CSE. */
8263 op0 = force_reg (Pmode, op0);
8264 op1 = force_reg (Pmode, op1);
8265
8266 /* Let the pointer register be in op0. */
8267 if (REG_POINTER (op1))
8268 std::swap (op0, op1);
8269
8270 /* If the pointer is virtual or frame related, then we know that
8271 virtual register instantiation or register elimination is going
8272 to apply a second constant. We want the two constants folded
8273 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8274 if (virt_or_elim_regno_p (REGNO (op0)))
8275 {
8276 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8277 NULL_RTX, true, OPTAB_DIRECT);
8278 return gen_rtx_PLUS (Pmode, base, op1);
8279 }
8280
8281 /* Otherwise, in order to encourage CSE (and thence loop strength
8282 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8283 base = expand_binop (Pmode, add_optab, op0, op1,
8284 NULL_RTX, true, OPTAB_DIRECT);
8285 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8286 }
8287
8288 HOST_WIDE_INT size;
8289 if (GET_MODE_SIZE (mode).is_constant (&size))
8290 {
8291 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8292 mode);
8293 if (base_offset != 0)
8294 {
8295 base = plus_constant (Pmode, base, base_offset);
8296 base = force_operand (base, NULL_RTX);
8297 return plus_constant (Pmode, base, offset - base_offset);
8298 }
8299 }
8300 }
8301
8302 return x;
8303 }
8304
8305 static reg_class_t
8306 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8307 reg_class_t rclass,
8308 machine_mode mode,
8309 secondary_reload_info *sri)
8310 {
8311 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8312 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8313 comment at the head of aarch64-sve.md for more details about the
8314 big-endian handling. */
8315 if (BYTES_BIG_ENDIAN
8316 && reg_class_subset_p (rclass, FP_REGS)
8317 && !((REG_P (x) && HARD_REGISTER_P (x))
8318 || aarch64_simd_valid_immediate (x, NULL))
8319 && aarch64_sve_data_mode_p (mode))
8320 {
8321 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8322 return NO_REGS;
8323 }
8324
8325 /* If we have to disable direct literal pool loads and stores because the
8326 function is too big, then we need a scratch register. */
8327 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8328 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8329 || targetm.vector_mode_supported_p (GET_MODE (x)))
8330 && !aarch64_pcrelative_literal_loads)
8331 {
8332 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8333 return NO_REGS;
8334 }
8335
8336 /* Without the TARGET_SIMD instructions we cannot move a Q register
8337 to a Q register directly. We need a scratch. */
8338 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8339 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8340 && reg_class_subset_p (rclass, FP_REGS))
8341 {
8342 sri->icode = code_for_aarch64_reload_mov (mode);
8343 return NO_REGS;
8344 }
8345
8346 /* A TFmode or TImode memory access should be handled via an FP register
8347 because AArch64 has richer addressing modes for LDR/STR instructions
8348 than LDP/STP instructions. */
8349 if (TARGET_FLOAT && rclass == GENERAL_REGS
8350 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8351 return FP_REGS;
8352
8353 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8354 return GENERAL_REGS;
8355
8356 return NO_REGS;
8357 }
8358
8359 static bool
8360 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8361 {
8362 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8363
8364 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8365 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8366 if (frame_pointer_needed)
8367 return to == HARD_FRAME_POINTER_REGNUM;
8368 return true;
8369 }
8370
8371 poly_int64
8372 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8373 {
8374 if (to == HARD_FRAME_POINTER_REGNUM)
8375 {
8376 if (from == ARG_POINTER_REGNUM)
8377 return cfun->machine->frame.hard_fp_offset;
8378
8379 if (from == FRAME_POINTER_REGNUM)
8380 return cfun->machine->frame.hard_fp_offset
8381 - cfun->machine->frame.locals_offset;
8382 }
8383
8384 if (to == STACK_POINTER_REGNUM)
8385 {
8386 if (from == FRAME_POINTER_REGNUM)
8387 return cfun->machine->frame.frame_size
8388 - cfun->machine->frame.locals_offset;
8389 }
8390
8391 return cfun->machine->frame.frame_size;
8392 }
8393
8394 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8395 previous frame. */
8396
8397 rtx
8398 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8399 {
8400 if (count != 0)
8401 return const0_rtx;
8402 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8403 }
8404
8405
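/* Output the trampoline code template: two PC-relative loads that fetch
the target address and the static chain value from the data words
emitted after the code, followed by an indirect branch through IP1.
The data words are output as zeroes here and are filled in later by
aarch64_trampoline_init. */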
8406 static void
8407 aarch64_asm_trampoline_template (FILE *f)
8408 {
8409 int offset1 = 16;
8410 int offset2 = 20;
8411
8412 if (aarch64_bti_enabled ())
8413 {
8414 asm_fprintf (f, "\thint\t34 // bti c\n");
8415 offset1 -= 4;
8416 offset2 -= 4;
8417 }
8418
8419 if (TARGET_ILP32)
8420 {
8421 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8422 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8423 offset1);
8424 }
8425 else
8426 {
8427 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8428 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8429 offset2);
8430 }
8431 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8432
8433 /* The trampoline needs an extra padding instruction. If BTI is
8434 enabled, the padding instruction is replaced by the BTI instruction
8435 at the beginning. */
8436 if (!aarch64_bti_enabled ())
8437 assemble_aligned_integer (4, const0_rtx);
8438
8439 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8440 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8441 }
8442
8443 static void
8444 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8445 {
8446 rtx fnaddr, mem, a_tramp;
8447 const int tramp_code_sz = 16;
8448
8449 /* Don't need to copy the trailing D-words; we fill those in below. */
8450 emit_block_move (m_tramp, assemble_trampoline_template (),
8451 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8452 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8453 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8454 if (GET_MODE (fnaddr) != ptr_mode)
8455 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8456 emit_move_insn (mem, fnaddr);
8457
8458 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8459 emit_move_insn (mem, chain_value);
8460
8461 /* XXX We should really define a "clear_cache" pattern and use
8462 gen_clear_cache(). */
8463 a_tramp = XEXP (m_tramp, 0);
8464 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8465 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8466 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8467 ptr_mode);
8468 }
8469
8470 static unsigned char
8471 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8472 {
8473 /* ??? Logically we should only need to provide a value when
8474 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8475 can hold MODE, but at the moment we need to handle all modes.
8476 Just ignore any runtime parts for registers that can't store them. */
8477 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8478 unsigned int nregs;
8479 switch (regclass)
8480 {
8481 case TAILCALL_ADDR_REGS:
8482 case POINTER_REGS:
8483 case GENERAL_REGS:
8484 case ALL_REGS:
8485 case POINTER_AND_FP_REGS:
8486 case FP_REGS:
8487 case FP_LO_REGS:
8488 if (aarch64_sve_data_mode_p (mode)
8489 && constant_multiple_p (GET_MODE_SIZE (mode),
8490 BYTES_PER_SVE_VECTOR, &nregs))
8491 return nregs;
8492 return (aarch64_vector_data_mode_p (mode)
8493 ? CEIL (lowest_size, UNITS_PER_VREG)
8494 : CEIL (lowest_size, UNITS_PER_WORD));
8495 case STACK_REG:
8496 case PR_REGS:
8497 case PR_LO_REGS:
8498 case PR_HI_REGS:
8499 return 1;
8500
8501 case NO_REGS:
8502 return 0;
8503
8504 default:
8505 break;
8506 }
8507 gcc_unreachable ();
8508 }
8509
8510 static reg_class_t
8511 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8512 {
8513 if (regclass == POINTER_REGS)
8514 return GENERAL_REGS;
8515
8516 if (regclass == STACK_REG)
8517 {
8518 if (REG_P(x)
8519 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8520 return regclass;
8521
8522 return NO_REGS;
8523 }
8524
8525 /* Register elimination can result in a request for
8526 SP+constant->FP_REGS. We cannot support such operations, which
8527 use SP as source and an FP_REG as destination, so reject the
8528 request outright. */
8529 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8530 {
8531 rtx lhs = XEXP (x, 0);
8532
8533 /* Look through a possible SUBREG introduced by ILP32. */
8534 if (GET_CODE (lhs) == SUBREG)
8535 lhs = SUBREG_REG (lhs);
8536
8537 gcc_assert (REG_P (lhs));
8538 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8539 POINTER_REGS));
8540 return NO_REGS;
8541 }
8542
8543 return regclass;
8544 }
8545
8546 void
8547 aarch64_asm_output_labelref (FILE* f, const char *name)
8548 {
8549 asm_fprintf (f, "%U%s", name);
8550 }
8551
8552 static void
8553 aarch64_elf_asm_constructor (rtx symbol, int priority)
8554 {
8555 if (priority == DEFAULT_INIT_PRIORITY)
8556 default_ctor_section_asm_out_constructor (symbol, priority);
8557 else
8558 {
8559 section *s;
8560 /* Priority is known to be in the range [0, 65535], so 18 bytes
8561 would be enough, but the compiler might not know that. To avoid
8562 a -Wformat-truncation false positive, use a larger size. */
8563 char buf[23];
8564 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8565 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8566 switch_to_section (s);
8567 assemble_align (POINTER_SIZE);
8568 assemble_aligned_integer (POINTER_BYTES, symbol);
8569 }
8570 }
8571
8572 static void
8573 aarch64_elf_asm_destructor (rtx symbol, int priority)
8574 {
8575 if (priority == DEFAULT_INIT_PRIORITY)
8576 default_dtor_section_asm_out_destructor (symbol, priority);
8577 else
8578 {
8579 section *s;
8580 /* Priority is known to be in the range [0, 65535], so 18 bytes
8581 would be enough, but the compiler might not know that. To avoid
8582 a -Wformat-truncation false positive, use a larger size. */
8583 char buf[23];
8584 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8585 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8586 switch_to_section (s);
8587 assemble_align (POINTER_SIZE);
8588 assemble_aligned_integer (POINTER_BYTES, symbol);
8589 }
8590 }
8591
8592 const char*
8593 aarch64_output_casesi (rtx *operands)
8594 {
8595 char buf[100];
8596 char label[100];
8597 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8598 int index;
8599 static const char *const patterns[4][2] =
8600 {
8601 {
8602 "ldrb\t%w3, [%0,%w1,uxtw]",
8603 "add\t%3, %4, %w3, sxtb #2"
8604 },
8605 {
8606 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8607 "add\t%3, %4, %w3, sxth #2"
8608 },
8609 {
8610 "ldr\t%w3, [%0,%w1,uxtw #2]",
8611 "add\t%3, %4, %w3, sxtw #2"
8612 },
8613 /* We assume that DImode is only generated when not optimizing and
8614 that we don't really need 64-bit address offsets. That would
8615 imply an object file with 8GB of code in a single function! */
8616 {
8617 "ldr\t%w3, [%0,%w1,uxtw #2]",
8618 "add\t%3, %4, %w3, sxtw #2"
8619 }
8620 };
8621
8622 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8623
8624 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8625 index = exact_log2 (GET_MODE_SIZE (mode));
8626
8627 gcc_assert (index >= 0 && index <= 3);
8628
8629 /* Need to implement table size reduction, by changing the code below. */
8630 output_asm_insn (patterns[index][0], operands);
8631 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8632 snprintf (buf, sizeof (buf),
8633 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8634 output_asm_insn (buf, operands);
8635 output_asm_insn (patterns[index][1], operands);
8636 output_asm_insn ("br\t%3", operands);
8637 assemble_label (asm_out_file, label);
8638 return "";
8639 }
8640
8641
8642 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8643 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8644 operator. */
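/* For example, a MASK of 0x3fc with SHIFT == 2 returns 8 (a UXTB
extend), and a MASK of 0xffff with SHIFT == 0 returns 16 (UXTH);
combinations that match no extend return 0. */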
8645
8646 int
8647 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8648 {
8649 if (shift >= 0 && shift <= 3)
8650 {
8651 int size;
8652 for (size = 8; size <= 32; size *= 2)
8653 {
8654 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8655 if (mask == bits << shift)
8656 return size;
8657 }
8658 }
8659 return 0;
8660 }
8661
8662 /* Constant pools are per-function only when PC-relative
8663 literal loads are enabled or we are in the large memory
8664 model. */
8665
8666 static inline bool
8667 aarch64_can_use_per_function_literal_pools_p (void)
8668 {
8669 return (aarch64_pcrelative_literal_loads
8670 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8671 }
8672
8673 static bool
8674 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8675 {
8676 /* We can't use blocks for constants when we're using a per-function
8677 constant pool. */
8678 return !aarch64_can_use_per_function_literal_pools_p ();
8679 }
8680
8681 /* Select appropriate section for constants depending
8682 on where we place literal pools. */
8683
8684 static section *
8685 aarch64_select_rtx_section (machine_mode mode,
8686 rtx x,
8687 unsigned HOST_WIDE_INT align)
8688 {
8689 if (aarch64_can_use_per_function_literal_pools_p ())
8690 return function_section (current_function_decl);
8691
8692 return default_elf_select_rtx_section (mode, x, align);
8693 }
8694
8695 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8696 void
8697 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8698 HOST_WIDE_INT offset)
8699 {
8700 /* When using per-function literal pools, we must ensure that any code
8701 section is aligned to the minimal instruction length, lest we get
8702 errors from the assembler re "unaligned instructions". */
8703 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8704 ASM_OUTPUT_ALIGN (f, 2);
8705 }
8706
8707 /* Costs. */
8708
8709 /* Helper function for rtx cost calculation. Strip a shift expression
8710 from X. Returns the inner operand if successful, or the original
8711 expression on failure. */
8712 static rtx
8713 aarch64_strip_shift (rtx x)
8714 {
8715 rtx op = x;
8716
8717 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8718 we can convert both to ROR during final output. */
8719 if ((GET_CODE (op) == ASHIFT
8720 || GET_CODE (op) == ASHIFTRT
8721 || GET_CODE (op) == LSHIFTRT
8722 || GET_CODE (op) == ROTATERT
8723 || GET_CODE (op) == ROTATE)
8724 && CONST_INT_P (XEXP (op, 1)))
8725 return XEXP (op, 0);
8726
8727 if (GET_CODE (op) == MULT
8728 && CONST_INT_P (XEXP (op, 1))
8729 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8730 return XEXP (op, 0);
8731
8732 return x;
8733 }
8734
8735 /* Helper function for rtx cost calculation. Strip an extend
8736 expression from X. Returns the inner operand if successful, or the
8737 original expression on failure. We deal with a number of possible
8738 canonicalization variations here. If STRIP_SHIFT is true, then
8739 we can strip off a shift also. */
8740 static rtx
8741 aarch64_strip_extend (rtx x, bool strip_shift)
8742 {
8743 scalar_int_mode mode;
8744 rtx op = x;
8745
8746 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8747 return op;
8748
8749 /* Zero and sign extraction of a widened value. */
8750 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8751 && XEXP (op, 2) == const0_rtx
8752 && GET_CODE (XEXP (op, 0)) == MULT
8753 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8754 XEXP (op, 1)))
8755 return XEXP (XEXP (op, 0), 0);
8756
8757 /* It can also be represented (for zero-extend) as an AND with an
8758 immediate. */
8759 if (GET_CODE (op) == AND
8760 && GET_CODE (XEXP (op, 0)) == MULT
8761 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8762 && CONST_INT_P (XEXP (op, 1))
8763 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8764 INTVAL (XEXP (op, 1))) != 0)
8765 return XEXP (XEXP (op, 0), 0);
8766
8767 /* Now handle extended register, as this may also have an optional
8768 left shift by 1..4. */
8769 if (strip_shift
8770 && GET_CODE (op) == ASHIFT
8771 && CONST_INT_P (XEXP (op, 1))
8772 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8773 op = XEXP (op, 0);
8774
8775 if (GET_CODE (op) == ZERO_EXTEND
8776 || GET_CODE (op) == SIGN_EXTEND)
8777 op = XEXP (op, 0);
8778
8779 if (op != x)
8780 return op;
8781
8782 return x;
8783 }
8784
8785 /* Return true iff CODE is a shift supported in combination
8786 with arithmetic instructions. */
8787
8788 static bool
8789 aarch64_shift_p (enum rtx_code code)
8790 {
8791 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8792 }
8793
8794
8795 /* Return true iff X is a cheap shift without a sign extend. */
8796
8797 static bool
8798 aarch64_cheap_mult_shift_p (rtx x)
8799 {
8800 rtx op0, op1;
8801
8802 op0 = XEXP (x, 0);
8803 op1 = XEXP (x, 1);
8804
8805 if (!(aarch64_tune_params.extra_tuning_flags
8806 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8807 return false;
8808
8809 if (GET_CODE (op0) == SIGN_EXTEND)
8810 return false;
8811
8812 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8813 && UINTVAL (op1) <= 4)
8814 return true;
8815
8816 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8817 return false;
8818
8819 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8820
8821 if (l2 > 0 && l2 <= 4)
8822 return true;
8823
8824 return false;
8825 }
8826
8827 /* Helper function for rtx cost calculation. Calculate the cost of
8828 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8829 Return the calculated cost of the expression, recursing manually into
8830 operands where needed. */
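/* For example, (mult (reg X) (const_int 8)) appearing under a PLUS is
   treated as an ADD with a shift-by-immediate operand (alu.arith_shift
   when optimizing for speed) rather than as a true multiply, unless the
   tuning marks such shifts as free.  */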
8831
8832 static int
8833 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8834 {
8835 rtx op0, op1;
8836 const struct cpu_cost_table *extra_cost
8837 = aarch64_tune_params.insn_extra_cost;
8838 int cost = 0;
8839 bool compound_p = (outer == PLUS || outer == MINUS);
8840 machine_mode mode = GET_MODE (x);
8841
8842 gcc_checking_assert (code == MULT);
8843
8844 op0 = XEXP (x, 0);
8845 op1 = XEXP (x, 1);
8846
8847 if (VECTOR_MODE_P (mode))
8848 mode = GET_MODE_INNER (mode);
8849
8850 /* Integer multiply/fma. */
8851 if (GET_MODE_CLASS (mode) == MODE_INT)
8852 {
8853 /* The multiply will be canonicalized as a shift, so cost it as such. */
8854 if (aarch64_shift_p (GET_CODE (x))
8855 || (CONST_INT_P (op1)
8856 && exact_log2 (INTVAL (op1)) > 0))
8857 {
8858 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8859 || GET_CODE (op0) == SIGN_EXTEND;
8860 if (speed)
8861 {
8862 if (compound_p)
8863 {
8864 /* If the shift is considered cheap,
8865 then don't add any cost. */
8866 if (aarch64_cheap_mult_shift_p (x))
8867 ;
8868 else if (REG_P (op1))
8869 /* ARITH + shift-by-register. */
8870 cost += extra_cost->alu.arith_shift_reg;
8871 else if (is_extend)
8872 /* ARITH + extended register. We don't have a cost field
8873 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8874 cost += extra_cost->alu.extend_arith;
8875 else
8876 /* ARITH + shift-by-immediate. */
8877 cost += extra_cost->alu.arith_shift;
8878 }
8879 else
8880 /* LSL (immediate). */
8881 cost += extra_cost->alu.shift;
8882
8883 }
8884 /* Strip extends as we will have costed them in the case above. */
8885 if (is_extend)
8886 op0 = aarch64_strip_extend (op0, true);
8887
8888 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8889
8890 return cost;
8891 }
8892
8893 /* MNEG or [US]MNEGL. Extract the NEG operand, indicate that it's a
8894 compound operation, and let the cases below handle it. After all,
8895 MNEG is a special-case alias of MSUB. */
8896 if (GET_CODE (op0) == NEG)
8897 {
8898 op0 = XEXP (op0, 0);
8899 compound_p = true;
8900 }
8901
8902 /* Integer multiplies or FMAs have zero/sign extending variants. */
8903 if ((GET_CODE (op0) == ZERO_EXTEND
8904 && GET_CODE (op1) == ZERO_EXTEND)
8905 || (GET_CODE (op0) == SIGN_EXTEND
8906 && GET_CODE (op1) == SIGN_EXTEND))
8907 {
8908 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8909 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8910
8911 if (speed)
8912 {
8913 if (compound_p)
8914 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8915 cost += extra_cost->mult[0].extend_add;
8916 else
8917 /* MUL/SMULL/UMULL. */
8918 cost += extra_cost->mult[0].extend;
8919 }
8920
8921 return cost;
8922 }
8923
8924 /* This is either an integer multiply or a MADD. In both cases
8925 we want to recurse and cost the operands. */
8926 cost += rtx_cost (op0, mode, MULT, 0, speed);
8927 cost += rtx_cost (op1, mode, MULT, 1, speed);
8928
8929 if (speed)
8930 {
8931 if (compound_p)
8932 /* MADD/MSUB. */
8933 cost += extra_cost->mult[mode == DImode].add;
8934 else
8935 /* MUL. */
8936 cost += extra_cost->mult[mode == DImode].simple;
8937 }
8938
8939 return cost;
8940 }
8941 else
8942 {
8943 if (speed)
8944 {
8945 /* Floating-point FMA/FMUL can also support negations of the
8946 operands, unless the rounding mode is upward or downward, in
8947 which case FNMUL differs from FMUL with operand negation. */
8948 bool neg0 = GET_CODE (op0) == NEG;
8949 bool neg1 = GET_CODE (op1) == NEG;
8950 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8951 {
8952 if (neg0)
8953 op0 = XEXP (op0, 0);
8954 if (neg1)
8955 op1 = XEXP (op1, 0);
8956 }
8957
8958 if (compound_p)
8959 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8960 cost += extra_cost->fp[mode == DFmode].fma;
8961 else
8962 /* FMUL/FNMUL. */
8963 cost += extra_cost->fp[mode == DFmode].mult;
8964 }
8965
8966 cost += rtx_cost (op0, mode, MULT, 0, speed);
8967 cost += rtx_cost (op1, mode, MULT, 1, speed);
8968 return cost;
8969 }
8970 }
8971
8972 static int
8973 aarch64_address_cost (rtx x,
8974 machine_mode mode,
8975 addr_space_t as ATTRIBUTE_UNUSED,
8976 bool speed)
8977 {
8978 enum rtx_code c = GET_CODE (x);
8979 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8980 struct aarch64_address_info info;
8981 int cost = 0;
8982 info.shift = 0;
8983
8984 if (!aarch64_classify_address (&info, x, mode, false))
8985 {
8986 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8987 {
8988 /* This is a CONST or SYMBOL ref which will be split
8989 in a different way depending on the code model in use.
8990 Cost it through the generic infrastructure. */
8991 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8992 /* Divide through by the cost of one instruction to
8993 bring it to the same units as the address costs. */
8994 cost_symbol_ref /= COSTS_N_INSNS (1);
8995 /* The cost is then the cost of preparing the address,
8996 followed by an immediate (possibly 0) offset. */
8997 return cost_symbol_ref + addr_cost->imm_offset;
8998 }
8999 else
9000 {
9001 /* This is most likely a jump table from a case
9002 statement. */
9003 return addr_cost->register_offset;
9004 }
9005 }
9006
9007 switch (info.type)
9008 {
9009 case ADDRESS_LO_SUM:
9010 case ADDRESS_SYMBOLIC:
9011 case ADDRESS_REG_IMM:
9012 cost += addr_cost->imm_offset;
9013 break;
9014
9015 case ADDRESS_REG_WB:
9016 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9017 cost += addr_cost->pre_modify;
9018 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9019 cost += addr_cost->post_modify;
9020 else
9021 gcc_unreachable ();
9022
9023 break;
9024
9025 case ADDRESS_REG_REG:
9026 cost += addr_cost->register_offset;
9027 break;
9028
9029 case ADDRESS_REG_SXTW:
9030 cost += addr_cost->register_sextend;
9031 break;
9032
9033 case ADDRESS_REG_UXTW:
9034 cost += addr_cost->register_zextend;
9035 break;
9036
9037 default:
9038 gcc_unreachable ();
9039 }
9040
9041
9042 if (info.shift > 0)
9043 {
9044 /* For the sake of calculating the cost of the shifted register
9045 component, we can treat same sized modes in the same way. */
9046 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9047 cost += addr_cost->addr_scale_costs.hi;
9048 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9049 cost += addr_cost->addr_scale_costs.si;
9050 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9051 cost += addr_cost->addr_scale_costs.di;
9052 else
9053 /* We can't tell, or this is a 128-bit vector. */
9054 cost += addr_cost->addr_scale_costs.ti;
9055 }
9056
9057 return cost;
9058 }
9059
9060 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9061 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9062 to be taken. */
9063
9064 int
9065 aarch64_branch_cost (bool speed_p, bool predictable_p)
9066 {
9067 /* When optimizing for speed, use the cost of unpredictable branches. */
9068 const struct cpu_branch_cost *branch_costs =
9069 aarch64_tune_params.branch_costs;
9070
9071 if (!speed_p || predictable_p)
9072 return branch_costs->predictable;
9073 else
9074 return branch_costs->unpredictable;
9075 }
9076
9077 /* Return true if the RTX X in mode MODE is a zero or sign extract
9078 usable in an ADD or SUB (extended register) instruction. */
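/* For example, (zero_extend:DI (reg:SI X)) matches the plain
   extended-register form (e.g. ADD Xd, Xn, Wm, uxtw), while the
   [SIGN|ZERO]_EXTRACT form covers the extract-based representation of an
   extend combined with a left shift by a small constant.  */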
9079 static bool
9080 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9081 {
9082 /* Catch add with a sign extract.
9083 This is add_<optab><mode>_multp2. */
9084 if (GET_CODE (x) == SIGN_EXTRACT
9085 || GET_CODE (x) == ZERO_EXTRACT)
9086 {
9087 rtx op0 = XEXP (x, 0);
9088 rtx op1 = XEXP (x, 1);
9089 rtx op2 = XEXP (x, 2);
9090
9091 if (GET_CODE (op0) == MULT
9092 && CONST_INT_P (op1)
9093 && op2 == const0_rtx
9094 && CONST_INT_P (XEXP (op0, 1))
9095 && aarch64_is_extend_from_extract (mode,
9096 XEXP (op0, 1),
9097 op1))
9098 {
9099 return true;
9100 }
9101 }
9102 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9103 No shift. */
9104 else if (GET_CODE (x) == SIGN_EXTEND
9105 || GET_CODE (x) == ZERO_EXTEND)
9106 return REG_P (XEXP (x, 0));
9107
9108 return false;
9109 }
9110
9111 static bool
9112 aarch64_frint_unspec_p (unsigned int u)
9113 {
9114 switch (u)
9115 {
9116 case UNSPEC_FRINTZ:
9117 case UNSPEC_FRINTP:
9118 case UNSPEC_FRINTM:
9119 case UNSPEC_FRINTA:
9120 case UNSPEC_FRINTN:
9121 case UNSPEC_FRINTX:
9122 case UNSPEC_FRINTI:
9123 return true;
9124
9125 default:
9126 return false;
9127 }
9128 }
9129
9130 /* Return true iff X is an rtx that will match an extr instruction
9131 i.e. as described in the *extr<mode>5_insn family of patterns.
9132 OP0 and OP1 will be set to the operands of the shifts involved
9133 on success and will be NULL_RTX otherwise. */
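/* For example, in DImode
     (ior (ashift (reg X) (const_int 16)) (lshiftrt (reg Y) (const_int 48)))
   matches because the two shift amounts sum to 64; *RES_OP0 is set to X
   and *RES_OP1 to Y.  */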
9134
9135 static bool
9136 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9137 {
9138 rtx op0, op1;
9139 scalar_int_mode mode;
9140 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9141 return false;
9142
9143 *res_op0 = NULL_RTX;
9144 *res_op1 = NULL_RTX;
9145
9146 if (GET_CODE (x) != IOR)
9147 return false;
9148
9149 op0 = XEXP (x, 0);
9150 op1 = XEXP (x, 1);
9151
9152 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9153 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9154 {
9155 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9156 if (GET_CODE (op1) == ASHIFT)
9157 std::swap (op0, op1);
9158
9159 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9160 return false;
9161
9162 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9163 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9164
9165 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9166 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9167 {
9168 *res_op0 = XEXP (op0, 0);
9169 *res_op1 = XEXP (op1, 0);
9170 return true;
9171 }
9172 }
9173
9174 return false;
9175 }
9176
9177 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9178 storing it in *COST. Result is true if the total cost of the operation
9179 has now been calculated. */
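/* For example, (if_then_else (ne (reg X) (const_int 0)) (label_ref L) (pc))
   is a CBNZ-style branch, so only the cost of X needs to be added, whereas
   an IF_THEN_ELSE whose condition operates on the CC register and whose
   arms are register values is costed as one of the CSEL variants.  */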
9180 static bool
9181 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9182 {
9183 rtx inner;
9184 rtx comparator;
9185 enum rtx_code cmpcode;
9186
9187 if (COMPARISON_P (op0))
9188 {
9189 inner = XEXP (op0, 0);
9190 comparator = XEXP (op0, 1);
9191 cmpcode = GET_CODE (op0);
9192 }
9193 else
9194 {
9195 inner = op0;
9196 comparator = const0_rtx;
9197 cmpcode = NE;
9198 }
9199
9200 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9201 {
9202 /* Conditional branch. */
9203 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9204 return true;
9205 else
9206 {
9207 if (cmpcode == NE || cmpcode == EQ)
9208 {
9209 if (comparator == const0_rtx)
9210 {
9211 /* TBZ/TBNZ/CBZ/CBNZ. */
9212 if (GET_CODE (inner) == ZERO_EXTRACT)
9213 /* TBZ/TBNZ. */
9214 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9215 ZERO_EXTRACT, 0, speed);
9216 else
9217 /* CBZ/CBNZ. */
9218 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9219
9220 return true;
9221 }
9222 }
9223 else if (cmpcode == LT || cmpcode == GE)
9224 {
9225 /* TBZ/TBNZ. */
9226 if (comparator == const0_rtx)
9227 return true;
9228 }
9229 }
9230 }
9231 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9232 {
9233 /* CCMP. */
9234 if (GET_CODE (op1) == COMPARE)
9235 {
9236 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9237 if (XEXP (op1, 1) == const0_rtx)
9238 *cost += 1;
9239 if (speed)
9240 {
9241 machine_mode mode = GET_MODE (XEXP (op1, 0));
9242 const struct cpu_cost_table *extra_cost
9243 = aarch64_tune_params.insn_extra_cost;
9244
9245 if (GET_MODE_CLASS (mode) == MODE_INT)
9246 *cost += extra_cost->alu.arith;
9247 else
9248 *cost += extra_cost->fp[mode == DFmode].compare;
9249 }
9250 return true;
9251 }
9252
9253 /* It's a conditional operation based on the status flags,
9254 so it must be some flavor of CSEL. */
9255
9256 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9257 if (GET_CODE (op1) == NEG
9258 || GET_CODE (op1) == NOT
9259 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9260 op1 = XEXP (op1, 0);
9261 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9262 {
9263 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9264 op1 = XEXP (op1, 0);
9265 op2 = XEXP (op2, 0);
9266 }
9267
9268 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9269 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9270 return true;
9271 }
9272
9273 /* We don't know what this is, so cost all operands. */
9274 return false;
9275 }
9276
9277 /* Check whether X is a bitfield operation of the form shift + extend that
9278 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9279 operand to which the bitfield operation is applied. Otherwise return
9280 NULL_RTX. */
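/* For example, (zero_extend:SI (lshiftrt:QI (reg X) (const_int 2)))
   corresponds to a UBFX extracting bits 2..7 of X, so X is returned.  */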
9281
9282 static rtx
9283 aarch64_extend_bitfield_pattern_p (rtx x)
9284 {
9285 rtx_code outer_code = GET_CODE (x);
9286 machine_mode outer_mode = GET_MODE (x);
9287
9288 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9289 && outer_mode != SImode && outer_mode != DImode)
9290 return NULL_RTX;
9291
9292 rtx inner = XEXP (x, 0);
9293 rtx_code inner_code = GET_CODE (inner);
9294 machine_mode inner_mode = GET_MODE (inner);
9295 rtx op = NULL_RTX;
9296
9297 switch (inner_code)
9298 {
9299 case ASHIFT:
9300 if (CONST_INT_P (XEXP (inner, 1))
9301 && (inner_mode == QImode || inner_mode == HImode))
9302 op = XEXP (inner, 0);
9303 break;
9304 case LSHIFTRT:
9305 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9306 && (inner_mode == QImode || inner_mode == HImode))
9307 op = XEXP (inner, 0);
9308 break;
9309 case ASHIFTRT:
9310 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9311 && (inner_mode == QImode || inner_mode == HImode))
9312 op = XEXP (inner, 0);
9313 break;
9314 default:
9315 break;
9316 }
9317
9318 return op;
9319 }
9320
9321 /* Return true if the mask and a shift amount from an RTX of the form
9322 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9323 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
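/* For example, with MODE == SImode, MASK == 0x1f8 and SHFT_AMNT == 3 the
   predicate holds: 0x1f8 >> 3 is 0x3f, 0x3f + 1 is a power of two, and no
   set bits of the mask lie below the shift amount.  */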
9324
9325 bool
9326 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9327 rtx shft_amnt)
9328 {
9329 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9330 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9331 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9332 && (INTVAL (mask)
9333 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9334 }
9335
9336 /* Calculate the cost of calculating X, storing it in *COST. Result
9337 is true if the total cost of the operation has now been calculated. */
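/* (Returning false tells the generic rtx_cost machinery to recurse into
   the operands and add their costs to *COST on top of whatever has been
   accumulated here.)  */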
9338 static bool
9339 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9340 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9341 {
9342 rtx op0, op1, op2;
9343 const struct cpu_cost_table *extra_cost
9344 = aarch64_tune_params.insn_extra_cost;
9345 int code = GET_CODE (x);
9346 scalar_int_mode int_mode;
9347
9348 /* By default, assume that everything has equivalent cost to the
9349 cheapest instruction. Any additional costs are applied as a delta
9350 above this default. */
9351 *cost = COSTS_N_INSNS (1);
9352
9353 switch (code)
9354 {
9355 case SET:
9356 /* The cost depends entirely on the operands to SET. */
9357 *cost = 0;
9358 op0 = SET_DEST (x);
9359 op1 = SET_SRC (x);
9360
9361 switch (GET_CODE (op0))
9362 {
9363 case MEM:
9364 if (speed)
9365 {
9366 rtx address = XEXP (op0, 0);
9367 if (VECTOR_MODE_P (mode))
9368 *cost += extra_cost->ldst.storev;
9369 else if (GET_MODE_CLASS (mode) == MODE_INT)
9370 *cost += extra_cost->ldst.store;
9371 else if (mode == SFmode)
9372 *cost += extra_cost->ldst.storef;
9373 else if (mode == DFmode)
9374 *cost += extra_cost->ldst.stored;
9375
9376 *cost +=
9377 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9378 0, speed));
9379 }
9380
9381 *cost += rtx_cost (op1, mode, SET, 1, speed);
9382 return true;
9383
9384 case SUBREG:
9385 if (! REG_P (SUBREG_REG (op0)))
9386 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9387
9388 /* Fall through. */
9389 case REG:
9390 /* The cost is one per vector-register copied. */
9391 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9392 {
9393 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9394 *cost = COSTS_N_INSNS (nregs);
9395 }
9396 /* const0_rtx is in general free, but we will use an
9397 instruction to set a register to 0. */
9398 else if (REG_P (op1) || op1 == const0_rtx)
9399 {
9400 /* The cost is 1 per register copied. */
9401 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9402 *cost = COSTS_N_INSNS (nregs);
9403 }
9404 else
9405 /* Cost is just the cost of the RHS of the set. */
9406 *cost += rtx_cost (op1, mode, SET, 1, speed);
9407 return true;
9408
9409 case ZERO_EXTRACT:
9410 case SIGN_EXTRACT:
9411 /* Bit-field insertion. Strip any redundant widening of
9412 the RHS to meet the width of the target. */
9413 if (GET_CODE (op1) == SUBREG)
9414 op1 = SUBREG_REG (op1);
9415 if ((GET_CODE (op1) == ZERO_EXTEND
9416 || GET_CODE (op1) == SIGN_EXTEND)
9417 && CONST_INT_P (XEXP (op0, 1))
9418 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9419 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9420 op1 = XEXP (op1, 0);
9421
9422 if (CONST_INT_P (op1))
9423 {
9424 /* MOV immediate is assumed to always be cheap. */
9425 *cost = COSTS_N_INSNS (1);
9426 }
9427 else
9428 {
9429 /* BFM. */
9430 if (speed)
9431 *cost += extra_cost->alu.bfi;
9432 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9433 }
9434
9435 return true;
9436
9437 default:
9438 /* We can't make sense of this, assume default cost. */
9439 *cost = COSTS_N_INSNS (1);
9440 return false;
9441 }
9442 return false;
9443
9444 case CONST_INT:
9445 /* If an instruction can incorporate a constant within the
9446 instruction, the instruction's expression avoids calling
9447 rtx_cost() on the constant. If rtx_cost() is called on a
9448 constant, then it is usually because the constant must be
9449 moved into a register by one or more instructions.
9450
9451 The exception is constant 0, which can be expressed
9452 as XZR/WZR and is therefore free. The exception to this is
9453 if we have (set (reg) (const0_rtx)) in which case we must cost
9454 the move. However, we can catch that when we cost the SET, so
9455 we don't need to consider that here. */
9456 if (x == const0_rtx)
9457 *cost = 0;
9458 else
9459 {
9460 /* To an approximation, the cost of building any other constant
9461 is proportional to the number of instructions required to
9462 build that constant. This is true whether we
9463 are compiling for SPEED or otherwise. */
9464 if (!is_a <scalar_int_mode> (mode, &int_mode))
9465 int_mode = word_mode;
9466 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9467 (NULL_RTX, x, false, int_mode));
9468 }
9469 return true;
9470
9471 case CONST_DOUBLE:
9472
9473 /* First determine number of instructions to do the move
9474 as an integer constant. */
9475 if (!aarch64_float_const_representable_p (x)
9476 && !aarch64_can_const_movi_rtx_p (x, mode)
9477 && aarch64_float_const_rtx_p (x))
9478 {
9479 unsigned HOST_WIDE_INT ival;
9480 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9481 gcc_assert (succeed);
9482
9483 scalar_int_mode imode = (mode == HFmode
9484 ? SImode
9485 : int_mode_for_mode (mode).require ());
9486 int ncost = aarch64_internal_mov_immediate
9487 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9488 *cost += COSTS_N_INSNS (ncost);
9489 return true;
9490 }
9491
9492 if (speed)
9493 {
9494 /* mov[df,sf]_aarch64. */
9495 if (aarch64_float_const_representable_p (x))
9496 /* FMOV (scalar immediate). */
9497 *cost += extra_cost->fp[mode == DFmode].fpconst;
9498 else if (!aarch64_float_const_zero_rtx_p (x))
9499 {
9500 /* This will be a load from memory. */
9501 if (mode == DFmode)
9502 *cost += extra_cost->ldst.loadd;
9503 else
9504 *cost += extra_cost->ldst.loadf;
9505 }
9506 else
9507 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9508 or MOV v0.s[0], wzr - neither of which is modeled by the
9509 cost tables. Just use the default cost. */
9510 {
9511 }
9512 }
9513
9514 return true;
9515
9516 case MEM:
9517 if (speed)
9518 {
9519 /* For loads we want the base cost of a load, plus an
9520 approximation for the additional cost of the addressing
9521 mode. */
9522 rtx address = XEXP (x, 0);
9523 if (VECTOR_MODE_P (mode))
9524 *cost += extra_cost->ldst.loadv;
9525 else if (GET_MODE_CLASS (mode) == MODE_INT)
9526 *cost += extra_cost->ldst.load;
9527 else if (mode == SFmode)
9528 *cost += extra_cost->ldst.loadf;
9529 else if (mode == DFmode)
9530 *cost += extra_cost->ldst.loadd;
9531
9532 *cost +=
9533 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9534 0, speed));
9535 }
9536
9537 return true;
9538
9539 case NEG:
9540 op0 = XEXP (x, 0);
9541
9542 if (VECTOR_MODE_P (mode))
9543 {
9544 if (speed)
9545 {
9546 /* FNEG. */
9547 *cost += extra_cost->vect.alu;
9548 }
9549 return false;
9550 }
9551
9552 if (GET_MODE_CLASS (mode) == MODE_INT)
9553 {
9554 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9555 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9556 {
9557 /* CSETM. */
9558 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9559 return true;
9560 }
9561
9562 /* Cost this as SUB wzr, X. */
9563 op0 = CONST0_RTX (mode);
9564 op1 = XEXP (x, 0);
9565 goto cost_minus;
9566 }
9567
9568 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9569 {
9570 /* Support (neg (fma ...)) as a single instruction only if the
9571 sign of zeros is unimportant. This matches the decision-making
9572 in aarch64.md. */
9573 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9574 {
9575 /* FNMADD. */
9576 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9577 return true;
9578 }
9579 if (GET_CODE (op0) == MULT)
9580 {
9581 /* FNMUL. */
9582 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9583 return true;
9584 }
9585 if (speed)
9586 /* FNEG. */
9587 *cost += extra_cost->fp[mode == DFmode].neg;
9588 return false;
9589 }
9590
9591 return false;
9592
9593 case CLRSB:
9594 case CLZ:
9595 if (speed)
9596 {
9597 if (VECTOR_MODE_P (mode))
9598 *cost += extra_cost->vect.alu;
9599 else
9600 *cost += extra_cost->alu.clz;
9601 }
9602
9603 return false;
9604
9605 case COMPARE:
9606 op0 = XEXP (x, 0);
9607 op1 = XEXP (x, 1);
9608
9609 if (op1 == const0_rtx
9610 && GET_CODE (op0) == AND)
9611 {
9612 x = op0;
9613 mode = GET_MODE (op0);
9614 goto cost_logic;
9615 }
9616
9617 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9618 {
9619 /* TODO: A write to the CC flags possibly costs extra; this
9620 needs encoding in the cost tables. */
9621
9622 mode = GET_MODE (op0);
9623 /* ANDS. */
9624 if (GET_CODE (op0) == AND)
9625 {
9626 x = op0;
9627 goto cost_logic;
9628 }
9629
9630 if (GET_CODE (op0) == PLUS)
9631 {
9632 /* ADDS (and CMN alias). */
9633 x = op0;
9634 goto cost_plus;
9635 }
9636
9637 if (GET_CODE (op0) == MINUS)
9638 {
9639 /* SUBS. */
9640 x = op0;
9641 goto cost_minus;
9642 }
9643
9644 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9645 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9646 && CONST_INT_P (XEXP (op0, 2)))
9647 {
9648 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9649 Handle it here directly rather than going to cost_logic
9650 since we know the immediate generated for the TST is valid
9651 so we can avoid creating an intermediate rtx for it only
9652 for costing purposes. */
9653 if (speed)
9654 *cost += extra_cost->alu.logical;
9655
9656 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9657 ZERO_EXTRACT, 0, speed);
9658 return true;
9659 }
9660
9661 if (GET_CODE (op1) == NEG)
9662 {
9663 /* CMN. */
9664 if (speed)
9665 *cost += extra_cost->alu.arith;
9666
9667 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9668 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9669 return true;
9670 }
9671
9672 /* CMP.
9673
9674 Compare can freely swap the order of operands, and
9675 canonicalization puts the more complex operation first.
9676 But the integer MINUS logic expects the shift/extend
9677 operation in op1. */
9678 if (! (REG_P (op0)
9679 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9680 {
9681 op0 = XEXP (x, 1);
9682 op1 = XEXP (x, 0);
9683 }
9684 goto cost_minus;
9685 }
9686
9687 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9688 {
9689 /* FCMP. */
9690 if (speed)
9691 *cost += extra_cost->fp[mode == DFmode].compare;
9692
9693 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9694 {
9695 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9696 /* FCMP supports constant 0.0 for no extra cost. */
9697 return true;
9698 }
9699 return false;
9700 }
9701
9702 if (VECTOR_MODE_P (mode))
9703 {
9704 /* Vector compare. */
9705 if (speed)
9706 *cost += extra_cost->vect.alu;
9707
9708 if (aarch64_float_const_zero_rtx_p (op1))
9709 {
9710 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9711 cost. */
9712 return true;
9713 }
9714 return false;
9715 }
9716 return false;
9717
9718 case MINUS:
9719 {
9720 op0 = XEXP (x, 0);
9721 op1 = XEXP (x, 1);
9722
9723 cost_minus:
9724 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9725
9726 /* Detect valid immediates. */
9727 if ((GET_MODE_CLASS (mode) == MODE_INT
9728 || (GET_MODE_CLASS (mode) == MODE_CC
9729 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9730 && CONST_INT_P (op1)
9731 && aarch64_uimm12_shift (INTVAL (op1)))
9732 {
9733 if (speed)
9734 /* SUB(S) (immediate). */
9735 *cost += extra_cost->alu.arith;
9736 return true;
9737 }
9738
9739 /* Look for SUB (extended register). */
9740 if (is_a <scalar_int_mode> (mode, &int_mode)
9741 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9742 {
9743 if (speed)
9744 *cost += extra_cost->alu.extend_arith;
9745
9746 op1 = aarch64_strip_extend (op1, true);
9747 *cost += rtx_cost (op1, VOIDmode,
9748 (enum rtx_code) GET_CODE (op1), 0, speed);
9749 return true;
9750 }
9751
9752 rtx new_op1 = aarch64_strip_extend (op1, false);
9753
9754 /* Cost this as an FMA-alike operation. */
9755 if ((GET_CODE (new_op1) == MULT
9756 || aarch64_shift_p (GET_CODE (new_op1)))
9757 && code != COMPARE)
9758 {
9759 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9760 (enum rtx_code) code,
9761 speed);
9762 return true;
9763 }
9764
9765 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9766
9767 if (speed)
9768 {
9769 if (VECTOR_MODE_P (mode))
9770 {
9771 /* Vector SUB. */
9772 *cost += extra_cost->vect.alu;
9773 }
9774 else if (GET_MODE_CLASS (mode) == MODE_INT)
9775 {
9776 /* SUB(S). */
9777 *cost += extra_cost->alu.arith;
9778 }
9779 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9780 {
9781 /* FSUB. */
9782 *cost += extra_cost->fp[mode == DFmode].addsub;
9783 }
9784 }
9785 return true;
9786 }
9787
9788 case PLUS:
9789 {
9790 rtx new_op0;
9791
9792 op0 = XEXP (x, 0);
9793 op1 = XEXP (x, 1);
9794
9795 cost_plus:
9796 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9797 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9798 {
9799 /* CSINC. */
9800 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9801 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9802 return true;
9803 }
9804
9805 if (GET_MODE_CLASS (mode) == MODE_INT
9806 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9807 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9808 {
9809 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9810
9811 if (speed)
9812 /* ADD (immediate). */
9813 *cost += extra_cost->alu.arith;
9814 return true;
9815 }
9816
9817 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9818
9819 /* Look for ADD (extended register). */
9820 if (is_a <scalar_int_mode> (mode, &int_mode)
9821 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9822 {
9823 if (speed)
9824 *cost += extra_cost->alu.extend_arith;
9825
9826 op0 = aarch64_strip_extend (op0, true);
9827 *cost += rtx_cost (op0, VOIDmode,
9828 (enum rtx_code) GET_CODE (op0), 0, speed);
9829 return true;
9830 }
9831
9832 /* Strip any extend, leave shifts behind as we will
9833 cost them through mult_cost. */
9834 new_op0 = aarch64_strip_extend (op0, false);
9835
9836 if (GET_CODE (new_op0) == MULT
9837 || aarch64_shift_p (GET_CODE (new_op0)))
9838 {
9839 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9840 speed);
9841 return true;
9842 }
9843
9844 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9845
9846 if (speed)
9847 {
9848 if (VECTOR_MODE_P (mode))
9849 {
9850 /* Vector ADD. */
9851 *cost += extra_cost->vect.alu;
9852 }
9853 else if (GET_MODE_CLASS (mode) == MODE_INT)
9854 {
9855 /* ADD. */
9856 *cost += extra_cost->alu.arith;
9857 }
9858 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9859 {
9860 /* FADD. */
9861 *cost += extra_cost->fp[mode == DFmode].addsub;
9862 }
9863 }
9864 return true;
9865 }
9866
9867 case BSWAP:
9868 *cost = COSTS_N_INSNS (1);
9869
9870 if (speed)
9871 {
9872 if (VECTOR_MODE_P (mode))
9873 *cost += extra_cost->vect.alu;
9874 else
9875 *cost += extra_cost->alu.rev;
9876 }
9877 return false;
9878
9879 case IOR:
9880 if (aarch_rev16_p (x))
9881 {
9882 *cost = COSTS_N_INSNS (1);
9883
9884 if (speed)
9885 {
9886 if (VECTOR_MODE_P (mode))
9887 *cost += extra_cost->vect.alu;
9888 else
9889 *cost += extra_cost->alu.rev;
9890 }
9891 return true;
9892 }
9893
9894 if (aarch64_extr_rtx_p (x, &op0, &op1))
9895 {
9896 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9897 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9898 if (speed)
9899 *cost += extra_cost->alu.shift;
9900
9901 return true;
9902 }
9903 /* Fall through. */
9904 case XOR:
9905 case AND:
9906 cost_logic:
9907 op0 = XEXP (x, 0);
9908 op1 = XEXP (x, 1);
9909
9910 if (VECTOR_MODE_P (mode))
9911 {
9912 if (speed)
9913 *cost += extra_cost->vect.alu;
9914 return true;
9915 }
9916
9917 if (code == AND
9918 && GET_CODE (op0) == MULT
9919 && CONST_INT_P (XEXP (op0, 1))
9920 && CONST_INT_P (op1)
9921 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9922 INTVAL (op1)) != 0)
9923 {
9924 /* This is a UBFM/SBFM. */
9925 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9926 if (speed)
9927 *cost += extra_cost->alu.bfx;
9928 return true;
9929 }
9930
9931 if (is_int_mode (mode, &int_mode))
9932 {
9933 if (CONST_INT_P (op1))
9934 {
9935 /* We have a mask + shift version of a UBFIZ
9936 i.e. the *andim_ashift<mode>_bfiz pattern. */
9937 if (GET_CODE (op0) == ASHIFT
9938 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9939 XEXP (op0, 1)))
9940 {
9941 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9942 (enum rtx_code) code, 0, speed);
9943 if (speed)
9944 *cost += extra_cost->alu.bfx;
9945
9946 return true;
9947 }
9948 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9949 {
9950 /* We possibly get the immediate for free; this is not
9951 modelled. */
9952 *cost += rtx_cost (op0, int_mode,
9953 (enum rtx_code) code, 0, speed);
9954 if (speed)
9955 *cost += extra_cost->alu.logical;
9956
9957 return true;
9958 }
9959 }
9960 else
9961 {
9962 rtx new_op0 = op0;
9963
9964 /* Handle ORN, EON, or BIC. */
9965 if (GET_CODE (op0) == NOT)
9966 op0 = XEXP (op0, 0);
9967
9968 new_op0 = aarch64_strip_shift (op0);
9969
9970 /* If we had a shift on op0 then this is a logical-shift-
9971 by-register/immediate operation. Otherwise, this is just
9972 a logical operation. */
9973 if (speed)
9974 {
9975 if (new_op0 != op0)
9976 {
9977 /* Shift by immediate. */
9978 if (CONST_INT_P (XEXP (op0, 1)))
9979 *cost += extra_cost->alu.log_shift;
9980 else
9981 *cost += extra_cost->alu.log_shift_reg;
9982 }
9983 else
9984 *cost += extra_cost->alu.logical;
9985 }
9986
9987 /* In both cases we want to cost both operands. */
9988 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9989 0, speed);
9990 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9991 1, speed);
9992
9993 return true;
9994 }
9995 }
9996 return false;
9997
9998 case NOT:
9999 x = XEXP (x, 0);
10000 op0 = aarch64_strip_shift (x);
10001
10002 if (VECTOR_MODE_P (mode))
10003 {
10004 /* Vector NOT. */
10005 *cost += extra_cost->vect.alu;
10006 return false;
10007 }
10008
10009 /* MVN-shifted-reg. */
10010 if (op0 != x)
10011 {
10012 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10013
10014 if (speed)
10015 *cost += extra_cost->alu.log_shift;
10016
10017 return true;
10018 }
10019 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10020 Handle the second form here taking care that 'a' in the above can
10021 be a shift. */
10022 else if (GET_CODE (op0) == XOR)
10023 {
10024 rtx newop0 = XEXP (op0, 0);
10025 rtx newop1 = XEXP (op0, 1);
10026 rtx op0_stripped = aarch64_strip_shift (newop0);
10027
10028 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10029 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10030
10031 if (speed)
10032 {
10033 if (op0_stripped != newop0)
10034 *cost += extra_cost->alu.log_shift;
10035 else
10036 *cost += extra_cost->alu.logical;
10037 }
10038
10039 return true;
10040 }
10041 /* MVN. */
10042 if (speed)
10043 *cost += extra_cost->alu.logical;
10044
10045 return false;
10046
10047 case ZERO_EXTEND:
10048
10049 op0 = XEXP (x, 0);
10050 /* If a value is written in SI mode, then zero extended to DI
10051 mode, the operation will in general be free as a write to
10052 a 'w' register implicitly zeroes the upper bits of an 'x'
10053 register. However, if this is
10054
10055 (set (reg) (zero_extend (reg)))
10056
10057 we must cost the explicit register move. */
10058 if (mode == DImode
10059 && GET_MODE (op0) == SImode
10060 && outer == SET)
10061 {
10062 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10063
10064 /* If OP_COST is non-zero, then the cost of the zero extend
10065 is effectively the cost of the inner operation. Otherwise
10066 we have a MOV instruction and we take the cost from the MOV
10067 itself. This is true independently of whether we are
10068 optimizing for space or time. */
10069 if (op_cost)
10070 *cost = op_cost;
10071
10072 return true;
10073 }
10074 else if (MEM_P (op0))
10075 {
10076 /* All loads can zero extend to any size for free. */
10077 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10078 return true;
10079 }
10080
10081 op0 = aarch64_extend_bitfield_pattern_p (x);
10082 if (op0)
10083 {
10084 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10085 if (speed)
10086 *cost += extra_cost->alu.bfx;
10087 return true;
10088 }
10089
10090 if (speed)
10091 {
10092 if (VECTOR_MODE_P (mode))
10093 {
10094 /* UMOV. */
10095 *cost += extra_cost->vect.alu;
10096 }
10097 else
10098 {
10099 /* We generate an AND instead of UXTB/UXTH. */
10100 *cost += extra_cost->alu.logical;
10101 }
10102 }
10103 return false;
10104
10105 case SIGN_EXTEND:
10106 if (MEM_P (XEXP (x, 0)))
10107 {
10108 /* LDRSH. */
10109 if (speed)
10110 {
10111 rtx address = XEXP (XEXP (x, 0), 0);
10112 *cost += extra_cost->ldst.load_sign_extend;
10113
10114 *cost +=
10115 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10116 0, speed));
10117 }
10118 return true;
10119 }
10120
10121 op0 = aarch64_extend_bitfield_pattern_p (x);
10122 if (op0)
10123 {
10124 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10125 if (speed)
10126 *cost += extra_cost->alu.bfx;
10127 return true;
10128 }
10129
10130 if (speed)
10131 {
10132 if (VECTOR_MODE_P (mode))
10133 *cost += extra_cost->vect.alu;
10134 else
10135 *cost += extra_cost->alu.extend;
10136 }
10137 return false;
10138
10139 case ASHIFT:
10140 op0 = XEXP (x, 0);
10141 op1 = XEXP (x, 1);
10142
10143 if (CONST_INT_P (op1))
10144 {
10145 if (speed)
10146 {
10147 if (VECTOR_MODE_P (mode))
10148 {
10149 /* Vector shift (immediate). */
10150 *cost += extra_cost->vect.alu;
10151 }
10152 else
10153 {
10154 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10155 aliases. */
10156 *cost += extra_cost->alu.shift;
10157 }
10158 }
10159
10160 /* We can incorporate zero/sign extend for free. */
10161 if (GET_CODE (op0) == ZERO_EXTEND
10162 || GET_CODE (op0) == SIGN_EXTEND)
10163 op0 = XEXP (op0, 0);
10164
10165 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10166 return true;
10167 }
10168 else
10169 {
10170 if (VECTOR_MODE_P (mode))
10171 {
10172 if (speed)
10173 /* Vector shift (register). */
10174 *cost += extra_cost->vect.alu;
10175 }
10176 else
10177 {
10178 if (speed)
10179 /* LSLV. */
10180 *cost += extra_cost->alu.shift_reg;
10181
10182 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10183 && CONST_INT_P (XEXP (op1, 1))
10184 && known_eq (INTVAL (XEXP (op1, 1)),
10185 GET_MODE_BITSIZE (mode) - 1))
10186 {
10187 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10188 /* We already demanded XEXP (op1, 0) to be REG_P, so
10189 don't recurse into it. */
10190 return true;
10191 }
10192 }
10193 return false; /* All arguments need to be in registers. */
10194 }
10195
10196 case ROTATE:
10197 case ROTATERT:
10198 case LSHIFTRT:
10199 case ASHIFTRT:
10200 op0 = XEXP (x, 0);
10201 op1 = XEXP (x, 1);
10202
10203 if (CONST_INT_P (op1))
10204 {
10205 /* ASR (immediate) and friends. */
10206 if (speed)
10207 {
10208 if (VECTOR_MODE_P (mode))
10209 *cost += extra_cost->vect.alu;
10210 else
10211 *cost += extra_cost->alu.shift;
10212 }
10213
10214 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10215 return true;
10216 }
10217 else
10218 {
10219 if (VECTOR_MODE_P (mode))
10220 {
10221 if (speed)
10222 /* Vector shift (register). */
10223 *cost += extra_cost->vect.alu;
10224 }
10225 else
10226 {
10227 if (speed)
10228 /* ASR (register) and friends. */
10229 *cost += extra_cost->alu.shift_reg;
10230
10231 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10232 && CONST_INT_P (XEXP (op1, 1))
10233 && known_eq (INTVAL (XEXP (op1, 1)),
10234 GET_MODE_BITSIZE (mode) - 1))
10235 {
10236 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10237 /* We already demanded XEXP (op1, 0) to be REG_P, so
10238 don't recurse into it. */
10239 return true;
10240 }
10241 }
10242 return false; /* All arguments need to be in registers. */
10243 }
10244
10245 case SYMBOL_REF:
10246
10247 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10248 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10249 {
10250 /* LDR. */
10251 if (speed)
10252 *cost += extra_cost->ldst.load;
10253 }
10254 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10255 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10256 {
10257 /* ADRP, followed by ADD. */
10258 *cost += COSTS_N_INSNS (1);
10259 if (speed)
10260 *cost += 2 * extra_cost->alu.arith;
10261 }
10262 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10263 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10264 {
10265 /* ADR. */
10266 if (speed)
10267 *cost += extra_cost->alu.arith;
10268 }
10269
10270 if (flag_pic)
10271 {
10272 /* One extra load instruction, after accessing the GOT. */
10273 *cost += COSTS_N_INSNS (1);
10274 if (speed)
10275 *cost += extra_cost->ldst.load;
10276 }
10277 return true;
10278
10279 case HIGH:
10280 case LO_SUM:
10281 /* ADRP/ADD (immediate). */
10282 if (speed)
10283 *cost += extra_cost->alu.arith;
10284 return true;
10285
10286 case ZERO_EXTRACT:
10287 case SIGN_EXTRACT:
10288 /* UBFX/SBFX. */
10289 if (speed)
10290 {
10291 if (VECTOR_MODE_P (mode))
10292 *cost += extra_cost->vect.alu;
10293 else
10294 *cost += extra_cost->alu.bfx;
10295 }
10296
10297 /* We can trust that the immediates used will be correct (there
10298 are no by-register forms), so we need only cost op0. */
10299 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10300 return true;
10301
10302 case MULT:
10303 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10304 /* aarch64_rtx_mult_cost always handles recursion to its
10305 operands. */
10306 return true;
10307
10308 case MOD:
10309 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
10310 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same
10311 as that of an unconditional negate. This case should only ever be
10312 reached through the set_smod_pow2_cheap check in expmed.c. */
10313 if (CONST_INT_P (XEXP (x, 1))
10314 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10315 && (mode == SImode || mode == DImode))
10316 {
10317 /* We expand to 4 instructions. Reset the baseline. */
10318 *cost = COSTS_N_INSNS (4);
10319
10320 if (speed)
10321 *cost += 2 * extra_cost->alu.logical
10322 + 2 * extra_cost->alu.arith;
10323
10324 return true;
10325 }
10326
10327 /* Fall-through. */
10328 case UMOD:
10329 if (speed)
10330 {
10331 /* Slightly prefer UMOD over SMOD. */
10332 if (VECTOR_MODE_P (mode))
10333 *cost += extra_cost->vect.alu;
10334 else if (GET_MODE_CLASS (mode) == MODE_INT)
10335 *cost += (extra_cost->mult[mode == DImode].add
10336 + extra_cost->mult[mode == DImode].idiv
10337 + (code == MOD ? 1 : 0));
10338 }
10339 return false; /* All arguments need to be in registers. */
10340
10341 case DIV:
10342 case UDIV:
10343 case SQRT:
10344 if (speed)
10345 {
10346 if (VECTOR_MODE_P (mode))
10347 *cost += extra_cost->vect.alu;
10348 else if (GET_MODE_CLASS (mode) == MODE_INT)
10349 /* There is no integer SQRT, so only DIV and UDIV can get
10350 here. */
10351 *cost += (extra_cost->mult[mode == DImode].idiv
10352 /* Slightly prefer UDIV over SDIV. */
10353 + (code == DIV ? 1 : 0));
10354 else
10355 *cost += extra_cost->fp[mode == DFmode].div;
10356 }
10357 return false; /* All arguments need to be in registers. */
10358
10359 case IF_THEN_ELSE:
10360 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10361 XEXP (x, 2), cost, speed);
10362
10363 case EQ:
10364 case NE:
10365 case GT:
10366 case GTU:
10367 case LT:
10368 case LTU:
10369 case GE:
10370 case GEU:
10371 case LE:
10372 case LEU:
10373
10374 return false; /* All arguments must be in registers. */
10375
10376 case FMA:
10377 op0 = XEXP (x, 0);
10378 op1 = XEXP (x, 1);
10379 op2 = XEXP (x, 2);
10380
10381 if (speed)
10382 {
10383 if (VECTOR_MODE_P (mode))
10384 *cost += extra_cost->vect.alu;
10385 else
10386 *cost += extra_cost->fp[mode == DFmode].fma;
10387 }
10388
10389 /* FMSUB, FNMADD, and FNMSUB are free. */
10390 if (GET_CODE (op0) == NEG)
10391 op0 = XEXP (op0, 0);
10392
10393 if (GET_CODE (op2) == NEG)
10394 op2 = XEXP (op2, 0);
10395
10396 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10397 and the by-element operand as operand 0. */
10398 if (GET_CODE (op1) == NEG)
10399 op1 = XEXP (op1, 0);
10400
10401 /* Catch vector-by-element operations. The by-element operand can
10402 either be (vec_duplicate (vec_select (x))) or just
10403 (vec_select (x)), depending on whether we are multiplying by
10404 a vector or a scalar.
10405
10406 Canonicalization is not very good in these cases: FMA4 will put the
10407 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
10408 if (GET_CODE (op0) == VEC_DUPLICATE)
10409 op0 = XEXP (op0, 0);
10410 else if (GET_CODE (op1) == VEC_DUPLICATE)
10411 op1 = XEXP (op1, 0);
10412
10413 if (GET_CODE (op0) == VEC_SELECT)
10414 op0 = XEXP (op0, 0);
10415 else if (GET_CODE (op1) == VEC_SELECT)
10416 op1 = XEXP (op1, 0);
10417
10418 /* If the remaining parameters are not registers,
10419 get the cost to put them into registers. */
10420 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10421 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10422 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10423 return true;
10424
10425 case FLOAT:
10426 case UNSIGNED_FLOAT:
10427 if (speed)
10428 *cost += extra_cost->fp[mode == DFmode].fromint;
10429 return false;
10430
10431 case FLOAT_EXTEND:
10432 if (speed)
10433 {
10434 if (VECTOR_MODE_P (mode))
10435 {
10436 /* Vector widening conversion. */
10437 *cost += extra_cost->vect.alu;
10438 }
10439 else
10440 *cost += extra_cost->fp[mode == DFmode].widen;
10441 }
10442 return false;
10443
10444 case FLOAT_TRUNCATE:
10445 if (speed)
10446 {
10447 if (VECTOR_MODE_P (mode))
10448 {
10449 /* Vector narrowing conversion. */
10450 *cost += extra_cost->vect.alu;
10451 }
10452 else
10453 *cost += extra_cost->fp[mode == DFmode].narrow;
10454 }
10455 return false;
10456
10457 case FIX:
10458 case UNSIGNED_FIX:
10459 x = XEXP (x, 0);
10460 /* Strip the rounding part. They will all be implemented
10461 by the fcvt* family of instructions anyway. */
10462 if (GET_CODE (x) == UNSPEC)
10463 {
10464 unsigned int uns_code = XINT (x, 1);
10465
10466 if (uns_code == UNSPEC_FRINTA
10467 || uns_code == UNSPEC_FRINTM
10468 || uns_code == UNSPEC_FRINTN
10469 || uns_code == UNSPEC_FRINTP
10470 || uns_code == UNSPEC_FRINTZ)
10471 x = XVECEXP (x, 0, 0);
10472 }
10473
10474 if (speed)
10475 {
10476 if (VECTOR_MODE_P (mode))
10477 *cost += extra_cost->vect.alu;
10478 else
10479 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10480 }
10481
10482 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10483 fixed-point fcvt. */
10484 if (GET_CODE (x) == MULT
10485 && ((VECTOR_MODE_P (mode)
10486 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10487 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10488 {
10489 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10490 0, speed);
10491 return true;
10492 }
10493
10494 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10495 return true;
10496
10497 case ABS:
10498 if (VECTOR_MODE_P (mode))
10499 {
10500 /* ABS (vector). */
10501 if (speed)
10502 *cost += extra_cost->vect.alu;
10503 }
10504 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10505 {
10506 op0 = XEXP (x, 0);
10507
10508 /* FABD, which is analogous to FADD. */
10509 if (GET_CODE (op0) == MINUS)
10510 {
10511 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10512 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10513 if (speed)
10514 *cost += extra_cost->fp[mode == DFmode].addsub;
10515
10516 return true;
10517 }
10518 /* Simple FABS is analogous to FNEG. */
10519 if (speed)
10520 *cost += extra_cost->fp[mode == DFmode].neg;
10521 }
10522 else
10523 {
10524 /* Integer ABS will either be split into
10525 two arithmetic instructions, or will be an ABS
10526 (scalar), which we don't model. */
10527 *cost = COSTS_N_INSNS (2);
10528 if (speed)
10529 *cost += 2 * extra_cost->alu.arith;
10530 }
10531 return false;
10532
10533 case SMAX:
10534 case SMIN:
10535 if (speed)
10536 {
10537 if (VECTOR_MODE_P (mode))
10538 *cost += extra_cost->vect.alu;
10539 else
10540 {
10541 /* FMAXNM/FMINNM/FMAX/FMIN.
10542 TODO: This may not be accurate for all implementations, but
10543 we do not model this in the cost tables. */
10544 *cost += extra_cost->fp[mode == DFmode].addsub;
10545 }
10546 }
10547 return false;
10548
10549 case UNSPEC:
10550 /* The floating point round to integer frint* instructions. */
10551 if (aarch64_frint_unspec_p (XINT (x, 1)))
10552 {
10553 if (speed)
10554 *cost += extra_cost->fp[mode == DFmode].roundint;
10555
10556 return false;
10557 }
10558
10559 if (XINT (x, 1) == UNSPEC_RBIT)
10560 {
10561 if (speed)
10562 *cost += extra_cost->alu.rev;
10563
10564 return false;
10565 }
10566 break;
10567
10568 case TRUNCATE:
10569
10570 /* Decompose <su>muldi3_highpart. */
10571 if (/* (truncate:DI */
10572 mode == DImode
10573 /* (lshiftrt:TI */
10574 && GET_MODE (XEXP (x, 0)) == TImode
10575 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10576 /* (mult:TI */
10577 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10578 /* (ANY_EXTEND:TI (reg:DI))
10579 (ANY_EXTEND:TI (reg:DI))) */
10580 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10581 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10582 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10583 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10584 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10585 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10586 /* (const_int 64) */
10587 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10588 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10589 {
10590 /* UMULH/SMULH. */
10591 if (speed)
10592 *cost += extra_cost->mult[mode == DImode].extend;
10593 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10594 mode, MULT, 0, speed);
10595 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10596 mode, MULT, 1, speed);
10597 return true;
10598 }
10599
10600 /* Fall through. */
10601 default:
10602 break;
10603 }
10604
10605 if (dump_file
10606 && flag_aarch64_verbose_cost)
10607 fprintf (dump_file,
10608 "\nFailed to cost RTX. Assuming default cost.\n");
10609
10610 return true;
10611 }
10612
10613 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10614 calculated for X. This cost is stored in *COST. Returns true
10615 if the total cost of X was calculated. */
10616 static bool
10617 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10618 int param, int *cost, bool speed)
10619 {
10620 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10621
10622 if (dump_file
10623 && flag_aarch64_verbose_cost)
10624 {
10625 print_rtl_single (dump_file, x);
10626 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10627 speed ? "Hot" : "Cold",
10628 *cost, result ? "final" : "partial");
10629 }
10630
10631 return result;
10632 }
10633
10634 static int
10635 aarch64_register_move_cost (machine_mode mode,
10636 reg_class_t from_i, reg_class_t to_i)
10637 {
10638 enum reg_class from = (enum reg_class) from_i;
10639 enum reg_class to = (enum reg_class) to_i;
10640 const struct cpu_regmove_cost *regmove_cost
10641 = aarch64_tune_params.regmove_cost;
10642
10643 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10644 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10645 to = GENERAL_REGS;
10646
10647 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10648 from = GENERAL_REGS;
10649
10650 /* Moving between a GPR and the stack register costs the same as GP2GP. */
10651 if ((from == GENERAL_REGS && to == STACK_REG)
10652 || (to == GENERAL_REGS && from == STACK_REG))
10653 return regmove_cost->GP2GP;
10654
10655 /* To/from the stack register, we move via the GPRs. */
10656 if (to == STACK_REG || from == STACK_REG)
10657 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10658 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10659
10660 if (known_eq (GET_MODE_SIZE (mode), 16))
10661 {
10662 /* 128-bit operations on general registers require 2 instructions. */
10663 if (from == GENERAL_REGS && to == GENERAL_REGS)
10664 return regmove_cost->GP2GP * 2;
10665 else if (from == GENERAL_REGS)
10666 return regmove_cost->GP2FP * 2;
10667 else if (to == GENERAL_REGS)
10668 return regmove_cost->FP2GP * 2;
10669
10670 /* When AdvSIMD instructions are disabled it is not possible to move
10671 a 128-bit value directly between Q registers. This is handled in
10672 secondary reload. A general register is used as a scratch to move
10673 the upper DI value, and the lower DI value is moved directly;
10674 hence the cost is the sum of three moves. */
10675 if (! TARGET_SIMD)
10676 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10677
10678 return regmove_cost->FP2FP;
10679 }
10680
10681 if (from == GENERAL_REGS && to == GENERAL_REGS)
10682 return regmove_cost->GP2GP;
10683 else if (from == GENERAL_REGS)
10684 return regmove_cost->GP2FP;
10685 else if (to == GENERAL_REGS)
10686 return regmove_cost->FP2GP;
10687
10688 return regmove_cost->FP2FP;
10689 }
10690
10691 static int
10692 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10693 reg_class_t rclass ATTRIBUTE_UNUSED,
10694 bool in ATTRIBUTE_UNUSED)
10695 {
10696 return aarch64_tune_params.memmov_cost;
10697 }
10698
10699 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10700 to optimize 1.0/sqrt. */
10701
10702 static bool
10703 use_rsqrt_p (machine_mode mode)
10704 {
10705 return (!flag_trapping_math
10706 && flag_unsafe_math_optimizations
10707 && ((aarch64_tune_params.approx_modes->recip_sqrt
10708 & AARCH64_APPROX_MODE (mode))
10709 || flag_mrecip_low_precision_sqrt));
10710 }
10711
10712 /* Function to decide when to use the approximate reciprocal square root
10713 builtin. */
10714
10715 static tree
10716 aarch64_builtin_reciprocal (tree fndecl)
10717 {
10718 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10719
10720 if (!use_rsqrt_p (mode))
10721 return NULL_TREE;
10722 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10723 }
10724
10725 /* Emit instruction sequence to compute either the approximate square root
10726 or its approximate reciprocal, depending on the flag RECP, and return
10727 whether the sequence was emitted or not. */
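/* The expansion below uses the Newton-Raphson iteration for 1/sqrt(d),
     x_{n+1} = x_n * (3 - d * x_n * x_n) / 2,
   where FRSQRTE supplies the initial estimate and each FRSQRTS step
   computes the (3 - d * x_n * x_n) / 2 correction factor; when RECP is
   false a final multiply by the operand turns the reciprocal estimate
   into the square root itself.  */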
10728
10729 bool
10730 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10731 {
10732 machine_mode mode = GET_MODE (dst);
10733
10734 if (GET_MODE_INNER (mode) == HFmode)
10735 {
10736 gcc_assert (!recp);
10737 return false;
10738 }
10739
10740 if (!recp)
10741 {
10742 if (!(flag_mlow_precision_sqrt
10743 || (aarch64_tune_params.approx_modes->sqrt
10744 & AARCH64_APPROX_MODE (mode))))
10745 return false;
10746
10747 if (flag_finite_math_only
10748 || flag_trapping_math
10749 || !flag_unsafe_math_optimizations
10750 || optimize_function_for_size_p (cfun))
10751 return false;
10752 }
10753 else
10754 /* Caller assumes we cannot fail. */
10755 gcc_assert (use_rsqrt_p (mode));
10756
10757 machine_mode mmsk = mode_for_int_vector (mode).require ();
10758 rtx xmsk = gen_reg_rtx (mmsk);
10759 if (!recp)
10760 /* When calculating the approximate square root, compare the
10761 argument with 0.0 and create a mask. */
10762 emit_insn (gen_rtx_SET (xmsk,
10763 gen_rtx_NEG (mmsk,
10764 gen_rtx_EQ (mmsk, src,
10765 CONST0_RTX (mode)))));
10766
10767 /* Estimate the approximate reciprocal square root. */
10768 rtx xdst = gen_reg_rtx (mode);
10769 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10770
10771 /* Iterate over the series twice for SF and thrice for DF. */
10772 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10773
10774 /* Optionally iterate over the series once less for faster performance
10775 while sacrificing some accuracy. */
10776 if ((recp && flag_mrecip_low_precision_sqrt)
10777 || (!recp && flag_mlow_precision_sqrt))
10778 iterations--;
10779
10780 /* Iterate over the series to calculate the approximate reciprocal square
10781 root. */
10782 rtx x1 = gen_reg_rtx (mode);
10783 while (iterations--)
10784 {
10785 rtx x2 = gen_reg_rtx (mode);
10786 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10787
10788 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10789
10790 if (iterations > 0)
10791 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10792 }
10793
10794 if (!recp)
10795 {
10796 /* Qualify the approximate reciprocal square root when the argument is
10797 0.0 by squashing the intermediary result to 0.0. */
10798 rtx xtmp = gen_reg_rtx (mmsk);
10799 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10800 gen_rtx_SUBREG (mmsk, xdst, 0)));
10801 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10802
10803 /* Calculate the approximate square root. */
10804 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10805 }
10806
10807 /* Finalize the approximation. */
10808 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10809
10810 return true;
10811 }
10812
10813 /* Emit the instruction sequence to compute the approximation for the division
10814 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
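/* The expansion below uses the Newton-Raphson iteration for 1/d,
     x_{n+1} = x_n * (2 - d * x_n),
   where FRECPE supplies the initial estimate and FRECPS computes the
   (2 - d * x_n) correction factor; the quotient is then NUM * (1/DEN).  */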
10815
10816 bool
10817 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10818 {
10819 machine_mode mode = GET_MODE (quo);
10820
10821 if (GET_MODE_INNER (mode) == HFmode)
10822 return false;
10823
10824 bool use_approx_division_p = (flag_mlow_precision_div
10825 || (aarch64_tune_params.approx_modes->division
10826 & AARCH64_APPROX_MODE (mode)));
10827
10828 if (!flag_finite_math_only
10829 || flag_trapping_math
10830 || !flag_unsafe_math_optimizations
10831 || optimize_function_for_size_p (cfun)
10832 || !use_approx_division_p)
10833 return false;
10834
10835 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10836 return false;
10837
10838 /* Estimate the approximate reciprocal. */
10839 rtx xrcp = gen_reg_rtx (mode);
10840 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10841
10842 /* Iterate over the series twice for SF and thrice for DF. */
10843 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10844
10845 /* Optionally iterate over the series once less for faster performance,
10846 at the cost of some accuracy. */
10847 if (flag_mlow_precision_div)
10848 iterations--;
10849
10850 /* Iterate over the series to calculate the approximate reciprocal. */
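   /* A sketch of the refinement (assuming FRECPS computes 2 - a * b, the
      architecture's reciprocal step): each pass is one Newton-Raphson update
      of x ~= 1/d:

	 xtmp    = 2 - d * x_n
	 x_{n+1} = x_n * xtmp  */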
10851 rtx xtmp = gen_reg_rtx (mode);
10852 while (iterations--)
10853 {
10854 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10855
10856 if (iterations > 0)
10857 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10858 }
10859
10860 if (num != CONST1_RTX (mode))
10861 {
10862 /* As the approximate reciprocal of DEN is already calculated, only
10863 calculate the approximate division when NUM is not 1.0. */
10864 rtx xnum = force_reg (mode, num);
10865 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10866 }
10867
10868 /* Finalize the approximation. */
10869 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10870 return true;
10871 }
10872
10873 /* Return the number of instructions that can be issued per cycle. */
10874 static int
10875 aarch64_sched_issue_rate (void)
10876 {
10877 return aarch64_tune_params.issue_rate;
10878 }
10879
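/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD.  Use the issue
   rate as the lookahead depth, but disable multipass lookahead when
   scheduling for fusion.  */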
10880 static int
10881 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10882 {
10883 int issue_rate = aarch64_sched_issue_rate ();
10884
10885 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10886 }
10887
10888
10889 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10890 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10891 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10892
10893 static int
10894 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10895 int ready_index)
10896 {
10897 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10898 }
10899
10900
10901 /* Vectorizer cost model target hooks. */
10902
10903 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10904 static int
10905 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10906 tree vectype,
10907 int misalign ATTRIBUTE_UNUSED)
10908 {
10909 unsigned elements;
10910 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10911 bool fp = false;
10912
10913 if (vectype != NULL)
10914 fp = FLOAT_TYPE_P (vectype);
10915
10916 switch (type_of_cost)
10917 {
10918 case scalar_stmt:
10919 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10920
10921 case scalar_load:
10922 return costs->scalar_load_cost;
10923
10924 case scalar_store:
10925 return costs->scalar_store_cost;
10926
10927 case vector_stmt:
10928 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10929
10930 case vector_load:
10931 return costs->vec_align_load_cost;
10932
10933 case vector_store:
10934 return costs->vec_store_cost;
10935
10936 case vec_to_scalar:
10937 return costs->vec_to_scalar_cost;
10938
10939 case scalar_to_vec:
10940 return costs->scalar_to_vec_cost;
10941
10942 case unaligned_load:
10943 case vector_gather_load:
10944 return costs->vec_unalign_load_cost;
10945
10946 case unaligned_store:
10947 case vector_scatter_store:
10948 return costs->vec_unalign_store_cost;
10949
10950 case cond_branch_taken:
10951 return costs->cond_taken_branch_cost;
10952
10953 case cond_branch_not_taken:
10954 return costs->cond_not_taken_branch_cost;
10955
10956 case vec_perm:
10957 return costs->vec_permute_cost;
10958
10959 case vec_promote_demote:
10960 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10961
10962 case vec_construct:
10963 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10964 return elements / 2 + 1;
10965
10966 default:
10967 gcc_unreachable ();
10968 }
10969 }
10970
10971 /* Implement targetm.vectorize.add_stmt_cost. */
10972 static unsigned
10973 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10974 struct _stmt_vec_info *stmt_info, int misalign,
10975 enum vect_cost_model_location where)
10976 {
10977 unsigned *cost = (unsigned *) data;
10978 unsigned retval = 0;
10979
10980 if (flag_vect_cost_model)
10981 {
10982 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10983 int stmt_cost =
10984 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10985
10986 /* Statements in an inner loop relative to the loop being
10987 vectorized are weighted more heavily. The value here is
10988 arbitrary and could potentially be improved with analysis. */
10989 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10990 count *= 50; /* FIXME */
10991
10992 retval = (unsigned) (count * stmt_cost);
10993 cost[where] += retval;
10994 }
10995
10996 return retval;
10997 }
10998
10999 static void initialize_aarch64_code_model (struct gcc_options *);
11000
11001 /* Parse the TO_PARSE string and put the architecture struct that it
11002 selects into RES and the architectural features into ISA_FLAGS.
11003 Return an aarch64_parse_opt_result describing the parse result.
11004 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11005 When the TO_PARSE string contains an invalid extension,
11006 a copy of the string is created and stored to INVALID_EXTENSION. */
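/* For example (an illustration, not part of the original comment):
   "-march=armv8.2-a+sve" selects the "armv8.2-a" entry in all_architectures
   and hands "+sve" to aarch64_parse_extension.  */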
11007
11008 static enum aarch64_parse_opt_result
11009 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11010 unsigned long *isa_flags, std::string *invalid_extension)
11011 {
11012 const char *ext;
11013 const struct processor *arch;
11014 size_t len;
11015
11016 ext = strchr (to_parse, '+');
11017
11018 if (ext != NULL)
11019 len = ext - to_parse;
11020 else
11021 len = strlen (to_parse);
11022
11023 if (len == 0)
11024 return AARCH64_PARSE_MISSING_ARG;
11025
11026
11027 /* Loop through the list of supported ARCHes to find a match. */
11028 for (arch = all_architectures; arch->name != NULL; arch++)
11029 {
11030 if (strlen (arch->name) == len
11031 && strncmp (arch->name, to_parse, len) == 0)
11032 {
11033 unsigned long isa_temp = arch->flags;
11034
11035 if (ext != NULL)
11036 {
11037 /* TO_PARSE string contains at least one extension. */
11038 enum aarch64_parse_opt_result ext_res
11039 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11040
11041 if (ext_res != AARCH64_PARSE_OK)
11042 return ext_res;
11043 }
11044 /* Extension parsing was successful. Confirm the result
11045 arch and ISA flags. */
11046 *res = arch;
11047 *isa_flags = isa_temp;
11048 return AARCH64_PARSE_OK;
11049 }
11050 }
11051
11052 /* ARCH name not found in list. */
11053 return AARCH64_PARSE_INVALID_ARG;
11054 }
11055
11056 /* Parse the TO_PARSE string and put the CPU it selects into RES and its
11057 architectural feature flags into ISA_FLAGS. Return an aarch64_parse_opt_result
11058 describing the parse result. If there is an error parsing, RES and
11059 ISA_FLAGS are left unchanged.
11060 When the TO_PARSE string contains an invalid extension,
11061 a copy of the string is created and stored to INVALID_EXTENSION. */
11062
11063 static enum aarch64_parse_opt_result
11064 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11065 unsigned long *isa_flags, std::string *invalid_extension)
11066 {
11067 const char *ext;
11068 const struct processor *cpu;
11069 size_t len;
11070
11071 ext = strchr (to_parse, '+');
11072
11073 if (ext != NULL)
11074 len = ext - to_parse;
11075 else
11076 len = strlen (to_parse);
11077
11078 if (len == 0)
11079 return AARCH64_PARSE_MISSING_ARG;
11080
11081
11082 /* Loop through the list of supported CPUs to find a match. */
11083 for (cpu = all_cores; cpu->name != NULL; cpu++)
11084 {
11085 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11086 {
11087 unsigned long isa_temp = cpu->flags;
11088
11089
11090 if (ext != NULL)
11091 {
11092 /* TO_PARSE string contains at least one extension. */
11093 enum aarch64_parse_opt_result ext_res
11094 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11095
11096 if (ext_res != AARCH64_PARSE_OK)
11097 return ext_res;
11098 }
11099 /* Extension parsing was successful. Confirm the result
11100 cpu and ISA flags. */
11101 *res = cpu;
11102 *isa_flags = isa_temp;
11103 return AARCH64_PARSE_OK;
11104 }
11105 }
11106
11107 /* CPU name not found in list. */
11108 return AARCH64_PARSE_INVALID_ARG;
11109 }
11110
11111 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11112 Return an aarch64_parse_opt_result describing the parse result.
11113 If the parsing fails, RES does not change. */
11114
11115 static enum aarch64_parse_opt_result
11116 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11117 {
11118 const struct processor *cpu;
11119
11120 /* Loop through the list of supported CPUs to find a match. */
11121 for (cpu = all_cores; cpu->name != NULL; cpu++)
11122 {
11123 if (strcmp (cpu->name, to_parse) == 0)
11124 {
11125 *res = cpu;
11126 return AARCH64_PARSE_OK;
11127 }
11128 }
11129
11130 /* CPU name not found in list. */
11131 return AARCH64_PARSE_INVALID_ARG;
11132 }
11133
11134 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11135 described in FLAG. If it is, return the index bit for that fusion type.
11136 If not, error (printing OPTION_NAME) and return zero. */
11137
11138 static unsigned int
11139 aarch64_parse_one_option_token (const char *token,
11140 size_t length,
11141 const struct aarch64_flag_desc *flag,
11142 const char *option_name)
11143 {
11144 for (; flag->name != NULL; flag++)
11145 {
11146 if (length == strlen (flag->name)
11147 && !strncmp (flag->name, token, length))
11148 return flag->flag;
11149 }
11150
11151 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
11152 return 0;
11153 }
11154
11155 /* Parse OPTION which is a comma-separated list of flags to enable.
11156 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11157 default state we inherit from the CPU tuning structures. OPTION_NAME
11158 gives the top-level option we are parsing in the -moverride string,
11159 for use in error messages. */
11160
11161 static unsigned int
11162 aarch64_parse_boolean_options (const char *option,
11163 const struct aarch64_flag_desc *flags,
11164 unsigned int initial_state,
11165 const char *option_name)
11166 {
11167 const char separator = '.';
11168 const char* specs = option;
11169 const char* ntoken = option;
11170 unsigned int found_flags = initial_state;
11171
11172 while ((ntoken = strchr (specs, separator)))
11173 {
11174 size_t token_length = ntoken - specs;
11175 unsigned token_ops = aarch64_parse_one_option_token (specs,
11176 token_length,
11177 flags,
11178 option_name);
11179 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11180 in the token stream, reset the supported operations. So:
11181
11182 adrp+add.cmp+branch.none.adrp+add
11183
11184 would have the result of turning on only adrp+add fusion. */
11185 if (!token_ops)
11186 found_flags = 0;
11187
11188 found_flags |= token_ops;
11189 specs = ++ntoken;
11190 }
11191
11192 /* The string ended with a trailing separator, so report an error. */
11193 if (!(*specs))
11194 {
11195 error ("%s string ill-formed\n", option_name);
11196 return 0;
11197 }
11198
11199 /* We still have one more token to parse. */
11200 size_t token_length = strlen (specs);
11201 unsigned token_ops = aarch64_parse_one_option_token (specs,
11202 token_length,
11203 flags,
11204 option_name);
11205 if (!token_ops)
11206 found_flags = 0;
11207
11208 found_flags |= token_ops;
11209 return found_flags;
11210 }
11211
11212 /* Support for overriding instruction fusion. */
11213
11214 static void
11215 aarch64_parse_fuse_string (const char *fuse_string,
11216 struct tune_params *tune)
11217 {
11218 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11219 aarch64_fusible_pairs,
11220 tune->fusible_ops,
11221 "fuse=");
11222 }
11223
11224 /* Support for overriding other tuning flags. */
11225
11226 static void
11227 aarch64_parse_tune_string (const char *tune_string,
11228 struct tune_params *tune)
11229 {
11230 tune->extra_tuning_flags
11231 = aarch64_parse_boolean_options (tune_string,
11232 aarch64_tuning_flags,
11233 tune->extra_tuning_flags,
11234 "tune=");
11235 }
11236
11237 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11238 Accept the valid SVE vector widths allowed by
11239 aarch64_sve_vector_bits_enum and use it to override sve_width
11240 in TUNE. */
11241
11242 static void
11243 aarch64_parse_sve_width_string (const char *tune_string,
11244 struct tune_params *tune)
11245 {
11246 int width = -1;
11247
11248 int n = sscanf (tune_string, "%d", &width);
11249 if (n == EOF)
11250 {
11251 error ("invalid format for sve_width");
11252 return;
11253 }
11254 switch (width)
11255 {
11256 case SVE_128:
11257 case SVE_256:
11258 case SVE_512:
11259 case SVE_1024:
11260 case SVE_2048:
11261 break;
11262 default:
11263 error ("invalid sve_width value: %d", width);
11264 }
11265 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11266 }
11267
11268 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11269 we understand. If it is, extract the option string and hand it off to
11270 the appropriate function. */
11271
11272 void
11273 aarch64_parse_one_override_token (const char* token,
11274 size_t length,
11275 struct tune_params *tune)
11276 {
11277 const struct aarch64_tuning_override_function *fn
11278 = aarch64_tuning_override_functions;
11279
11280 const char *option_part = strchr (token, '=');
11281 if (!option_part)
11282 {
11283 error ("tuning string missing in option (%s)", token);
11284 return;
11285 }
11286
11287 /* Get the length of the option name. */
11288 length = option_part - token;
11289 /* Skip the '=' to get to the option string. */
11290 option_part++;
11291
11292 for (; fn->name != NULL; fn++)
11293 {
11294 if (!strncmp (fn->name, token, length))
11295 {
11296 fn->parse_override (option_part, tune);
11297 return;
11298 }
11299 }
11300
11301 error ("unknown tuning option (%s)",token);
11302 return;
11303 }
11304
11305 /* Validate and clamp the TLS size according to the code model in OPTS. */
11306
11307 static void
11308 initialize_aarch64_tls_size (struct gcc_options *opts)
11309 {
11310 if (aarch64_tls_size == 0)
11311 aarch64_tls_size = 24;
11312
11313 switch (opts->x_aarch64_cmodel_var)
11314 {
11315 case AARCH64_CMODEL_TINY:
11316 /* Both the default and maximum TLS sizes allowed under tiny are 1M, which
11317 needs two instructions to address, so we clamp the size to 24. */
11318 if (aarch64_tls_size > 24)
11319 aarch64_tls_size = 24;
11320 break;
11321 case AARCH64_CMODEL_SMALL:
11322 /* The maximum TLS size allowed under small is 4G. */
11323 if (aarch64_tls_size > 32)
11324 aarch64_tls_size = 32;
11325 break;
11326 case AARCH64_CMODEL_LARGE:
11327 /* The maximum TLS size allowed under large is 16E.
11328 FIXME: 16E would require a 64-bit offset, but we only support a 48-bit offset now. */
11329 if (aarch64_tls_size > 48)
11330 aarch64_tls_size = 48;
11331 break;
11332 default:
11333 gcc_unreachable ();
11334 }
11335
11336 return;
11337 }
11338
11339 /* Parse STRING looking for options in the format:
11340 string :: option:string
11341 option :: name=substring
11342 name :: {a-z}
11343 substring :: defined by option. */
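/* For instance (illustrative only), -moverride=fuse=adrp+add:sve_width=256
   is split on ':' into "fuse=adrp+add" and "sve_width=256", each of which is
   handled by aarch64_parse_one_override_token.  */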
11344
11345 static void
11346 aarch64_parse_override_string (const char* input_string,
11347 struct tune_params* tune)
11348 {
11349 const char separator = ':';
11350 size_t string_length = strlen (input_string) + 1;
11351 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11352 char *string = string_root;
11353 strncpy (string, input_string, string_length);
11354 string[string_length - 1] = '\0';
11355
11356 char* ntoken = string;
11357
11358 while ((ntoken = strchr (string, separator)))
11359 {
11360 size_t token_length = ntoken - string;
11361 /* NUL-terminate this substring so it can be treated as a string. */
11362 *ntoken = '\0';
11363 aarch64_parse_one_override_token (string, token_length, tune);
11364 string = ++ntoken;
11365 }
11366
11367 /* One last option to parse. */
11368 aarch64_parse_one_override_token (string, strlen (string), tune);
11369 free (string_root);
11370 }
11371
11372
11373 static void
11374 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11375 {
11376 if (accepted_branch_protection_string)
11377 {
11378 opts->x_aarch64_branch_protection_string
11379 = xstrdup (accepted_branch_protection_string);
11380 }
11381
11382 /* PR 70044: We have to be careful about being called multiple times for the
11383 same function. This means all changes should be repeatable. */
11384
11385 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11386 Disable the frame pointer flag so the mid-end will not use a frame
11387 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11388 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11389 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11390 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11391 if (opts->x_flag_omit_frame_pointer == 0)
11392 opts->x_flag_omit_frame_pointer = 2;
11393
11394 /* If not optimizing for size, set the default
11395 alignment to what the target wants. */
11396 if (!opts->x_optimize_size)
11397 {
11398 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11399 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11400 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11401 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11402 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11403 opts->x_str_align_functions = aarch64_tune_params.function_align;
11404 }
11405
11406 /* We default to no pc-relative literal loads. */
11407
11408 aarch64_pcrelative_literal_loads = false;
11409
11410 /* If -mpc-relative-literal-loads is set on the command line, this
11411 implies that the user asked for PC relative literal loads. */
11412 if (opts->x_pcrelative_literal_loads == 1)
11413 aarch64_pcrelative_literal_loads = true;
11414
11415 /* In the tiny memory model it makes no sense to disallow PC relative
11416 literal pool loads. */
11417 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11418 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11419 aarch64_pcrelative_literal_loads = true;
11420
11421 /* When enabling the lower precision Newton series for the square root, also
11422 enable it for the reciprocal square root, since the latter is an
11423 intermediary step for the former. */
11424 if (flag_mlow_precision_sqrt)
11425 flag_mrecip_low_precision_sqrt = true;
11426 }
11427
11428 /* 'Unpack' the internal tuning structs and update the options
11429 in OPTS. The caller must have set up selected_tune and selected_arch
11430 as all the other target-specific codegen decisions are
11431 derived from them. */
11432
11433 void
11434 aarch64_override_options_internal (struct gcc_options *opts)
11435 {
11436 aarch64_tune_flags = selected_tune->flags;
11437 aarch64_tune = selected_tune->sched_core;
11438 /* Make a copy of the tuning parameters attached to the core, which
11439 we may later overwrite. */
11440 aarch64_tune_params = *(selected_tune->tune);
11441 aarch64_architecture_version = selected_arch->architecture_version;
11442
11443 if (opts->x_aarch64_override_tune_string)
11444 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11445 &aarch64_tune_params);
11446
11447 /* This target defaults to strict volatile bitfields. */
11448 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11449 opts->x_flag_strict_volatile_bitfields = 1;
11450
11451 if (aarch64_stack_protector_guard == SSP_GLOBAL
11452 && opts->x_aarch64_stack_protector_guard_offset_str)
11453 {
11454 error ("incompatible options -mstack-protector-guard=global and"
11455 "-mstack-protector-guard-offset=%qs",
11456 aarch64_stack_protector_guard_offset_str);
11457 }
11458
11459 if (aarch64_stack_protector_guard == SSP_SYSREG
11460 && !(opts->x_aarch64_stack_protector_guard_offset_str
11461 && opts->x_aarch64_stack_protector_guard_reg_str))
11462 {
11463 error ("both -mstack-protector-guard-offset and "
11464 "-mstack-protector-guard-reg must be used "
11465 "with -mstack-protector-guard=sysreg");
11466 }
11467
11468 if (opts->x_aarch64_stack_protector_guard_reg_str)
11469 {
11470 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11471 error ("specify a system register with a small string length.");
11472 }
11473
11474 if (opts->x_aarch64_stack_protector_guard_offset_str)
11475 {
11476 char *end;
11477 const char *str = aarch64_stack_protector_guard_offset_str;
11478 errno = 0;
11479 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11480 if (!*str || *end || errno)
11481 error ("%qs is not a valid offset in %qs", str,
11482 "-mstack-protector-guard-offset=");
11483 aarch64_stack_protector_guard_offset = offs;
11484 }
11485
11486 initialize_aarch64_code_model (opts);
11487 initialize_aarch64_tls_size (opts);
11488
11489 int queue_depth = 0;
11490 switch (aarch64_tune_params.autoprefetcher_model)
11491 {
11492 case tune_params::AUTOPREFETCHER_OFF:
11493 queue_depth = -1;
11494 break;
11495 case tune_params::AUTOPREFETCHER_WEAK:
11496 queue_depth = 0;
11497 break;
11498 case tune_params::AUTOPREFETCHER_STRONG:
11499 queue_depth = max_insn_queue_index + 1;
11500 break;
11501 default:
11502 gcc_unreachable ();
11503 }
11504
11505 /* We don't mind passing in global_options_set here as we don't use
11506 the *options_set structs anyway. */
11507 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11508 queue_depth,
11509 opts->x_param_values,
11510 global_options_set.x_param_values);
11511
11512 /* Set up parameters to be used in prefetching algorithm. Do not
11513 override the defaults unless we are tuning for a core we have
11514 researched values for. */
11515 if (aarch64_tune_params.prefetch->num_slots > 0)
11516 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11517 aarch64_tune_params.prefetch->num_slots,
11518 opts->x_param_values,
11519 global_options_set.x_param_values);
11520 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11521 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11522 aarch64_tune_params.prefetch->l1_cache_size,
11523 opts->x_param_values,
11524 global_options_set.x_param_values);
11525 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11526 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11527 aarch64_tune_params.prefetch->l1_cache_line_size,
11528 opts->x_param_values,
11529 global_options_set.x_param_values);
11530 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11531 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11532 aarch64_tune_params.prefetch->l2_cache_size,
11533 opts->x_param_values,
11534 global_options_set.x_param_values);
11535 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11536 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11537 0,
11538 opts->x_param_values,
11539 global_options_set.x_param_values);
11540 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11541 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11542 aarch64_tune_params.prefetch->minimum_stride,
11543 opts->x_param_values,
11544 global_options_set.x_param_values);
11545
11546 /* Use the alternative scheduling-pressure algorithm by default. */
11547 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11548 opts->x_param_values,
11549 global_options_set.x_param_values);
11550
11551 /* If the user hasn't changed it via configure then set the default to 64 KB
11552 for the backend. */
11553 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11554 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11555 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11556 opts->x_param_values,
11557 global_options_set.x_param_values);
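  /* Note (added for clarity): the guard-size param is expressed as a
     power-of-two exponent, so the value 16 above corresponds to a 2^16 byte
     (64 KiB) guard region.  */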
11558
11559 /* Validate the guard size. */
11560 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11561
11562 /* Enforce that interval is the same size as size so the mid-end does the
11563 right thing. */
11564 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11565 guard_size,
11566 opts->x_param_values,
11567 global_options_set.x_param_values);
11568
11569 /* The maybe_set calls won't update the value if the user has explicitly set
11570 one. Which means we need to validate that probing interval and guard size
11571 are equal. */
11572 int probe_interval
11573 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11574 if (guard_size != probe_interval)
11575 error ("stack clash guard size '%d' must be equal to probing interval "
11576 "'%d'", guard_size, probe_interval);
11577
11578 /* Enable sw prefetching at specified optimization level for
11579 CPUS that have prefetch. Lower optimization level threshold by 1
11580 when profiling is enabled. */
11581 if (opts->x_flag_prefetch_loop_arrays < 0
11582 && !opts->x_optimize_size
11583 && aarch64_tune_params.prefetch->default_opt_level >= 0
11584 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11585 opts->x_flag_prefetch_loop_arrays = 1;
11586
11587 if (opts->x_aarch64_arch_string == NULL)
11588 opts->x_aarch64_arch_string = selected_arch->name;
11589 if (opts->x_aarch64_cpu_string == NULL)
11590 opts->x_aarch64_cpu_string = selected_cpu->name;
11591 if (opts->x_aarch64_tune_string == NULL)
11592 opts->x_aarch64_tune_string = selected_tune->name;
11593
11594 aarch64_override_options_after_change_1 (opts);
11595 }
11596
11597 /* Print a hint with a suggestion for a core or architecture name that
11598 most closely resembles what the user passed in STR. ARCH is true if
11599 the user is asking for an architecture name. ARCH is false if the user
11600 is asking for a core name. */
11601
11602 static void
11603 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11604 {
11605 auto_vec<const char *> candidates;
11606 const struct processor *entry = arch ? all_architectures : all_cores;
11607 for (; entry->name != NULL; entry++)
11608 candidates.safe_push (entry->name);
11609
11610 #ifdef HAVE_LOCAL_CPU_DETECT
11611 /* Also add "native" as a possible value. */
11612 if (arch)
11613 candidates.safe_push ("native");
11614 #endif
11615
11616 char *s;
11617 const char *hint = candidates_list_and_hint (str, s, candidates);
11618 if (hint)
11619 inform (input_location, "valid arguments are: %s;"
11620 " did you mean %qs?", s, hint);
11621 else
11622 inform (input_location, "valid arguments are: %s", s);
11623
11624 XDELETEVEC (s);
11625 }
11626
11627 /* Print a hint with a suggestion for a core name that most closely resembles
11628 what the user passed in STR. */
11629
11630 inline static void
11631 aarch64_print_hint_for_core (const char *str)
11632 {
11633 aarch64_print_hint_for_core_or_arch (str, false);
11634 }
11635
11636 /* Print a hint with a suggestion for an architecture name that most closely
11637 resembles what the user passed in STR. */
11638
11639 inline static void
11640 aarch64_print_hint_for_arch (const char *str)
11641 {
11642 aarch64_print_hint_for_core_or_arch (str, true);
11643 }
11644
11645
11646 /* Print a hint with a suggestion for an extension name
11647 that most closely resembles what the user passed in STR. */
11648
11649 void
11650 aarch64_print_hint_for_extensions (const std::string &str)
11651 {
11652 auto_vec<const char *> candidates;
11653 aarch64_get_all_extension_candidates (&candidates);
11654 char *s;
11655 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11656 if (hint)
11657 inform (input_location, "valid arguments are: %s;"
11658 " did you mean %qs?", s, hint);
11659 else
11660 inform (input_location, "valid arguments are: %s;", s);
11661
11662 XDELETEVEC (s);
11663 }
11664
11665 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11666 specified in STR and throw errors if appropriate. Put the results if
11667 they are valid in RES and ISA_FLAGS. Return whether the option is
11668 valid. */
11669
11670 static bool
11671 aarch64_validate_mcpu (const char *str, const struct processor **res,
11672 unsigned long *isa_flags)
11673 {
11674 std::string invalid_extension;
11675 enum aarch64_parse_opt_result parse_res
11676 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11677
11678 if (parse_res == AARCH64_PARSE_OK)
11679 return true;
11680
11681 switch (parse_res)
11682 {
11683 case AARCH64_PARSE_MISSING_ARG:
11684 error ("missing cpu name in %<-mcpu=%s%>", str);
11685 break;
11686 case AARCH64_PARSE_INVALID_ARG:
11687 error ("unknown value %qs for -mcpu", str);
11688 aarch64_print_hint_for_core (str);
11689 break;
11690 case AARCH64_PARSE_INVALID_FEATURE:
11691 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11692 invalid_extension.c_str (), str);
11693 aarch64_print_hint_for_extensions (invalid_extension);
11694 break;
11695 default:
11696 gcc_unreachable ();
11697 }
11698
11699 return false;
11700 }
11701
11702 /* Parses CONST_STR for branch protection features specified in
11703 aarch64_branch_protect_types, and sets any global variables required. Returns
11704 the parsing result and assigns LAST_STR to the last processed token from
11705 CONST_STR so that it can be used for error reporting. */
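/* As an illustration (not from the original comment), "pac-ret+leaf+bti"
   splits on '+' into "pac-ret" (a top-level type), "leaf" (a subtype of
   pac-ret) and "bti" (another top-level type).  */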
11706
11707 static enum
11708 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
11709 char** last_str)
11710 {
11711 char *str_root = xstrdup (const_str);
11712 char* token_save = NULL;
11713 char *str = strtok_r (str_root, "+", &token_save);
11714 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11715 if (!str)
11716 res = AARCH64_PARSE_MISSING_ARG;
11717 else
11718 {
11719 char *next_str = strtok_r (NULL, "+", &token_save);
11720 /* Reset the branch protection features to their defaults. */
11721 aarch64_handle_no_branch_protection (NULL, NULL);
11722
11723 while (str && res == AARCH64_PARSE_OK)
11724 {
11725 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11726 bool found = false;
11727 /* Search for this type. */
11728 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11729 {
11730 if (strcmp (str, type->name) == 0)
11731 {
11732 found = true;
11733 res = type->handler (str, next_str);
11734 str = next_str;
11735 next_str = strtok_r (NULL, "+", &token_save);
11736 }
11737 else
11738 type++;
11739 }
11740 if (found && res == AARCH64_PARSE_OK)
11741 {
11742 bool found_subtype = true;
11743 /* Loop through each token until we find one that isn't a
11744 subtype. */
11745 while (found_subtype)
11746 {
11747 found_subtype = false;
11748 const aarch64_branch_protect_type *subtype = type->subtypes;
11749 /* Search for the subtype. */
11750 while (str && subtype && subtype->name && !found_subtype
11751 && res == AARCH64_PARSE_OK)
11752 {
11753 if (strcmp (str, subtype->name) == 0)
11754 {
11755 found_subtype = true;
11756 res = subtype->handler (str, next_str);
11757 str = next_str;
11758 next_str = strtok_r (NULL, "+", &token_save);
11759 }
11760 else
11761 subtype++;
11762 }
11763 }
11764 }
11765 else if (!found)
11766 res = AARCH64_PARSE_INVALID_ARG;
11767 }
11768 }
11769 /* Copy the last processed token into the argument to pass it back.
11770 Used by option and attribute validation to print the offending token. */
11771 if (last_str)
11772 {
11773 if (str) strcpy (*last_str, str);
11774 else *last_str = NULL;
11775 }
11776 if (res == AARCH64_PARSE_OK)
11777 {
11778 /* If needed, alloc the accepted string then copy in const_str.
11779 Used by override_option_after_change_1. */
11780 if (!accepted_branch_protection_string)
11781 accepted_branch_protection_string = (char *) xmalloc (
11782 BRANCH_PROTECT_STR_MAX
11783 + 1);
11784 strncpy (accepted_branch_protection_string, const_str,
11785 BRANCH_PROTECT_STR_MAX + 1);
11786 /* Forcibly null-terminate. */
11787 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11788 }
11789 return res;
11790 }
11791
11792 static bool
11793 aarch64_validate_mbranch_protection (const char *const_str)
11794 {
11795 char *str = (char *) xmalloc (strlen (const_str) + 1);
11796 enum aarch64_parse_opt_result res =
11797 aarch64_parse_branch_protection (const_str, &str);
11798 if (res == AARCH64_PARSE_INVALID_ARG)
11799 error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str);
11800 else if (res == AARCH64_PARSE_MISSING_ARG)
11801 error ("missing arg for %<-mbranch-protection=%>");
11802 free (str);
11803 return res == AARCH64_PARSE_OK;
11804 }
11805
11806 /* Validate a command-line -march option. Parse the arch and extensions
11807 (if any) specified in STR and throw errors if appropriate. Put the
11808 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11809 option is valid. */
11810
11811 static bool
11812 aarch64_validate_march (const char *str, const struct processor **res,
11813 unsigned long *isa_flags)
11814 {
11815 std::string invalid_extension;
11816 enum aarch64_parse_opt_result parse_res
11817 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11818
11819 if (parse_res == AARCH64_PARSE_OK)
11820 return true;
11821
11822 switch (parse_res)
11823 {
11824 case AARCH64_PARSE_MISSING_ARG:
11825 error ("missing arch name in %<-march=%s%>", str);
11826 break;
11827 case AARCH64_PARSE_INVALID_ARG:
11828 error ("unknown value %qs for -march", str);
11829 aarch64_print_hint_for_arch (str);
11830 break;
11831 case AARCH64_PARSE_INVALID_FEATURE:
11832 error ("invalid feature modifier %qs in %<-march=%s%>",
11833 invalid_extension.c_str (), str);
11834 aarch64_print_hint_for_extensions (invalid_extension);
11835 break;
11836 default:
11837 gcc_unreachable ();
11838 }
11839
11840 return false;
11841 }
11842
11843 /* Validate a command-line -mtune option. Parse the cpu
11844 specified in STR and throw errors if appropriate. Put the
11845 result, if it is valid, in RES. Return whether the option is
11846 valid. */
11847
11848 static bool
11849 aarch64_validate_mtune (const char *str, const struct processor **res)
11850 {
11851 enum aarch64_parse_opt_result parse_res
11852 = aarch64_parse_tune (str, res);
11853
11854 if (parse_res == AARCH64_PARSE_OK)
11855 return true;
11856
11857 switch (parse_res)
11858 {
11859 case AARCH64_PARSE_MISSING_ARG:
11860 error ("missing cpu name in %<-mtune=%s%>", str);
11861 break;
11862 case AARCH64_PARSE_INVALID_ARG:
11863 error ("unknown value %qs for -mtune", str);
11864 aarch64_print_hint_for_core (str);
11865 break;
11866 default:
11867 gcc_unreachable ();
11868 }
11869 return false;
11870 }
11871
11872 /* Return the CPU corresponding to the enum CPU.
11873 If it doesn't specify a cpu, return the default. */
11874
11875 static const struct processor *
11876 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11877 {
11878 if (cpu != aarch64_none)
11879 return &all_cores[cpu];
11880
11881 /* The & 0x3f is to extract the bottom 6 bits that encode the
11882 default cpu as selected by the --with-cpu GCC configure option
11883 in config.gcc.
11884 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11885 flags mechanism should be reworked to make it more sane. */
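  /* Concretely, as read from this file: the cpu enum occupies bits [5:0] of
     TARGET_CPU_DEFAULT and the configure-time default ISA flags sit in the
     bits above, hence the TARGET_CPU_DEFAULT >> 6 in aarch64_override_options.  */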
11886 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11887 }
11888
11889 /* Return the architecture corresponding to the enum ARCH.
11890 If it doesn't specify a valid architecture, return the default. */
11891
11892 static const struct processor *
11893 aarch64_get_arch (enum aarch64_arch arch)
11894 {
11895 if (arch != aarch64_no_arch)
11896 return &all_architectures[arch];
11897
11898 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11899
11900 return &all_architectures[cpu->arch];
11901 }
11902
11903 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
11904
11905 static poly_uint16
11906 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
11907 {
11908 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11909 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11910 deciding which .md file patterns to use and when deciding whether
11911 something is a legitimate address or constant. */
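  /* For example (illustrative): -msve-vector-bits=256 gives a fixed VG of
     256 / 64 = 4, i.e. four 64-bit granules per SVE vector.  */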
11912 if (value == SVE_SCALABLE || value == SVE_128)
11913 return poly_uint16 (2, 2);
11914 else
11915 return (int) value / 64;
11916 }
11917
11918 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
11919 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
11920 tuning structs. In particular it must set selected_tune and
11921 aarch64_isa_flags that define the available ISA features and tuning
11922 decisions. It must also set selected_arch as this will be used to
11923 output the .arch asm tags for each function. */
11924
11925 static void
11926 aarch64_override_options (void)
11927 {
11928 unsigned long cpu_isa = 0;
11929 unsigned long arch_isa = 0;
11930 aarch64_isa_flags = 0;
11931
11932 bool valid_cpu = true;
11933 bool valid_tune = true;
11934 bool valid_arch = true;
11935
11936 selected_cpu = NULL;
11937 selected_arch = NULL;
11938 selected_tune = NULL;
11939
11940 if (aarch64_branch_protection_string)
11941 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
11942
11943 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
11944 If either of -march or -mtune is given, they override their
11945 respective component of -mcpu. */
11946 if (aarch64_cpu_string)
11947 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
11948 &cpu_isa);
11949
11950 if (aarch64_arch_string)
11951 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
11952 &arch_isa);
11953
11954 if (aarch64_tune_string)
11955 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
11956
11957 #ifdef SUBTARGET_OVERRIDE_OPTIONS
11958 SUBTARGET_OVERRIDE_OPTIONS;
11959 #endif
11960
11961 /* If the user did not specify a processor, choose the default
11962 one for them. This will be the CPU set during configuration using
11963 --with-cpu, otherwise it is "generic". */
11964 if (!selected_cpu)
11965 {
11966 if (selected_arch)
11967 {
11968 selected_cpu = &all_cores[selected_arch->ident];
11969 aarch64_isa_flags = arch_isa;
11970 explicit_arch = selected_arch->arch;
11971 }
11972 else
11973 {
11974 /* Get default configure-time CPU. */
11975 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
11976 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
11977 }
11978
11979 if (selected_tune)
11980 explicit_tune_core = selected_tune->ident;
11981 }
11982 /* If both -mcpu and -march are specified check that they are architecturally
11983 compatible, warn if they're not and prefer the -march ISA flags. */
11984 else if (selected_arch)
11985 {
11986 if (selected_arch->arch != selected_cpu->arch)
11987 {
11988 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
11989 all_architectures[selected_cpu->arch].name,
11990 selected_arch->name);
11991 }
11992 aarch64_isa_flags = arch_isa;
11993 explicit_arch = selected_arch->arch;
11994 explicit_tune_core = selected_tune ? selected_tune->ident
11995 : selected_cpu->ident;
11996 }
11997 else
11998 {
11999 /* -mcpu but no -march. */
12000 aarch64_isa_flags = cpu_isa;
12001 explicit_tune_core = selected_tune ? selected_tune->ident
12002 : selected_cpu->ident;
12003 gcc_assert (selected_cpu);
12004 selected_arch = &all_architectures[selected_cpu->arch];
12005 explicit_arch = selected_arch->arch;
12006 }
12007
12008 /* Set the arch as well, as we will need it when outputting
12009 the .arch directive in assembly. */
12010 if (!selected_arch)
12011 {
12012 gcc_assert (selected_cpu);
12013 selected_arch = &all_architectures[selected_cpu->arch];
12014 }
12015
12016 if (!selected_tune)
12017 selected_tune = selected_cpu;
12018
12019 if (aarch64_enable_bti == 2)
12020 {
12021 #ifdef TARGET_ENABLE_BTI
12022 aarch64_enable_bti = 1;
12023 #else
12024 aarch64_enable_bti = 0;
12025 #endif
12026 }
12027
12028 /* Return address signing is currently not supported for ILP32 targets. For
12029 LP64 targets use the configured option in the absence of a command-line
12030 option for -mbranch-protection. */
12031 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12032 {
12033 #ifdef TARGET_ENABLE_PAC_RET
12034 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12035 aarch64_ra_sign_key = AARCH64_KEY_A;
12036 #else
12037 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12038 #endif
12039 }
12040
12041 #ifndef HAVE_AS_MABI_OPTION
12042 /* The compiler may have been configured with 2.23.* binutils, which does
12043 not have support for ILP32. */
12044 if (TARGET_ILP32)
12045 error ("assembler does not support -mabi=ilp32");
12046 #endif
12047
12048 /* Convert -msve-vector-bits to a VG count. */
12049 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12050
12051 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12052 sorry ("return address signing is only supported for -mabi=lp64");
12053
12054 /* Make sure we properly set up the explicit options. */
12055 if ((aarch64_cpu_string && valid_cpu)
12056 || (aarch64_tune_string && valid_tune))
12057 gcc_assert (explicit_tune_core != aarch64_none);
12058
12059 if ((aarch64_cpu_string && valid_cpu)
12060 || (aarch64_arch_string && valid_arch))
12061 gcc_assert (explicit_arch != aarch64_no_arch);
12062
12063 /* The pass to insert speculation tracking runs before
12064 shrink-wrapping and the latter does not know how to update the
12065 tracking status. So disable it in this case. */
12066 if (aarch64_track_speculation)
12067 flag_shrink_wrap = 0;
12068
12069 aarch64_override_options_internal (&global_options);
12070
12071 /* Save these options as the default ones in case we push and pop them later
12072 while processing functions with potential target attributes. */
12073 target_option_default_node = target_option_current_node
12074 = build_target_option_node (&global_options);
12075 }
12076
12077 /* Implement targetm.override_options_after_change. */
12078
12079 static void
12080 aarch64_override_options_after_change (void)
12081 {
12082 aarch64_override_options_after_change_1 (&global_options);
12083 }
12084
12085 static struct machine_function *
12086 aarch64_init_machine_status (void)
12087 {
12088 struct machine_function *machine;
12089 machine = ggc_cleared_alloc<machine_function> ();
12090 return machine;
12091 }
12092
12093 void
12094 aarch64_init_expanders (void)
12095 {
12096 init_machine_status = aarch64_init_machine_status;
12097 }
12098
12099 /* A checking mechanism for the implementation of the various code models. */
12100 static void
12101 initialize_aarch64_code_model (struct gcc_options *opts)
12102 {
12103 if (opts->x_flag_pic)
12104 {
12105 switch (opts->x_aarch64_cmodel_var)
12106 {
12107 case AARCH64_CMODEL_TINY:
12108 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12109 break;
12110 case AARCH64_CMODEL_SMALL:
12111 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12112 aarch64_cmodel = (flag_pic == 2
12113 ? AARCH64_CMODEL_SMALL_PIC
12114 : AARCH64_CMODEL_SMALL_SPIC);
12115 #else
12116 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12117 #endif
12118 break;
12119 case AARCH64_CMODEL_LARGE:
12120 sorry ("code model %qs with -f%s", "large",
12121 opts->x_flag_pic > 1 ? "PIC" : "pic");
12122 break;
12123 default:
12124 gcc_unreachable ();
12125 }
12126 }
12127 else
12128 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12129 }
12130
12131 /* Implement TARGET_OPTION_SAVE. */
12132
12133 static void
12134 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12135 {
12136 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12137 ptr->x_aarch64_branch_protection_string
12138 = opts->x_aarch64_branch_protection_string;
12139 }
12140
12141 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12142 using the information saved in PTR. */
12143
12144 static void
12145 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12146 {
12147 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12148 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12149 opts->x_explicit_arch = ptr->x_explicit_arch;
12150 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12151 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12152 opts->x_aarch64_branch_protection_string
12153 = ptr->x_aarch64_branch_protection_string;
12154 if (opts->x_aarch64_branch_protection_string)
12155 {
12156 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12157 NULL);
12158 }
12159
12160 aarch64_override_options_internal (opts);
12161 }
12162
12163 /* Implement TARGET_OPTION_PRINT. */
12164
12165 static void
12166 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12167 {
12168 const struct processor *cpu
12169 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12170 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
12171 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12172 std::string extension
12173 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12174
12175 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12176 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12177 arch->name, extension.c_str ());
12178 }
12179
12180 static GTY(()) tree aarch64_previous_fndecl;
12181
12182 void
12183 aarch64_reset_previous_fndecl (void)
12184 {
12185 aarch64_previous_fndecl = NULL;
12186 }
12187
12188 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12189 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12190 make sure optab availability predicates are recomputed when necessary. */
12191
12192 void
12193 aarch64_save_restore_target_globals (tree new_tree)
12194 {
12195 if (TREE_TARGET_GLOBALS (new_tree))
12196 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12197 else if (new_tree == target_option_default_node)
12198 restore_target_globals (&default_target_globals);
12199 else
12200 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12201 }
12202
12203 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12204 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12205 of the function, if such exists. This function may be called multiple
12206 times on a single function so use aarch64_previous_fndecl to avoid
12207 setting up identical state. */
12208
12209 static void
12210 aarch64_set_current_function (tree fndecl)
12211 {
12212 if (!fndecl || fndecl == aarch64_previous_fndecl)
12213 return;
12214
12215 tree old_tree = (aarch64_previous_fndecl
12216 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12217 : NULL_TREE);
12218
12219 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12220
12221 /* If current function has no attributes but the previous one did,
12222 use the default node. */
12223 if (!new_tree && old_tree)
12224 new_tree = target_option_default_node;
12225
12226 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12227 the default have been handled by aarch64_save_restore_target_globals from
12228 aarch64_pragma_target_parse. */
12229 if (old_tree == new_tree)
12230 return;
12231
12232 aarch64_previous_fndecl = fndecl;
12233
12234 /* First set the target options. */
12235 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12236
12237 aarch64_save_restore_target_globals (new_tree);
12238 }
12239
12240 /* Enum describing the various ways we can handle attributes.
12241 In many cases we can reuse the generic option handling machinery. */
12242
12243 enum aarch64_attr_opt_type
12244 {
12245 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12246 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12247 aarch64_attr_enum, /* Attribute sets an enum variable. */
12248 aarch64_attr_custom /* Attribute requires a custom handling function. */
12249 };
12250
12251 /* All the information needed to handle a target attribute.
12252 NAME is the name of the attribute.
12253 ATTR_TYPE specifies the type of behavior of the attribute as described
12254 in the definition of enum aarch64_attr_opt_type.
12255 ALLOW_NEG is true if the attribute supports a "no-" form.
12256 HANDLER is the function that takes the attribute string as an argument.
12257 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12258 OPT_NUM is the enum specifying the option that the attribute modifies.
12259 This is needed for attributes that mirror the behavior of a command-line
12260 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12261 aarch64_attr_enum. */
12262
12263 struct aarch64_attribute_info
12264 {
12265 const char *name;
12266 enum aarch64_attr_opt_type attr_type;
12267 bool allow_neg;
12268 bool (*handler) (const char *);
12269 enum opt_code opt_num;
12270 };
12271
12272 /* Handle the ARCH_STR argument to the arch= target attribute. */
12273
12274 static bool
12275 aarch64_handle_attr_arch (const char *str)
12276 {
12277 const struct processor *tmp_arch = NULL;
12278 std::string invalid_extension;
12279 enum aarch64_parse_opt_result parse_res
12280 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12281
12282 if (parse_res == AARCH64_PARSE_OK)
12283 {
12284 gcc_assert (tmp_arch);
12285 selected_arch = tmp_arch;
12286 explicit_arch = selected_arch->arch;
12287 return true;
12288 }
12289
12290 switch (parse_res)
12291 {
12292 case AARCH64_PARSE_MISSING_ARG:
12293 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12294 break;
12295 case AARCH64_PARSE_INVALID_ARG:
12296 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12297 aarch64_print_hint_for_arch (str);
12298 break;
12299 case AARCH64_PARSE_INVALID_FEATURE:
12300 error ("invalid feature modifier %s of value (\"%s\") in "
12301 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12302 aarch64_print_hint_for_extensions (invalid_extension);
12303 break;
12304 default:
12305 gcc_unreachable ();
12306 }
12307
12308 return false;
12309 }
12310
12311 /* Handle the argument CPU_STR to the cpu= target attribute. */
12312
12313 static bool
12314 aarch64_handle_attr_cpu (const char *str)
12315 {
12316 const struct processor *tmp_cpu = NULL;
12317 std::string invalid_extension;
12318 enum aarch64_parse_opt_result parse_res
12319 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12320
12321 if (parse_res == AARCH64_PARSE_OK)
12322 {
12323 gcc_assert (tmp_cpu);
12324 selected_tune = tmp_cpu;
12325 explicit_tune_core = selected_tune->ident;
12326
12327 selected_arch = &all_architectures[tmp_cpu->arch];
12328 explicit_arch = selected_arch->arch;
12329 return true;
12330 }
12331
12332 switch (parse_res)
12333 {
12334 case AARCH64_PARSE_MISSING_ARG:
12335 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12336 break;
12337 case AARCH64_PARSE_INVALID_ARG:
12338 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12339 aarch64_print_hint_for_core (str);
12340 break;
12341 case AARCH64_PARSE_INVALID_FEATURE:
12342 error ("invalid feature modifier %s of value (\"%s\") in "
12343 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12344 aarch64_print_hint_for_extensions (invalid_extension);
12345 break;
12346 default:
12347 gcc_unreachable ();
12348 }
12349
12350 return false;
12351 }
12352
12353 /* Handle the argument STR to the branch-protection= attribute. */
12354
12355 static bool
12356 aarch64_handle_attr_branch_protection (const char* str)
12357 {
12358 char *err_str = (char *) xmalloc (strlen (str) + 1);
12359 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12360 &err_str);
12361 bool success = false;
12362 switch (res)
12363 {
12364 case AARCH64_PARSE_MISSING_ARG:
12365 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12366 " attribute");
12367 break;
12368 case AARCH64_PARSE_INVALID_ARG:
12369 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12370 "=\")%> pragma or attribute", err_str);
12371 break;
12372 case AARCH64_PARSE_OK:
12373 success = true;
12374 /* Fall through. */
12375 case AARCH64_PARSE_INVALID_FEATURE:
12376 break;
12377 default:
12378 gcc_unreachable ();
12379 }
12380 free (err_str);
12381 return success;
12382 }
12383
12384 /* Handle the argument STR to the tune= target attribute. */
12385
12386 static bool
12387 aarch64_handle_attr_tune (const char *str)
12388 {
12389 const struct processor *tmp_tune = NULL;
12390 enum aarch64_parse_opt_result parse_res
12391 = aarch64_parse_tune (str, &tmp_tune);
12392
12393 if (parse_res == AARCH64_PARSE_OK)
12394 {
12395 gcc_assert (tmp_tune);
12396 selected_tune = tmp_tune;
12397 explicit_tune_core = selected_tune->ident;
12398 return true;
12399 }
12400
12401 switch (parse_res)
12402 {
12403 case AARCH64_PARSE_INVALID_ARG:
12404 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12405 aarch64_print_hint_for_core (str);
12406 break;
12407 default:
12408 gcc_unreachable ();
12409 }
12410
12411 return false;
12412 }
12413
12414 /* Parse an architecture extensions target attribute string specified in STR.
12415 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12416 if successful. Update aarch64_isa_flags to reflect the ISA features
12417 modified. */
12418
12419 static bool
12420 aarch64_handle_attr_isa_flags (char *str)
12421 {
12422 enum aarch64_parse_opt_result parse_res;
12423 unsigned long isa_flags = aarch64_isa_flags;
12424
12425 /* We allow "+nothing" in the beginning to clear out all architectural
12426 features if the user wants to handpick specific features. */
12427 if (strncmp ("+nothing", str, 8) == 0)
12428 {
12429 isa_flags = 0;
12430 str += 8;
12431 }
12432
12433 std::string invalid_extension;
12434 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12435
12436 if (parse_res == AARCH64_PARSE_OK)
12437 {
12438 aarch64_isa_flags = isa_flags;
12439 return true;
12440 }
12441
12442 switch (parse_res)
12443 {
12444 case AARCH64_PARSE_MISSING_ARG:
12445 error ("missing value in %<target()%> pragma or attribute");
12446 break;
12447
12448 case AARCH64_PARSE_INVALID_FEATURE:
12449 error ("invalid feature modifier %s of value (\"%s\") in "
12450 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12451 break;
12452
12453 default:
12454 gcc_unreachable ();
12455 }
12456
12457 return false;
12458 }
12459
12460 /* The target attributes that we support. On top of these we also support just
12461 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12462 handled explicitly in aarch64_process_one_target_attr. */
12463
12464 static const struct aarch64_attribute_info aarch64_attributes[] =
12465 {
12466 { "general-regs-only", aarch64_attr_mask, false, NULL,
12467 OPT_mgeneral_regs_only },
12468 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12469 OPT_mfix_cortex_a53_835769 },
12470 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12471 OPT_mfix_cortex_a53_843419 },
12472 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12473 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12474 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12475 OPT_momit_leaf_frame_pointer },
12476 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12477 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12478 OPT_march_ },
12479 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12480 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12481 OPT_mtune_ },
12482 { "branch-protection", aarch64_attr_custom, false,
12483 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12484 { "sign-return-address", aarch64_attr_enum, false, NULL,
12485 OPT_msign_return_address_ },
12486 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12487 };
12488
12489 /* Parse ARG_STR which contains the definition of one target attribute.
12490 Show appropriate errors if any or return true if the attribute is valid. */
12491
12492 static bool
12493 aarch64_process_one_target_attr (char *arg_str)
12494 {
12495 bool invert = false;
12496
12497 size_t len = strlen (arg_str);
12498
12499 if (len == 0)
12500 {
12501 error ("malformed %<target()%> pragma or attribute");
12502 return false;
12503 }
12504
12505 char *str_to_check = (char *) alloca (len + 1);
12506 strcpy (str_to_check, arg_str);
12507
12508 /* Skip leading whitespace. */
12509 while (*str_to_check == ' ' || *str_to_check == '\t')
12510 str_to_check++;
12511
12512 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12513 It is easier to detect and handle it explicitly here rather than going
12514 through the machinery for the rest of the target attributes in this
12515 function. */
12516 if (*str_to_check == '+')
12517 return aarch64_handle_attr_isa_flags (str_to_check);
12518
12519 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12520 {
12521 invert = true;
12522 str_to_check += 3;
12523 }
12524 char *arg = strchr (str_to_check, '=');
12525
12526 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12527 and point ARG to "foo". */
12528 if (arg)
12529 {
12530 *arg = '\0';
12531 arg++;
12532 }
12533 const struct aarch64_attribute_info *p_attr;
12534 bool found = false;
12535 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12536 {
12537 /* If the names don't match up, or the user has given an argument
12538 to an attribute that doesn't accept one, or didn't give an argument
12539 to an attribute that expects one, fail to match. */
12540 if (strcmp (str_to_check, p_attr->name) != 0)
12541 continue;
12542
12543 found = true;
12544 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12545 || p_attr->attr_type == aarch64_attr_enum;
12546
12547 if (attr_need_arg_p ^ (arg != NULL))
12548 {
12549 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12550 return false;
12551 }
12552
12553 /* If the name matches but the attribute does not allow "no-" versions
12554 then we can't match. */
12555 if (invert && !p_attr->allow_neg)
12556 {
12557 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12558 return false;
12559 }
12560
12561 switch (p_attr->attr_type)
12562 {
12563 /* Has a custom handler registered.
12564 For example, cpu=, arch=, tune=. */
12565 case aarch64_attr_custom:
12566 gcc_assert (p_attr->handler);
12567 if (!p_attr->handler (arg))
12568 return false;
12569 break;
12570
12571 /* Either set or unset a boolean option. */
12572 case aarch64_attr_bool:
12573 {
12574 struct cl_decoded_option decoded;
12575
12576 generate_option (p_attr->opt_num, NULL, !invert,
12577 CL_TARGET, &decoded);
12578 aarch64_handle_option (&global_options, &global_options_set,
12579 &decoded, input_location);
12580 break;
12581 }
12582 /* Set or unset a bit in the target_flags. aarch64_handle_option
12583 should know what mask to apply given the option number. */
12584 case aarch64_attr_mask:
12585 {
12586 struct cl_decoded_option decoded;
12587 /* We only need to specify the option number.
12588 aarch64_handle_option will know which mask to apply. */
12589 decoded.opt_index = p_attr->opt_num;
12590 decoded.value = !invert;
12591 aarch64_handle_option (&global_options, &global_options_set,
12592 &decoded, input_location);
12593 break;
12594 }
12595 /* Use the option setting machinery to set an option to an enum. */
12596 case aarch64_attr_enum:
12597 {
12598 gcc_assert (arg);
12599 bool valid;
12600 int value;
12601 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12602 &value, CL_TARGET);
12603 if (valid)
12604 {
12605 set_option (&global_options, NULL, p_attr->opt_num, value,
12606 NULL, DK_UNSPECIFIED, input_location,
12607 global_dc);
12608 }
12609 else
12610 {
12611 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12612 }
12613 break;
12614 }
12615 default:
12616 gcc_unreachable ();
12617 }
12618 }
12619
12620 /* If we reached here we either have found an attribute and validated
12621 it or didn't match any. If we matched an attribute but its arguments
12622 were malformed we will have returned false already. */
12623 return found;
12624 }
12625
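/* For example, "no-strict-align" matches the "strict-align" entry in
   aarch64_attributes above (an aarch64_attr_mask entry that allows
   negation) and clears the corresponding target_flags bit, whereas
   "tune=cortex-a57" is dispatched to aarch64_handle_attr_tune with ARG
   pointing at "cortex-a57".  */
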
12626 /* Count how many times the character C appears in
12627 NULL-terminated string STR. */
12628
12629 static unsigned int
12630 num_occurences_in_str (char c, char *str)
12631 {
12632 unsigned int res = 0;
12633 while (*str != '\0')
12634 {
12635 if (*str == c)
12636 res++;
12637
12638 str++;
12639 }
12640
12641 return res;
12642 }
12643
12644 /* Parse the tree in ARGS that contains the target attribute information
12645 and update the global target options space. */
12646
12647 bool
12648 aarch64_process_target_attr (tree args)
12649 {
12650 if (TREE_CODE (args) == TREE_LIST)
12651 {
12652 do
12653 {
12654 tree head = TREE_VALUE (args);
12655 if (head)
12656 {
12657 if (!aarch64_process_target_attr (head))
12658 return false;
12659 }
12660 args = TREE_CHAIN (args);
12661 } while (args);
12662
12663 return true;
12664 }
12665
12666 if (TREE_CODE (args) != STRING_CST)
12667 {
12668 error ("attribute %<target%> argument not a string");
12669 return false;
12670 }
12671
12672 size_t len = strlen (TREE_STRING_POINTER (args));
12673 char *str_to_check = (char *) alloca (len + 1);
12674 strcpy (str_to_check, TREE_STRING_POINTER (args));
12675
12676 if (len == 0)
12677 {
12678 error ("malformed %<target()%> pragma or attribute");
12679 return false;
12680 }
12681
12682 /* Used to catch empty strings between commas, e.g.
12683 attribute ((target ("attr1,,attr2"))). */
12684 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12685
12686 /* Handle multiple target attributes separated by ','. */
12687 char *token = strtok_r (str_to_check, ",", &str_to_check);
12688
12689 unsigned int num_attrs = 0;
12690 while (token)
12691 {
12692 num_attrs++;
12693 if (!aarch64_process_one_target_attr (token))
12694 {
12695 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12696 return false;
12697 }
12698
12699 token = strtok_r (NULL, ",", &str_to_check);
12700 }
12701
12702 if (num_attrs != num_commas + 1)
12703 {
12704 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12705 return false;
12706 }
12707
12708 return true;
12709 }
12710
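/* For example, target ("arch=armv8-a,strict-align") is split on ',' into
   two tokens, each handled by aarch64_process_one_target_attr above, while
   "arch=armv8-a,,strict-align" is rejected because the empty token makes
   NUM_ATTRS != NUM_COMMAS + 1.  */
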
12711 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12712 process attribute ((target ("..."))). */
12713
12714 static bool
12715 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12716 {
12717 struct cl_target_option cur_target;
12718 bool ret;
12719 tree old_optimize;
12720 tree new_target, new_optimize;
12721 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12722
12723 /* If what we're processing is the current pragma string then the
12724 target option node is already stored in target_option_current_node
12725 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12726 having to re-parse the string. This is especially useful to keep
12727 arm_neon.h compile times down since that header contains a lot
12728 of intrinsics enclosed in pragmas. */
12729 if (!existing_target && args == current_target_pragma)
12730 {
12731 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12732 return true;
12733 }
12734 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12735
12736 old_optimize = build_optimization_node (&global_options);
12737 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12738
12739 /* If the function changed the optimization levels as well as setting
12740 target options, start with the optimizations specified. */
12741 if (func_optimize && func_optimize != old_optimize)
12742 cl_optimization_restore (&global_options,
12743 TREE_OPTIMIZATION (func_optimize));
12744
12745 /* Save the current target options to restore at the end. */
12746 cl_target_option_save (&cur_target, &global_options);
12747
12748 /* If fndecl already has some target attributes applied to it, unpack
12749 them so that we add this attribute on top of them, rather than
12750 overwriting them. */
12751 if (existing_target)
12752 {
12753 struct cl_target_option *existing_options
12754 = TREE_TARGET_OPTION (existing_target);
12755
12756 if (existing_options)
12757 cl_target_option_restore (&global_options, existing_options);
12758 }
12759 else
12760 cl_target_option_restore (&global_options,
12761 TREE_TARGET_OPTION (target_option_current_node));
12762
12763 ret = aarch64_process_target_attr (args);
12764
12765 /* Set up any additional state. */
12766 if (ret)
12767 {
12768 aarch64_override_options_internal (&global_options);
12769 /* Initialize SIMD builtins if we haven't already.
12770 Set current_target_pragma to NULL for the duration so that
12771 the builtin initialization code doesn't try to tag the functions
12772 being built with the attributes specified by any current pragma, thus
12773 going into an infinite recursion. */
12774 if (TARGET_SIMD)
12775 {
12776 tree saved_current_target_pragma = current_target_pragma;
12777 current_target_pragma = NULL;
12778 aarch64_init_simd_builtins ();
12779 current_target_pragma = saved_current_target_pragma;
12780 }
12781 new_target = build_target_option_node (&global_options);
12782 }
12783 else
12784 new_target = NULL;
12785
12786 new_optimize = build_optimization_node (&global_options);
12787
12788 if (fndecl && ret)
12789 {
12790 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12791
12792 if (old_optimize != new_optimize)
12793 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12794 }
12795
12796 cl_target_option_restore (&global_options, &cur_target);
12797
12798 if (old_optimize != new_optimize)
12799 cl_optimization_restore (&global_options,
12800 TREE_OPTIMIZATION (old_optimize));
12801 return ret;
12802 }
12803
12804 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
12805 tri-bool options (yes, no, don't care) and the default value is
12806 DEF, determine whether inlining is allowed. */
12807
12808 static bool
12809 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12810 int dont_care, int def)
12811 {
12812 /* If the callee doesn't care, always allow inlining. */
12813 if (callee == dont_care)
12814 return true;
12815
12816 /* If the caller doesn't care, always allow inlining. */
12817 if (caller == dont_care)
12818 return true;
12819
12820 /* Otherwise, allow inlining if either the callee and caller values
12821 agree, or if the callee is using the default value. */
12822 return (callee == caller || callee == def);
12823 }
12824
12825 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12826 to inline CALLEE into CALLER based on target-specific info.
12827 Make sure that the caller and callee have compatible architectural
12828 features. Then go through the other possible target attributes
12829 and see if they can block inlining. Try not to reject always_inline
12830 callees unless they are incompatible architecturally. */
12831
12832 static bool
12833 aarch64_can_inline_p (tree caller, tree callee)
12834 {
12835 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12836 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12837
12838 struct cl_target_option *caller_opts
12839 = TREE_TARGET_OPTION (caller_tree ? caller_tree
12840 : target_option_default_node);
12841
12842 struct cl_target_option *callee_opts
12843 = TREE_TARGET_OPTION (callee_tree ? callee_tree
12844 : target_option_default_node);
12845
12846 /* Callee's ISA flags should be a subset of the caller's. */
12847 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12848 != callee_opts->x_aarch64_isa_flags)
12849 return false;
12850
12851 /* Allow a non-strict-align function to be inlined into a
12852 strict-align one. */
12853 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12854 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12855 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12856 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12857 return false;
12858
12859 bool always_inline = lookup_attribute ("always_inline",
12860 DECL_ATTRIBUTES (callee));
12861
12862 /* If the architectural features match up and the callee is always_inline
12863 then the other attributes don't matter. */
12864 if (always_inline)
12865 return true;
12866
12867 if (caller_opts->x_aarch64_cmodel_var
12868 != callee_opts->x_aarch64_cmodel_var)
12869 return false;
12870
12871 if (caller_opts->x_aarch64_tls_dialect
12872 != callee_opts->x_aarch64_tls_dialect)
12873 return false;
12874
12875 /* Honour explicit requests to work around errata. */
12876 if (!aarch64_tribools_ok_for_inlining_p (
12877 caller_opts->x_aarch64_fix_a53_err835769,
12878 callee_opts->x_aarch64_fix_a53_err835769,
12879 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12880 return false;
12881
12882 if (!aarch64_tribools_ok_for_inlining_p (
12883 caller_opts->x_aarch64_fix_a53_err843419,
12884 callee_opts->x_aarch64_fix_a53_err843419,
12885 2, TARGET_FIX_ERR_A53_843419))
12886 return false;
12887
12888 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12889 caller and callee and they don't match up, reject inlining. */
12890 if (!aarch64_tribools_ok_for_inlining_p (
12891 caller_opts->x_flag_omit_leaf_frame_pointer,
12892 callee_opts->x_flag_omit_leaf_frame_pointer,
12893 2, 1))
12894 return false;
12895
12896 /* If the callee has specific tuning overrides, respect them. */
12897 if (callee_opts->x_aarch64_override_tune_string != NULL
12898 && caller_opts->x_aarch64_override_tune_string == NULL)
12899 return false;
12900
12901 /* If the user specified tuning override strings for the
12902 caller and callee and they don't match up, reject inlining.
12903 We just do a string compare here, we don't analyze the meaning
12904 of the string, as it would be too costly for little gain. */
12905 if (callee_opts->x_aarch64_override_tune_string
12906 && caller_opts->x_aarch64_override_tune_string
12907 && (strcmp (callee_opts->x_aarch64_override_tune_string,
12908 caller_opts->x_aarch64_override_tune_string) != 0))
12909 return false;
12910
12911 return true;
12912 }
12913
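/* A small illustration of the subset rule above: a callee compiled with
   target ("+sve") cannot be inlined into a caller built without SVE, since
   the caller's ISA flags would not cover the callee's; the reverse
   direction is allowed.  An always_inline callee skips the softer checks
   (cmodel, TLS dialect, errata workarounds, tuning) but not the ISA and
   strict-align checks.  */
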
12914 /* Return true if SYMBOL_REF X binds locally. */
12915
12916 static bool
12917 aarch64_symbol_binds_local_p (const_rtx x)
12918 {
12919 return (SYMBOL_REF_DECL (x)
12920 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
12921 : SYMBOL_REF_LOCAL_P (x));
12922 }
12923
12924 /* Return true if SYMBOL_REF X is thread local. */
12925 static bool
12926 aarch64_tls_symbol_p (rtx x)
12927 {
12928 if (! TARGET_HAVE_TLS)
12929 return false;
12930
12931 if (GET_CODE (x) != SYMBOL_REF)
12932 return false;
12933
12934 return SYMBOL_REF_TLS_MODEL (x) != 0;
12935 }
12936
12937 /* Classify a TLS symbol into one of the TLS kinds. */
12938 enum aarch64_symbol_type
12939 aarch64_classify_tls_symbol (rtx x)
12940 {
12941 enum tls_model tls_kind = tls_symbolic_operand_type (x);
12942
12943 switch (tls_kind)
12944 {
12945 case TLS_MODEL_GLOBAL_DYNAMIC:
12946 case TLS_MODEL_LOCAL_DYNAMIC:
12947 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
12948
12949 case TLS_MODEL_INITIAL_EXEC:
12950 switch (aarch64_cmodel)
12951 {
12952 case AARCH64_CMODEL_TINY:
12953 case AARCH64_CMODEL_TINY_PIC:
12954 return SYMBOL_TINY_TLSIE;
12955 default:
12956 return SYMBOL_SMALL_TLSIE;
12957 }
12958
12959 case TLS_MODEL_LOCAL_EXEC:
12960 if (aarch64_tls_size == 12)
12961 return SYMBOL_TLSLE12;
12962 else if (aarch64_tls_size == 24)
12963 return SYMBOL_TLSLE24;
12964 else if (aarch64_tls_size == 32)
12965 return SYMBOL_TLSLE32;
12966 else if (aarch64_tls_size == 48)
12967 return SYMBOL_TLSLE48;
12968 else
12969 gcc_unreachable ();
12970
12971 case TLS_MODEL_EMULATED:
12972 case TLS_MODEL_NONE:
12973 return SYMBOL_FORCE_TO_MEM;
12974
12975 default:
12976 gcc_unreachable ();
12977 }
12978 }
12979
12980 /* Return the correct method for accessing X + OFFSET, where X is either
12981 a SYMBOL_REF or LABEL_REF. */
12982
12983 enum aarch64_symbol_type
12984 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
12985 {
12986 if (GET_CODE (x) == LABEL_REF)
12987 {
12988 switch (aarch64_cmodel)
12989 {
12990 case AARCH64_CMODEL_LARGE:
12991 return SYMBOL_FORCE_TO_MEM;
12992
12993 case AARCH64_CMODEL_TINY_PIC:
12994 case AARCH64_CMODEL_TINY:
12995 return SYMBOL_TINY_ABSOLUTE;
12996
12997 case AARCH64_CMODEL_SMALL_SPIC:
12998 case AARCH64_CMODEL_SMALL_PIC:
12999 case AARCH64_CMODEL_SMALL:
13000 return SYMBOL_SMALL_ABSOLUTE;
13001
13002 default:
13003 gcc_unreachable ();
13004 }
13005 }
13006
13007 if (GET_CODE (x) == SYMBOL_REF)
13008 {
13009 if (aarch64_tls_symbol_p (x))
13010 return aarch64_classify_tls_symbol (x);
13011
13012 switch (aarch64_cmodel)
13013 {
13014 case AARCH64_CMODEL_TINY:
13015 /* When we retrieve symbol + offset address, we have to make sure
13016 the offset does not cause overflow of the final address. But
13017 we have no way of knowing the address of symbol at compile time
13018 so we can't accurately say if the distance between the PC and
13019 symbol + offset is outside the addressable range of +/-1M in the
13020 TINY code model. So we rely on images not being greater than
13021 1M and cap the offset at 1M and anything beyond 1M will have to
13022 be loaded using an alternative mechanism. Furthermore if the
13023 symbol is a weak reference to something that isn't known to
13024 resolve to a symbol in this module, then force to memory. */
13025 if ((SYMBOL_REF_WEAK (x)
13026 && !aarch64_symbol_binds_local_p (x))
13027 || !IN_RANGE (offset, -1048575, 1048575))
13028 return SYMBOL_FORCE_TO_MEM;
13029 return SYMBOL_TINY_ABSOLUTE;
13030
13031 case AARCH64_CMODEL_SMALL:
13032 /* Same reasoning as the tiny code model, but the offset cap here is
13033 4G. */
13034 if ((SYMBOL_REF_WEAK (x)
13035 && !aarch64_symbol_binds_local_p (x))
13036 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13037 HOST_WIDE_INT_C (4294967264)))
13038 return SYMBOL_FORCE_TO_MEM;
13039 return SYMBOL_SMALL_ABSOLUTE;
13040
13041 case AARCH64_CMODEL_TINY_PIC:
13042 if (!aarch64_symbol_binds_local_p (x))
13043 return SYMBOL_TINY_GOT;
13044 return SYMBOL_TINY_ABSOLUTE;
13045
13046 case AARCH64_CMODEL_SMALL_SPIC:
13047 case AARCH64_CMODEL_SMALL_PIC:
13048 if (!aarch64_symbol_binds_local_p (x))
13049 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13050 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13051 return SYMBOL_SMALL_ABSOLUTE;
13052
13053 case AARCH64_CMODEL_LARGE:
13054 /* This is alright even in PIC code as the constant
13055 pool reference is always PC relative and within
13056 the same translation unit. */
13057 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13058 return SYMBOL_SMALL_ABSOLUTE;
13059 else
13060 return SYMBOL_FORCE_TO_MEM;
13061
13062 default:
13063 gcc_unreachable ();
13064 }
13065 }
13066
13067 /* By default push everything into the constant pool. */
13068 return SYMBOL_FORCE_TO_MEM;
13069 }
13070
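/* For instance, under the tiny code model a reference to SYM + 0x200000 is
   classified as SYMBOL_FORCE_TO_MEM because the offset exceeds the +/-1M
   cap above, whereas SYM + 0x1000 remains SYMBOL_TINY_ABSOLUTE.  */
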
13071 bool
13072 aarch64_constant_address_p (rtx x)
13073 {
13074 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13075 }
13076
13077 bool
13078 aarch64_legitimate_pic_operand_p (rtx x)
13079 {
13080 if (GET_CODE (x) == SYMBOL_REF
13081 || (GET_CODE (x) == CONST
13082 && GET_CODE (XEXP (x, 0)) == PLUS
13083 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13084 return false;
13085
13086 return true;
13087 }
13088
13089 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13090 that should be rematerialized rather than spilled. */
13091
13092 static bool
13093 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13094 {
13095 /* Support CSE and rematerialization of common constants. */
13096 if (CONST_INT_P (x)
13097 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13098 || GET_CODE (x) == CONST_VECTOR)
13099 return true;
13100
13101 /* Do not allow vector struct mode constants for Advanced SIMD.
13102 We could support 0 and -1 easily, but they need support in
13103 aarch64-simd.md. */
13104 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13105 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13106 return false;
13107
13108 /* Only accept variable-length vector constants if they can be
13109 handled directly.
13110
13111 ??? It would be possible to handle rematerialization of other
13112 constants via secondary reloads. */
13113 if (vec_flags & VEC_ANY_SVE)
13114 return aarch64_simd_valid_immediate (x, NULL);
13115
13116 if (GET_CODE (x) == HIGH)
13117 x = XEXP (x, 0);
13118
13119 /* Accept polynomial constants that can be calculated by using the
13120 destination of a move as the sole temporary. Constants that
13121 require a second temporary cannot be rematerialized (they can't be
13122 forced to memory and also aren't legitimate constants). */
13123 poly_int64 offset;
13124 if (poly_int_rtx_p (x, &offset))
13125 return aarch64_offset_temporaries (false, offset) <= 1;
13126
13127 /* If an offset is being added to something else, we need to allow the
13128 base to be moved into the destination register, meaning that there
13129 are no free temporaries for the offset. */
13130 x = strip_offset (x, &offset);
13131 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13132 return false;
13133
13134 /* Do not allow const (plus (anchor_symbol, const_int)). */
13135 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13136 return false;
13137
13138 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13139 so spilling them is better than rematerialization. */
13140 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13141 return true;
13142
13143 /* Label references are always constant. */
13144 if (GET_CODE (x) == LABEL_REF)
13145 return true;
13146
13147 return false;
13148 }
13149
13150 rtx
13151 aarch64_load_tp (rtx target)
13152 {
13153 if (!target
13154 || GET_MODE (target) != Pmode
13155 || !register_operand (target, Pmode))
13156 target = gen_reg_rtx (Pmode);
13157
13158 /* Can return in any reg. */
13159 emit_insn (gen_aarch64_load_tp_hard (target));
13160 return target;
13161 }
13162
13163 /* On AAPCS systems, this is the "struct __va_list". */
13164 static GTY(()) tree va_list_type;
13165
13166 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13167 Return the type to use as __builtin_va_list.
13168
13169 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13170
13171 struct __va_list
13172 {
13173 void *__stack;
13174 void *__gr_top;
13175 void *__vr_top;
13176 int __gr_offs;
13177 int __vr_offs;
13178 }; */
13179
13180 static tree
13181 aarch64_build_builtin_va_list (void)
13182 {
13183 tree va_list_name;
13184 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13185
13186 /* Create the type. */
13187 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13188 /* Give it the required name. */
13189 va_list_name = build_decl (BUILTINS_LOCATION,
13190 TYPE_DECL,
13191 get_identifier ("__va_list"),
13192 va_list_type);
13193 DECL_ARTIFICIAL (va_list_name) = 1;
13194 TYPE_NAME (va_list_type) = va_list_name;
13195 TYPE_STUB_DECL (va_list_type) = va_list_name;
13196
13197 /* Create the fields. */
13198 f_stack = build_decl (BUILTINS_LOCATION,
13199 FIELD_DECL, get_identifier ("__stack"),
13200 ptr_type_node);
13201 f_grtop = build_decl (BUILTINS_LOCATION,
13202 FIELD_DECL, get_identifier ("__gr_top"),
13203 ptr_type_node);
13204 f_vrtop = build_decl (BUILTINS_LOCATION,
13205 FIELD_DECL, get_identifier ("__vr_top"),
13206 ptr_type_node);
13207 f_groff = build_decl (BUILTINS_LOCATION,
13208 FIELD_DECL, get_identifier ("__gr_offs"),
13209 integer_type_node);
13210 f_vroff = build_decl (BUILTINS_LOCATION,
13211 FIELD_DECL, get_identifier ("__vr_offs"),
13212 integer_type_node);
13213
13214 /* Tell the tree-stdarg pass about our internal offset fields.
13215 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13216 purposes, to identify whether the code is updating the va_list internal
13217 offset fields in an irregular way. */
13218 va_list_gpr_counter_field = f_groff;
13219 va_list_fpr_counter_field = f_vroff;
13220
13221 DECL_ARTIFICIAL (f_stack) = 1;
13222 DECL_ARTIFICIAL (f_grtop) = 1;
13223 DECL_ARTIFICIAL (f_vrtop) = 1;
13224 DECL_ARTIFICIAL (f_groff) = 1;
13225 DECL_ARTIFICIAL (f_vroff) = 1;
13226
13227 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13228 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13229 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13230 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13231 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13232
13233 TYPE_FIELDS (va_list_type) = f_stack;
13234 DECL_CHAIN (f_stack) = f_grtop;
13235 DECL_CHAIN (f_grtop) = f_vrtop;
13236 DECL_CHAIN (f_vrtop) = f_groff;
13237 DECL_CHAIN (f_groff) = f_vroff;
13238
13239 /* Compute its layout. */
13240 layout_type (va_list_type);
13241
13242 return va_list_type;
13243 }
13244
13245 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13246 static void
13247 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13248 {
13249 const CUMULATIVE_ARGS *cum;
13250 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13251 tree stack, grtop, vrtop, groff, vroff;
13252 tree t;
13253 int gr_save_area_size = cfun->va_list_gpr_size;
13254 int vr_save_area_size = cfun->va_list_fpr_size;
13255 int vr_offset;
13256
13257 cum = &crtl->args.info;
13258 if (cfun->va_list_gpr_size)
13259 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13260 cfun->va_list_gpr_size);
13261 if (cfun->va_list_fpr_size)
13262 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13263 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13264
13265 if (!TARGET_FLOAT)
13266 {
13267 gcc_assert (cum->aapcs_nvrn == 0);
13268 vr_save_area_size = 0;
13269 }
13270
13271 f_stack = TYPE_FIELDS (va_list_type_node);
13272 f_grtop = DECL_CHAIN (f_stack);
13273 f_vrtop = DECL_CHAIN (f_grtop);
13274 f_groff = DECL_CHAIN (f_vrtop);
13275 f_vroff = DECL_CHAIN (f_groff);
13276
13277 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13278 NULL_TREE);
13279 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13280 NULL_TREE);
13281 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13282 NULL_TREE);
13283 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13284 NULL_TREE);
13285 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13286 NULL_TREE);
13287
13288 /* Emit code to initialize STACK, which points to the next varargs stack
13289 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13290 by named arguments. STACK is 8-byte aligned. */
13291 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13292 if (cum->aapcs_stack_size > 0)
13293 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13294 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13295 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13296
13297 /* Emit code to initialize GRTOP, the top of the GR save area.
13298 virtual_incoming_args_rtx should have been 16 byte aligned. */
13299 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13300 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13301 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13302
13303 /* Emit code to initialize VRTOP, the top of the VR save area.
13304 This address is gr_save_area_size bytes below GRTOP, rounded
13305 down to the next 16-byte boundary. */
13306 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13307 vr_offset = ROUND_UP (gr_save_area_size,
13308 STACK_BOUNDARY / BITS_PER_UNIT);
13309
13310 if (vr_offset)
13311 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13312 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13313 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13314
13315 /* Emit code to initialize GROFF, the offset from GRTOP of the
13316 next GPR argument. */
13317 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13318 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13319 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13320
13321 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13322 of the next VR argument. */
13323 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13324 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13325 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13326 }
13327
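/* A sketch of the resulting va_list, assuming two GP and one FP named
   argument registers were consumed and the stdarg pass did not shrink the
   save areas:

     __stack   points just past the named stack arguments
     __gr_top  == virtual_incoming_args_rtx
     __vr_top  == __gr_top - ROUND_UP (6 * UNITS_PER_WORD, 16)
     __gr_offs == -6 * UNITS_PER_WORD    (six x-registers left to save)
     __vr_offs == -7 * UNITS_PER_VREG    (seven q-registers left to save)  */
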
13328 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13329
13330 static tree
13331 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13332 gimple_seq *post_p ATTRIBUTE_UNUSED)
13333 {
13334 tree addr;
13335 bool indirect_p;
13336 bool is_ha; /* is HFA or HVA. */
13337 bool dw_align; /* double-word align. */
13338 machine_mode ag_mode = VOIDmode;
13339 int nregs;
13340 machine_mode mode;
13341
13342 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13343 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13344 HOST_WIDE_INT size, rsize, adjust, align;
13345 tree t, u, cond1, cond2;
13346
13347 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13348 if (indirect_p)
13349 type = build_pointer_type (type);
13350
13351 mode = TYPE_MODE (type);
13352
13353 f_stack = TYPE_FIELDS (va_list_type_node);
13354 f_grtop = DECL_CHAIN (f_stack);
13355 f_vrtop = DECL_CHAIN (f_grtop);
13356 f_groff = DECL_CHAIN (f_vrtop);
13357 f_vroff = DECL_CHAIN (f_groff);
13358
13359 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13360 f_stack, NULL_TREE);
13361 size = int_size_in_bytes (type);
13362
13363 bool abi_break;
13364 align
13365 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13366
13367 dw_align = false;
13368 adjust = 0;
13369 if (aarch64_vfp_is_call_or_return_candidate (mode,
13370 type,
13371 &ag_mode,
13372 &nregs,
13373 &is_ha))
13374 {
13375 /* No frontends can create types with variable-sized modes, so we
13376 shouldn't be asked to pass or return them. */
13377 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13378
13379 /* TYPE passed in fp/simd registers. */
13380 if (!TARGET_FLOAT)
13381 aarch64_err_no_fpadvsimd (mode);
13382
13383 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13384 unshare_expr (valist), f_vrtop, NULL_TREE);
13385 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13386 unshare_expr (valist), f_vroff, NULL_TREE);
13387
13388 rsize = nregs * UNITS_PER_VREG;
13389
13390 if (is_ha)
13391 {
13392 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13393 adjust = UNITS_PER_VREG - ag_size;
13394 }
13395 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13396 && size < UNITS_PER_VREG)
13397 {
13398 adjust = UNITS_PER_VREG - size;
13399 }
13400 }
13401 else
13402 {
13403 /* TYPE passed in general registers. */
13404 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13405 unshare_expr (valist), f_grtop, NULL_TREE);
13406 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13407 unshare_expr (valist), f_groff, NULL_TREE);
13408 rsize = ROUND_UP (size, UNITS_PER_WORD);
13409 nregs = rsize / UNITS_PER_WORD;
13410
13411 if (align > 8)
13412 {
13413 if (abi_break && warn_psabi)
13414 inform (input_location, "parameter passing for argument of type "
13415 "%qT changed in GCC 9.1", type);
13416 dw_align = true;
13417 }
13418
13419 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13420 && size < UNITS_PER_WORD)
13421 {
13422 adjust = UNITS_PER_WORD - size;
13423 }
13424 }
13425
13426 /* Get a local temporary for the field value. */
13427 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13428
13429 /* Emit code to branch if off >= 0. */
13430 t = build2 (GE_EXPR, boolean_type_node, off,
13431 build_int_cst (TREE_TYPE (off), 0));
13432 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13433
13434 if (dw_align)
13435 {
13436 /* Emit: offs = (offs + 15) & -16. */
13437 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13438 build_int_cst (TREE_TYPE (off), 15));
13439 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13440 build_int_cst (TREE_TYPE (off), -16));
13441 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13442 }
13443 else
13444 roundup = NULL;
13445
13446 /* Update ap.__[g|v]r_offs */
13447 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13448 build_int_cst (TREE_TYPE (off), rsize));
13449 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13450
13451 /* String up. */
13452 if (roundup)
13453 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13454
13455 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13456 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13457 build_int_cst (TREE_TYPE (f_off), 0));
13458 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13459
13460 /* String up: make sure the assignment happens before the use. */
13461 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13462 COND_EXPR_ELSE (cond1) = t;
13463
13464 /* Prepare the trees handling the argument that is passed on the stack;
13465 the top-level node will be stored in ON_STACK. */
13466 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13467 if (align > 8)
13468 {
13469 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13470 t = fold_build_pointer_plus_hwi (arg, 15);
13471 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13472 build_int_cst (TREE_TYPE (t), -16));
13473 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13474 }
13475 else
13476 roundup = NULL;
13477 /* Advance ap.__stack */
13478 t = fold_build_pointer_plus_hwi (arg, size + 7);
13479 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13480 build_int_cst (TREE_TYPE (t), -8));
13481 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13482 /* String up roundup and advance. */
13483 if (roundup)
13484 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13485 /* String up with arg */
13486 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13487 /* Big-endianness related address adjustment. */
13488 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13489 && size < UNITS_PER_WORD)
13490 {
13491 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13492 size_int (UNITS_PER_WORD - size));
13493 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13494 }
13495
13496 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13497 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13498
13499 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13500 t = off;
13501 if (adjust)
13502 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13503 build_int_cst (TREE_TYPE (off), adjust));
13504
13505 t = fold_convert (sizetype, t);
13506 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13507
13508 if (is_ha)
13509 {
13510 /* type ha; // treat as "struct {ftype field[n];}"
13511 ... [computing offs]
13512 for (i = 0; i < nregs; ++i, offs += 16)
13513 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13514 return ha; */
13515 int i;
13516 tree tmp_ha, field_t, field_ptr_t;
13517
13518 /* Declare a local variable. */
13519 tmp_ha = create_tmp_var_raw (type, "ha");
13520 gimple_add_tmp_var (tmp_ha);
13521
13522 /* Establish the base type. */
13523 switch (ag_mode)
13524 {
13525 case E_SFmode:
13526 field_t = float_type_node;
13527 field_ptr_t = float_ptr_type_node;
13528 break;
13529 case E_DFmode:
13530 field_t = double_type_node;
13531 field_ptr_t = double_ptr_type_node;
13532 break;
13533 case E_TFmode:
13534 field_t = long_double_type_node;
13535 field_ptr_t = long_double_ptr_type_node;
13536 break;
13537 case E_HFmode:
13538 field_t = aarch64_fp16_type_node;
13539 field_ptr_t = aarch64_fp16_ptr_type_node;
13540 break;
13541 case E_V2SImode:
13542 case E_V4SImode:
13543 {
13544 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13545 field_t = build_vector_type_for_mode (innertype, ag_mode);
13546 field_ptr_t = build_pointer_type (field_t);
13547 }
13548 break;
13549 default:
13550 gcc_assert (0);
13551 }
13552
13553 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
13554 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13555 addr = t;
13556 t = fold_convert (field_ptr_t, addr);
13557 t = build2 (MODIFY_EXPR, field_t,
13558 build1 (INDIRECT_REF, field_t, tmp_ha),
13559 build1 (INDIRECT_REF, field_t, t));
13560
13561 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13562 for (i = 1; i < nregs; ++i)
13563 {
13564 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13565 u = fold_convert (field_ptr_t, addr);
13566 u = build2 (MODIFY_EXPR, field_t,
13567 build2 (MEM_REF, field_t, tmp_ha,
13568 build_int_cst (field_ptr_t,
13569 (i *
13570 int_size_in_bytes (field_t)))),
13571 build1 (INDIRECT_REF, field_t, u));
13572 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13573 }
13574
13575 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13576 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13577 }
13578
13579 COND_EXPR_ELSE (cond2) = t;
13580 addr = fold_convert (build_pointer_type (type), cond1);
13581 addr = build_va_arg_indirect_ref (addr);
13582
13583 if (indirect_p)
13584 addr = build_va_arg_indirect_ref (addr);
13585
13586 return addr;
13587 }
13588
13589 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13590
13591 static void
13592 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13593 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13594 int no_rtl)
13595 {
13596 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13597 CUMULATIVE_ARGS local_cum;
13598 int gr_saved = cfun->va_list_gpr_size;
13599 int vr_saved = cfun->va_list_fpr_size;
13600
13601 /* The caller has advanced CUM up to, but not beyond, the last named
13602 argument. Advance a local copy of CUM past the last "real" named
13603 argument, to find out how many registers are left over. */
13604 local_cum = *cum;
13605 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
13606
13607 /* Find out how many registers we need to save.
13608 Honor the tree-stdarg analysis results. */
13609 if (cfun->va_list_gpr_size)
13610 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13611 cfun->va_list_gpr_size / UNITS_PER_WORD);
13612 if (cfun->va_list_fpr_size)
13613 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13614 cfun->va_list_fpr_size / UNITS_PER_VREG);
13615
13616 if (!TARGET_FLOAT)
13617 {
13618 gcc_assert (local_cum.aapcs_nvrn == 0);
13619 vr_saved = 0;
13620 }
13621
13622 if (!no_rtl)
13623 {
13624 if (gr_saved > 0)
13625 {
13626 rtx ptr, mem;
13627
13628 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13629 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13630 - gr_saved * UNITS_PER_WORD);
13631 mem = gen_frame_mem (BLKmode, ptr);
13632 set_mem_alias_set (mem, get_varargs_alias_set ());
13633
13634 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13635 mem, gr_saved);
13636 }
13637 if (vr_saved > 0)
13638 {
13639 /* We can't use move_block_from_reg, because it will use
13640 the wrong mode, storing D regs only. */
13641 machine_mode mode = TImode;
13642 int off, i, vr_start;
13643
13644 /* Set OFF to the offset from virtual_incoming_args_rtx of
13645 the first vector register. The VR save area lies below
13646 the GR one, and is aligned to 16 bytes. */
13647 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13648 STACK_BOUNDARY / BITS_PER_UNIT);
13649 off -= vr_saved * UNITS_PER_VREG;
13650
13651 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13652 for (i = 0; i < vr_saved; ++i)
13653 {
13654 rtx ptr, mem;
13655
13656 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13657 mem = gen_frame_mem (mode, ptr);
13658 set_mem_alias_set (mem, get_varargs_alias_set ());
13659 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13660 off += UNITS_PER_VREG;
13661 }
13662 }
13663 }
13664
13665 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13666 any complication of having crtl->args.pretend_args_size changed. */
13667 cfun->machine->frame.saved_varargs_size
13668 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13669 STACK_BOUNDARY / BITS_PER_UNIT)
13670 + vr_saved * UNITS_PER_VREG);
13671 }
13672
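/* A sketch (not normative) of the save areas laid out above, growing
   downwards from virtual_incoming_args_rtx:

     virtual_incoming_args_rtx          <- also the va_list __gr_top
       [ GR save area: gr_saved * UNITS_PER_WORD bytes ]
       [ padding up to a 16-byte boundary ]
       [ VR save area: vr_saved * UNITS_PER_VREG bytes ]

   cfun->machine->frame.saved_varargs_size records the rounded total.  */
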
13673 static void
13674 aarch64_conditional_register_usage (void)
13675 {
13676 int i;
13677 if (!TARGET_FLOAT)
13678 {
13679 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13680 {
13681 fixed_regs[i] = 1;
13682 call_used_regs[i] = 1;
13683 }
13684 }
13685 if (!TARGET_SVE)
13686 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13687 {
13688 fixed_regs[i] = 1;
13689 call_used_regs[i] = 1;
13690 }
13691
13692 /* When tracking speculation, we need a couple of call-clobbered registers
13693 to track the speculation state. It would be nice to just use
13694 IP0 and IP1, but currently there are numerous places that just
13695 assume these registers are free for other uses (eg pointer
13696 authentication). */
13697 if (aarch64_track_speculation)
13698 {
13699 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13700 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13701 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13702 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13703 }
13704 }
13705
13706 /* Walk down the type tree of TYPE counting consecutive base elements.
13707 If *MODEP is VOIDmode, then set it to the first valid floating point
13708 type. If a non-floating point type is found, or if a floating point
13709 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13710 otherwise return the count in the sub-tree. */
13711 static int
13712 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13713 {
13714 machine_mode mode;
13715 HOST_WIDE_INT size;
13716
13717 switch (TREE_CODE (type))
13718 {
13719 case REAL_TYPE:
13720 mode = TYPE_MODE (type);
13721 if (mode != DFmode && mode != SFmode
13722 && mode != TFmode && mode != HFmode)
13723 return -1;
13724
13725 if (*modep == VOIDmode)
13726 *modep = mode;
13727
13728 if (*modep == mode)
13729 return 1;
13730
13731 break;
13732
13733 case COMPLEX_TYPE:
13734 mode = TYPE_MODE (TREE_TYPE (type));
13735 if (mode != DFmode && mode != SFmode
13736 && mode != TFmode && mode != HFmode)
13737 return -1;
13738
13739 if (*modep == VOIDmode)
13740 *modep = mode;
13741
13742 if (*modep == mode)
13743 return 2;
13744
13745 break;
13746
13747 case VECTOR_TYPE:
13748 /* Use V2SImode and V4SImode as representatives of all 64-bit
13749 and 128-bit vector types. */
13750 size = int_size_in_bytes (type);
13751 switch (size)
13752 {
13753 case 8:
13754 mode = V2SImode;
13755 break;
13756 case 16:
13757 mode = V4SImode;
13758 break;
13759 default:
13760 return -1;
13761 }
13762
13763 if (*modep == VOIDmode)
13764 *modep = mode;
13765
13766 /* Vector modes are considered to be opaque: two vectors are
13767 equivalent for the purposes of being homogeneous aggregates
13768 if they are the same size. */
13769 if (*modep == mode)
13770 return 1;
13771
13772 break;
13773
13774 case ARRAY_TYPE:
13775 {
13776 int count;
13777 tree index = TYPE_DOMAIN (type);
13778
13779 /* Can't handle incomplete types nor sizes that are not
13780 fixed. */
13781 if (!COMPLETE_TYPE_P (type)
13782 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13783 return -1;
13784
13785 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13786 if (count == -1
13787 || !index
13788 || !TYPE_MAX_VALUE (index)
13789 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13790 || !TYPE_MIN_VALUE (index)
13791 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13792 || count < 0)
13793 return -1;
13794
13795 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13796 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13797
13798 /* There must be no padding. */
13799 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13800 count * GET_MODE_BITSIZE (*modep)))
13801 return -1;
13802
13803 return count;
13804 }
13805
13806 case RECORD_TYPE:
13807 {
13808 int count = 0;
13809 int sub_count;
13810 tree field;
13811
13812 /* Can't handle incomplete types nor sizes that are not
13813 fixed. */
13814 if (!COMPLETE_TYPE_P (type)
13815 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13816 return -1;
13817
13818 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13819 {
13820 if (TREE_CODE (field) != FIELD_DECL)
13821 continue;
13822
13823 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13824 if (sub_count < 0)
13825 return -1;
13826 count += sub_count;
13827 }
13828
13829 /* There must be no padding. */
13830 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13831 count * GET_MODE_BITSIZE (*modep)))
13832 return -1;
13833
13834 return count;
13835 }
13836
13837 case UNION_TYPE:
13838 case QUAL_UNION_TYPE:
13839 {
13840 /* These aren't very interesting except in a degenerate case. */
13841 int count = 0;
13842 int sub_count;
13843 tree field;
13844
13845 /* Can't handle incomplete types nor sizes that are not
13846 fixed. */
13847 if (!COMPLETE_TYPE_P (type)
13848 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13849 return -1;
13850
13851 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13852 {
13853 if (TREE_CODE (field) != FIELD_DECL)
13854 continue;
13855
13856 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13857 if (sub_count < 0)
13858 return -1;
13859 count = count > sub_count ? count : sub_count;
13860 }
13861
13862 /* There must be no padding. */
13863 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13864 count * GET_MODE_BITSIZE (*modep)))
13865 return -1;
13866
13867 return count;
13868 }
13869
13870 default:
13871 break;
13872 }
13873
13874 return -1;
13875 }
13876
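/* Worked examples of the walk above (illustrative only):

     struct { double x, y; }                  -> 2, *modep == DFmode
     struct { float r; _Complex float c; }    -> 3, *modep == SFmode
     struct { float f; double d; }            -> -1 (mixed base types)
     float32x4_t v[2] (from arm_neon.h)       -> 2, *modep == V4SImode  */
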
13877 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13878 type as described in AAPCS64 \S 4.1.2.
13879
13880 See the comment above aarch64_composite_type_p for the notes on MODE. */
13881
13882 static bool
13883 aarch64_short_vector_p (const_tree type,
13884 machine_mode mode)
13885 {
13886 poly_int64 size = -1;
13887
13888 if (type && TREE_CODE (type) == VECTOR_TYPE)
13889 size = int_size_in_bytes (type);
13890 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13891 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13892 size = GET_MODE_SIZE (mode);
13893
13894 return known_eq (size, 8) || known_eq (size, 16);
13895 }
13896
13897 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13898 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
13899 array types. The C99 floating-point complex types are also considered
13900 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
13901 types, which are GCC extensions and out of the scope of AAPCS64, are
13902 treated as composite types here as well.
13903
13904 Note that MODE itself is not sufficient in determining whether a type
13905 is such a composite type or not. This is because
13906 stor-layout.c:compute_record_mode may have already changed the MODE
13907 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
13908 structure with only one field may have its MODE set to the mode of the
13909 field. Also an integer mode whose size matches the size of the
13910 RECORD_TYPE type may be used to substitute the original mode
13911 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
13912 solely relied on. */
13913
13914 static bool
13915 aarch64_composite_type_p (const_tree type,
13916 machine_mode mode)
13917 {
13918 if (aarch64_short_vector_p (type, mode))
13919 return false;
13920
13921 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
13922 return true;
13923
13924 if (mode == BLKmode
13925 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
13926 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
13927 return true;
13928
13929 return false;
13930 }
13931
13932 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13933 shall be passed or returned in simd/fp register(s) (providing these
13934 parameter passing registers are available).
13935
13936 Upon successful return, *COUNT returns the number of needed registers,
13937 *BASE_MODE returns the mode of the individual register and when IS_HA
13938 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
13939 floating-point aggregate or a homogeneous short-vector aggregate. */
13940
13941 static bool
13942 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
13943 const_tree type,
13944 machine_mode *base_mode,
13945 int *count,
13946 bool *is_ha)
13947 {
13948 machine_mode new_mode = VOIDmode;
13949 bool composite_p = aarch64_composite_type_p (type, mode);
13950
13951 if (is_ha != NULL) *is_ha = false;
13952
13953 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
13954 || aarch64_short_vector_p (type, mode))
13955 {
13956 *count = 1;
13957 new_mode = mode;
13958 }
13959 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
13960 {
13961 if (is_ha != NULL) *is_ha = true;
13962 *count = 2;
13963 new_mode = GET_MODE_INNER (mode);
13964 }
13965 else if (type && composite_p)
13966 {
13967 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
13968
13969 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
13970 {
13971 if (is_ha != NULL) *is_ha = true;
13972 *count = ag_count;
13973 }
13974 else
13975 return false;
13976 }
13977 else
13978 return false;
13979
13980 *base_mode = new_mode;
13981 return true;
13982 }
13983
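/* Examples (illustrative): _Complex double yields *count == 2 with
   *base_mode == DFmode and *is_ha set; a struct of four floats is a
   four-element HFA with SFmode elements; a struct of five floats exceeds
   HA_MAX_NUM_FLDS (four, per AAPCS64) and is not a candidate.  */
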
13984 /* Implement TARGET_STRUCT_VALUE_RTX. */
13985
13986 static rtx
13987 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
13988 int incoming ATTRIBUTE_UNUSED)
13989 {
13990 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
13991 }
13992
13993 /* Implement TARGET_VECTOR_MODE_SUPPORTED_P. */
13994 static bool
13995 aarch64_vector_mode_supported_p (machine_mode mode)
13996 {
13997 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13998 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
13999 }
14000
14001 /* Return appropriate SIMD container
14002 for MODE within a vector of WIDTH bits. */
14003 static machine_mode
14004 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14005 {
14006 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14007 switch (mode)
14008 {
14009 case E_DFmode:
14010 return VNx2DFmode;
14011 case E_SFmode:
14012 return VNx4SFmode;
14013 case E_HFmode:
14014 return VNx8HFmode;
14015 case E_DImode:
14016 return VNx2DImode;
14017 case E_SImode:
14018 return VNx4SImode;
14019 case E_HImode:
14020 return VNx8HImode;
14021 case E_QImode:
14022 return VNx16QImode;
14023 default:
14024 return word_mode;
14025 }
14026
14027 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14028 if (TARGET_SIMD)
14029 {
14030 if (known_eq (width, 128))
14031 switch (mode)
14032 {
14033 case E_DFmode:
14034 return V2DFmode;
14035 case E_SFmode:
14036 return V4SFmode;
14037 case E_HFmode:
14038 return V8HFmode;
14039 case E_SImode:
14040 return V4SImode;
14041 case E_HImode:
14042 return V8HImode;
14043 case E_QImode:
14044 return V16QImode;
14045 case E_DImode:
14046 return V2DImode;
14047 default:
14048 break;
14049 }
14050 else
14051 switch (mode)
14052 {
14053 case E_SFmode:
14054 return V2SFmode;
14055 case E_HFmode:
14056 return V4HFmode;
14057 case E_SImode:
14058 return V2SImode;
14059 case E_HImode:
14060 return V4HImode;
14061 case E_QImode:
14062 return V8QImode;
14063 default:
14064 break;
14065 }
14066 }
14067 return word_mode;
14068 }
14069
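/* For example, with TARGET_SIMD, (SImode, 128) maps to V4SImode and
   (HFmode, 64) to V4HFmode; with TARGET_SVE and WIDTH equal to
   BITS_PER_SVE_VECTOR, SImode maps to VNx4SImode.  Unhandled combinations
   fall back to word_mode.  */
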
14070 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14071 static machine_mode
14072 aarch64_preferred_simd_mode (scalar_mode mode)
14073 {
14074 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14075 return aarch64_simd_container_mode (mode, bits);
14076 }
14077
14078 /* Return a list of possible vector sizes for the vectorizer
14079 to iterate over. */
14080 static void
14081 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
14082 {
14083 if (TARGET_SVE)
14084 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14085 sizes->safe_push (16);
14086 sizes->safe_push (8);
14087 }
14088
14089 /* Implement TARGET_MANGLE_TYPE. */
14090
14091 static const char *
14092 aarch64_mangle_type (const_tree type)
14093 {
14094 /* The AArch64 ABI documents say that "__va_list" has to be
14095 mangled as if it is in the "std" namespace. */
14096 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14097 return "St9__va_list";
14098
14099 /* Half-precision float. */
14100 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14101 return "Dh";
14102
14103 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14104 builtin types. */
14105 if (TYPE_NAME (type) != NULL)
14106 return aarch64_mangle_builtin_type (type);
14107
14108 /* Use the default mangling. */
14109 return NULL;
14110 }
14111
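/* Mangling examples implied by the hook above: the AAPCS64 __va_list type
   mangles as "St9__va_list", i.e. as if declared in namespace std, and the
   half-precision __fp16 type mangles as "Dh".  */
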
14112 /* Find the first rtx_insn before insn that will generate an assembly
14113 instruction. */
14114
14115 static rtx_insn *
14116 aarch64_prev_real_insn (rtx_insn *insn)
14117 {
14118 if (!insn)
14119 return NULL;
14120
14121 do
14122 {
14123 insn = prev_real_insn (insn);
14124 }
14125 while (insn && recog_memoized (insn) < 0);
14126
14127 return insn;
14128 }
14129
14130 static bool
14131 is_madd_op (enum attr_type t1)
14132 {
14133 unsigned int i;
14134 /* A number of these may be AArch32 only. */
14135 enum attr_type mlatypes[] = {
14136 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14137 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14138 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14139 };
14140
14141 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14142 {
14143 if (t1 == mlatypes[i])
14144 return true;
14145 }
14146
14147 return false;
14148 }
14149
14150 /* Check if there is a register dependency between a load and the insn
14151 for which we hold recog_data. */
14152
14153 static bool
14154 dep_between_memop_and_curr (rtx memop)
14155 {
14156 rtx load_reg;
14157 int opno;
14158
14159 gcc_assert (GET_CODE (memop) == SET);
14160
14161 if (!REG_P (SET_DEST (memop)))
14162 return false;
14163
14164 load_reg = SET_DEST (memop);
14165 for (opno = 1; opno < recog_data.n_operands; opno++)
14166 {
14167 rtx operand = recog_data.operand[opno];
14168 if (REG_P (operand)
14169 && reg_overlap_mentioned_p (load_reg, operand))
14170 return true;
14171
14172 }
14173 return false;
14174 }
14175
14176
14177 /* When working around the Cortex-A53 erratum 835769,
14178 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14179 instruction and has a preceding memory instruction such that a NOP
14180 should be inserted between them. */
14181
14182 bool
14183 aarch64_madd_needs_nop (rtx_insn* insn)
14184 {
14185 enum attr_type attr_type;
14186 rtx_insn *prev;
14187 rtx body;
14188
14189 if (!TARGET_FIX_ERR_A53_835769)
14190 return false;
14191
14192 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14193 return false;
14194
14195 attr_type = get_attr_type (insn);
14196 if (!is_madd_op (attr_type))
14197 return false;
14198
14199 prev = aarch64_prev_real_insn (insn);
14200 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14201 Restore recog state to INSN to avoid state corruption. */
14202 extract_constrain_insn_cached (insn);
14203
14204 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14205 return false;
14206
14207 body = single_set (prev);
14208
14209 /* If the previous insn is a memory op and there is no dependency between
14210 it and the DImode madd, emit a NOP between them. If body is NULL then we
14211 have a complex memory operation, probably a load/store pair.
14212 Be conservative for now and emit a NOP. */
14213 if (GET_MODE (recog_data.operand[0]) == DImode
14214 && (!body || !dep_between_memop_and_curr (body)))
14215 return true;
14216
14217 return false;
14218
14219 }
14220
14221
14222 /* Implement FINAL_PRESCAN_INSN. */
14223
14224 void
14225 aarch64_final_prescan_insn (rtx_insn *insn)
14226 {
14227 if (aarch64_madd_needs_nop (insn))
14228 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14229 }
14230
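/* With -mfix-cortex-a53-835769, a sequence such as

       ldr  x2, [x1]
       madd x0, x3, x4, x5

   (where the madd does not read the loaded value) is emitted as
   ldr / nop / madd: aarch64_madd_needs_nop sees a 64-bit multiply-accumulate
   preceded by a memory operation with no register dependency, and the
   prescan hook above inserts the separating NOP (an illustrative sketch,
   not an exhaustive description of the erratum conditions).  */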
14231
14232 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14233 instruction. */
14234
14235 bool
14236 aarch64_sve_index_immediate_p (rtx base_or_step)
14237 {
14238 return (CONST_INT_P (base_or_step)
14239 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14240 }
14241
14242 /* Return true if X is a valid immediate for the SVE ADD and SUB
14243 instructions. Negate X first if NEGATE_P is true. */
14244
14245 bool
14246 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14247 {
14248 rtx elt;
14249
14250 if (!const_vec_duplicate_p (x, &elt)
14251 || !CONST_INT_P (elt))
14252 return false;
14253
14254 HOST_WIDE_INT val = INTVAL (elt);
14255 if (negate_p)
14256 val = -val;
14257 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14258
14259 if (val & 0xff)
14260 return IN_RANGE (val, 0, 0xff);
14261 return IN_RANGE (val, 0, 0xff00);
14262 }
14263
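/* The test above corresponds to the unsigned ADD/SUB immediate encodings:
   a duplicated value of 0..255, or a multiple of 256 up to 0xff00 (the
   "LSL #8" form).  E.g. a splat of 512 is accepted, a splat of 257 is
   not.  */
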
14264 /* Return true if X is a valid immediate operand for an SVE logical
14265 instruction such as AND. */
14266
14267 bool
14268 aarch64_sve_bitmask_immediate_p (rtx x)
14269 {
14270 rtx elt;
14271
14272 return (const_vec_duplicate_p (x, &elt)
14273 && CONST_INT_P (elt)
14274 && aarch64_bitmask_imm (INTVAL (elt),
14275 GET_MODE_INNER (GET_MODE (x))));
14276 }
14277
14278 /* Return true if X is a valid immediate for the SVE DUP and CPY
14279 instructions. */
14280
14281 bool
14282 aarch64_sve_dup_immediate_p (rtx x)
14283 {
14284 rtx elt;
14285
14286 if (!const_vec_duplicate_p (x, &elt)
14287 || !CONST_INT_P (elt))
14288 return false;
14289
14290 HOST_WIDE_INT val = INTVAL (elt);
14291 if (val & 0xff)
14292 return IN_RANGE (val, -0x80, 0x7f);
14293 return IN_RANGE (val, -0x8000, 0x7f00);
14294 }
14295
14296 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14297 SIGNED_P says whether the operand is signed rather than unsigned. */
14298
14299 bool
14300 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14301 {
14302 rtx elt;
14303
14304 return (const_vec_duplicate_p (x, &elt)
14305 && CONST_INT_P (elt)
14306 && (signed_p
14307 ? IN_RANGE (INTVAL (elt), -16, 15)
14308 : IN_RANGE (INTVAL (elt), 0, 127)));
14309 }
14310
14311 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14312 instruction. Negate X first if NEGATE_P is true. */
14313
14314 bool
14315 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14316 {
14317 rtx elt;
14318 REAL_VALUE_TYPE r;
14319
14320 if (!const_vec_duplicate_p (x, &elt)
14321 || GET_CODE (elt) != CONST_DOUBLE)
14322 return false;
14323
14324 r = *CONST_DOUBLE_REAL_VALUE (elt);
14325
14326 if (negate_p)
14327 r = real_value_negate (&r);
14328
14329 if (real_equal (&r, &dconst1))
14330 return true;
14331 if (real_equal (&r, &dconsthalf))
14332 return true;
14333 return false;
14334 }
14335
14336 /* Return true if X is a valid immediate operand for an SVE FMUL
14337 instruction. */
14338
14339 bool
14340 aarch64_sve_float_mul_immediate_p (rtx x)
14341 {
14342 rtx elt;
14343
14344 /* GCC will never generate a multiply with an immediate of 2, so there is no
14345 point testing for it (even though it is a valid constant). */
14346 return (const_vec_duplicate_p (x, &elt)
14347 && GET_CODE (elt) == CONST_DOUBLE
14348 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14349 }
14350
14351 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14352 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14353 is nonnull, use it to describe valid immediates. */
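/* Worked examples (illustrative): 0x00ab0000 matches the 4-byte LSL case
   as 0xab << 16; 0x004b004b matches the 2-byte case as 0x4b with no shift;
   0x0000abff matches only the MSL case, as 0xab with MSL #8, since the
   bits below the shifted byte must all be ones.  */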
14354 static bool
14355 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14356 simd_immediate_info *info,
14357 enum simd_immediate_check which,
14358 simd_immediate_info::insn_type insn)
14359 {
14360 /* Try a 4-byte immediate with LSL. */
14361 for (unsigned int shift = 0; shift < 32; shift += 8)
14362 if ((val32 & (0xff << shift)) == val32)
14363 {
14364 if (info)
14365 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14366 simd_immediate_info::LSL, shift);
14367 return true;
14368 }
14369
14370 /* Try a 2-byte immediate with LSL. */
14371 unsigned int imm16 = val32 & 0xffff;
14372 if (imm16 == (val32 >> 16))
14373 for (unsigned int shift = 0; shift < 16; shift += 8)
14374 if ((imm16 & (0xff << shift)) == imm16)
14375 {
14376 if (info)
14377 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14378 simd_immediate_info::LSL, shift);
14379 return true;
14380 }
14381
14382 /* Try a 4-byte immediate with MSL, except for cases that MVN
14383 can handle. */
14384 if (which == AARCH64_CHECK_MOV)
14385 for (unsigned int shift = 8; shift < 24; shift += 8)
14386 {
14387 unsigned int low = (1 << shift) - 1;
14388 if (((val32 & (0xff << shift)) | low) == val32)
14389 {
14390 if (info)
14391 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14392 simd_immediate_info::MSL, shift);
14393 return true;
14394 }
14395 }
14396
14397 return false;
14398 }
14399
14400 /* Return true if replicating VAL64 is a valid immediate for the
14401 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14402 use it to describe valid immediates. */
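/* For example (illustrative), 0x00ff00ffff0000ff is accepted by the final
   check below because every byte is either 0x00 or 0xff, so it can be
   built as a single 64-bit byte-mask immediate.  */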
14403 static bool
14404 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14405 simd_immediate_info *info,
14406 enum simd_immediate_check which)
14407 {
14408 unsigned int val32 = val64 & 0xffffffff;
14409 unsigned int val16 = val64 & 0xffff;
14410 unsigned int val8 = val64 & 0xff;
14411
14412 if (val32 == (val64 >> 32))
14413 {
14414 if ((which & AARCH64_CHECK_ORR) != 0
14415 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14416 simd_immediate_info::MOV))
14417 return true;
14418
14419 if ((which & AARCH64_CHECK_BIC) != 0
14420 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14421 simd_immediate_info::MVN))
14422 return true;
14423
14424 /* Try using a replicated byte. */
14425 if (which == AARCH64_CHECK_MOV
14426 && val16 == (val32 >> 16)
14427 && val8 == (val16 >> 8))
14428 {
14429 if (info)
14430 *info = simd_immediate_info (QImode, val8);
14431 return true;
14432 }
14433 }
14434
14435 /* Try using a bit-to-bytemask. */
14436 if (which == AARCH64_CHECK_MOV)
14437 {
14438 unsigned int i;
14439 for (i = 0; i < 64; i += 8)
14440 {
14441 unsigned char byte = (val64 >> i) & 0xff;
14442 if (byte != 0 && byte != 0xff)
14443 break;
14444 }
14445 if (i == 64)
14446 {
14447 if (info)
14448 *info = simd_immediate_info (DImode, val64);
14449 return true;
14450 }
14451 }
14452 return false;
14453 }
14454
14455 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14456 instruction. If INFO is nonnull, use it to describe valid immediates. */
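/* For example (illustrative): 0x2a2a2a2a2a2a2a2a collapses to the QImode
   value 42 and uses DUP with no shift; 0x1200120012001200 collapses to the
   HImode value 0x1200 and uses DUP with LSL #8; anything else falls back
   to the DUPM (bitmask) test.  */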
14457
14458 static bool
14459 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14460 simd_immediate_info *info)
14461 {
14462 scalar_int_mode mode = DImode;
14463 unsigned int val32 = val64 & 0xffffffff;
14464 if (val32 == (val64 >> 32))
14465 {
14466 mode = SImode;
14467 unsigned int val16 = val32 & 0xffff;
14468 if (val16 == (val32 >> 16))
14469 {
14470 mode = HImode;
14471 unsigned int val8 = val16 & 0xff;
14472 if (val8 == (val16 >> 8))
14473 mode = QImode;
14474 }
14475 }
14476 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14477 if (IN_RANGE (val, -0x80, 0x7f))
14478 {
14479 /* DUP with no shift. */
14480 if (info)
14481 *info = simd_immediate_info (mode, val);
14482 return true;
14483 }
14484 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14485 {
14486 /* DUP with LSL #8. */
14487 if (info)
14488 *info = simd_immediate_info (mode, val);
14489 return true;
14490 }
14491 if (aarch64_bitmask_imm (val64, mode))
14492 {
14493 /* DUPM. */
14494 if (info)
14495 *info = simd_immediate_info (mode, val);
14496 return true;
14497 }
14498 return false;
14499 }
14500
14501 /* Return true if OP is a valid SIMD immediate for the operation
14502 described by WHICH. If INFO is nonnull, use it to describe valid
14503 immediates. */
14504 bool
14505 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14506 enum simd_immediate_check which)
14507 {
14508 machine_mode mode = GET_MODE (op);
14509 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14510 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14511 return false;
14512
14513 scalar_mode elt_mode = GET_MODE_INNER (mode);
14514 rtx base, step;
14515 unsigned int n_elts;
14516 if (GET_CODE (op) == CONST_VECTOR
14517 && CONST_VECTOR_DUPLICATE_P (op))
14518 n_elts = CONST_VECTOR_NPATTERNS (op);
14519 else if ((vec_flags & VEC_SVE_DATA)
14520 && const_vec_series_p (op, &base, &step))
14521 {
14522 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14523 if (!aarch64_sve_index_immediate_p (base)
14524 || !aarch64_sve_index_immediate_p (step))
14525 return false;
14526
14527 if (info)
14528 *info = simd_immediate_info (elt_mode, base, step);
14529 return true;
14530 }
14531 else if (GET_CODE (op) == CONST_VECTOR
14532 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14533 /* N_ELTS set above. */;
14534 else
14535 return false;
14536
14537 /* Handle PFALSE and PTRUE. */
14538 if (vec_flags & VEC_SVE_PRED)
14539 return (op == CONST0_RTX (mode)
14540 || op == CONSTM1_RTX (mode));
14541
14542 scalar_float_mode elt_float_mode;
14543 if (n_elts == 1
14544 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14545 {
14546 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14547 if (aarch64_float_const_zero_rtx_p (elt)
14548 || aarch64_float_const_representable_p (elt))
14549 {
14550 if (info)
14551 *info = simd_immediate_info (elt_float_mode, elt);
14552 return true;
14553 }
14554 }
14555
14556 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14557 if (elt_size > 8)
14558 return false;
14559
14560 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14561
14562 /* Expand the vector constant out into a byte vector, with the least
14563 significant byte of the register first. */
14564 auto_vec<unsigned char, 16> bytes;
14565 bytes.reserve (n_elts * elt_size);
14566 for (unsigned int i = 0; i < n_elts; i++)
14567 {
14568 /* The vector is provided in gcc endian-neutral fashion.
14569 For aarch64_be Advanced SIMD, it must be laid out in the vector
14570 register in reverse order. */
14571 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14572 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14573
14574 if (elt_mode != elt_int_mode)
14575 elt = gen_lowpart (elt_int_mode, elt);
14576
14577 if (!CONST_INT_P (elt))
14578 return false;
14579
14580 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14581 for (unsigned int byte = 0; byte < elt_size; byte++)
14582 {
14583 bytes.quick_push (elt_val & 0xff);
14584 elt_val >>= BITS_PER_UNIT;
14585 }
14586 }
14587
14588 /* The immediate must repeat every eight bytes. */
14589 unsigned int nbytes = bytes.length ();
14590 for (unsigned i = 8; i < nbytes; ++i)
14591 if (bytes[i] != bytes[i - 8])
14592 return false;
14593
14594 /* Get the repeating 8-byte value as an integer. No endian correction
14595 is needed here because bytes is already in lsb-first order. */
14596 unsigned HOST_WIDE_INT val64 = 0;
14597 for (unsigned int i = 0; i < 8; i++)
14598 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14599 << (i * BITS_PER_UNIT));
14600
14601 if (vec_flags & VEC_SVE_DATA)
14602 return aarch64_sve_valid_immediate (val64, info);
14603 else
14604 return aarch64_advsimd_valid_immediate (val64, info, which);
14605 }
14606
14607 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14608 has a step within the range of an SVE INDEX immediate. Return the index expression if so,
14609 otherwise return null. */
14610 rtx
14611 aarch64_check_zero_based_sve_index_immediate (rtx x)
14612 {
14613 rtx base, step;
14614 if (const_vec_series_p (x, &base, &step)
14615 && base == const0_rtx
14616 && aarch64_sve_index_immediate_p (step))
14617 return step;
14618 return NULL_RTX;
14619 }
14620
14621 /* Check whether immediate shift constants are within range. */
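/* For example, with 32-bit elements a left-shift count must lie in
   [0, 31] and a right-shift count in [1, 32].  */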
14622 bool
14623 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14624 {
14625 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14626 if (left)
14627 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14628 else
14629 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14630 }
14631
14632 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14633 operation of width WIDTH at bit position POS. */
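/* For example, WIDTH == 8 and POS == 16 give the mask 0xff0000.  */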
14634
14635 rtx
14636 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14637 {
14638 gcc_assert (CONST_INT_P (width));
14639 gcc_assert (CONST_INT_P (pos));
14640
14641 unsigned HOST_WIDE_INT mask
14642 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14643 return GEN_INT (mask << UINTVAL (pos));
14644 }
14645
14646 bool
14647 aarch64_mov_operand_p (rtx x, machine_mode mode)
14648 {
14649 if (GET_CODE (x) == HIGH
14650 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14651 return true;
14652
14653 if (CONST_INT_P (x))
14654 return true;
14655
14656 if (VECTOR_MODE_P (GET_MODE (x)))
14657 return aarch64_simd_valid_immediate (x, NULL);
14658
14659 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14660 return true;
14661
14662 if (aarch64_sve_cnt_immediate_p (x))
14663 return true;
14664
14665 return aarch64_classify_symbolic_expression (x)
14666 == SYMBOL_TINY_ABSOLUTE;
14667 }
14668
14669 /* Return a const_int vector of VAL. */
14670 rtx
14671 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14672 {
14673 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14674 return gen_const_vec_duplicate (mode, c);
14675 }
14676
14677 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14678
14679 bool
14680 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14681 {
14682 machine_mode vmode;
14683
14684 vmode = aarch64_simd_container_mode (mode, 64);
14685 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14686 return aarch64_simd_valid_immediate (op_v, NULL);
14687 }
14688
14689 /* Construct and return a PARALLEL RTX vector with elements numbering the
14690 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14691 the vector - from the perspective of the architecture. This does not
14692 line up with GCC's perspective on lane numbers, so we end up with
14693 different masks depending on our target endian-ness. The diagram
14694 below may help. We must draw the distinction when building masks
14695 which select one half of the vector. An instruction selecting
14696 architectural low-lanes for a big-endian target, must be described using
14697 a mask selecting GCC high-lanes.
14698
14699 Big-Endian Little-Endian
14700
14701 GCC 0 1 2 3 3 2 1 0
14702 | x | x | x | x | | x | x | x | x |
14703 Architecture 3 2 1 0 3 2 1 0
14704
14705 Low Mask: { 2, 3 } { 0, 1 }
14706 High Mask: { 0, 1 } { 2, 3 }
14707
14708 MODE is the mode of the vector and NUNITS is the number of units in it. */
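/* For example (illustrative), with MODE == V4SImode and HIGH == false this
   selects lanes { 0, 1 } on little-endian and { 2, 3 } on big-endian,
   matching the "Low Mask" row of the diagram above.  */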
14709
14710 rtx
14711 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14712 {
14713 rtvec v = rtvec_alloc (nunits / 2);
14714 int high_base = nunits / 2;
14715 int low_base = 0;
14716 int base;
14717 rtx t1;
14718 int i;
14719
14720 if (BYTES_BIG_ENDIAN)
14721 base = high ? low_base : high_base;
14722 else
14723 base = high ? high_base : low_base;
14724
14725 for (i = 0; i < nunits / 2; i++)
14726 RTVEC_ELT (v, i) = GEN_INT (base + i);
14727
14728 t1 = gen_rtx_PARALLEL (mode, v);
14729 return t1;
14730 }
14731
14732 /* Check OP for validity as a PARALLEL RTX vector with elements
14733 numbering the lanes of either the high (HIGH == TRUE) or low half,
14734 from the perspective of the architecture. See the diagram above
14735 aarch64_simd_vect_par_cnst_half for more details. */
14736
14737 bool
14738 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14739 bool high)
14740 {
14741 int nelts;
14742 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14743 return false;
14744
14745 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14746 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14747 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14748 int i = 0;
14749
14750 if (count_op != count_ideal)
14751 return false;
14752
14753 for (i = 0; i < count_ideal; i++)
14754 {
14755 rtx elt_op = XVECEXP (op, 0, i);
14756 rtx elt_ideal = XVECEXP (ideal, 0, i);
14757
14758 if (!CONST_INT_P (elt_op)
14759 || INTVAL (elt_ideal) != INTVAL (elt_op))
14760 return false;
14761 }
14762 return true;
14763 }
14764
14765 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14766 HIGH (exclusive). */
14767 void
14768 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14769 const_tree exp)
14770 {
14771 HOST_WIDE_INT lane;
14772 gcc_assert (CONST_INT_P (operand));
14773 lane = INTVAL (operand);
14774
14775 if (lane < low || lane >= high)
14776 {
14777 if (exp)
14778 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14779 else
14780 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14781 }
14782 }
14783
14784 /* Perform endian correction on lane number N, which indexes a vector
14785 of mode MODE, and return the result as an SImode rtx. */
14786
14787 rtx
14788 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14789 {
14790 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14791 }
14792
14793 /* Return TRUE if OP is a valid vector addressing mode. */
14794
14795 bool
14796 aarch64_simd_mem_operand_p (rtx op)
14797 {
14798 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14799 || REG_P (XEXP (op, 0)));
14800 }
14801
14802 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14803
14804 bool
14805 aarch64_sve_ld1r_operand_p (rtx op)
14806 {
14807 struct aarch64_address_info addr;
14808 scalar_mode mode;
14809
14810 return (MEM_P (op)
14811 && is_a <scalar_mode> (GET_MODE (op), &mode)
14812 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14813 && addr.type == ADDRESS_REG_IMM
14814 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14815 }
14816
14817 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14818 The conditions for STR are the same. */
14819 bool
14820 aarch64_sve_ldr_operand_p (rtx op)
14821 {
14822 struct aarch64_address_info addr;
14823
14824 return (MEM_P (op)
14825 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14826 false, ADDR_QUERY_ANY)
14827 && addr.type == ADDRESS_REG_IMM);
14828 }
14829
14830 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14831 We need to be able to access the individual pieces, so the range
14832 is different from LD[234] and ST[234]. */
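/* A rough sketch of the constraint, assuming offset_4bit_signed_scaled_p
   accepts signed 4-bit multiples of the vector length: both the first and
   the last vector of the structure must lie within [-8, 7] vector lengths
   of the base address.  */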
14833 bool
14834 aarch64_sve_struct_memory_operand_p (rtx op)
14835 {
14836 if (!MEM_P (op))
14837 return false;
14838
14839 machine_mode mode = GET_MODE (op);
14840 struct aarch64_address_info addr;
14841 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14842 ADDR_QUERY_ANY)
14843 || addr.type != ADDRESS_REG_IMM)
14844 return false;
14845
14846 poly_int64 first = addr.const_offset;
14847 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14848 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14849 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14850 }
14851
14852 /* Emit a register copy from operand to operand, taking care not to
14853 early-clobber source registers in the process.
14854
14855 COUNT is the number of components into which the copy needs to be
14856 decomposed. */
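/* For example (illustrative), copying a two-register value from V1-V2 to
   V2-V3 overlaps, so the copy below is emitted highest register first
   (V3 from V2, then V2 from V1) to avoid clobbering a source register
   before it has been read.  */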
14857 void
14858 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14859 unsigned int count)
14860 {
14861 unsigned int i;
14862 int rdest = REGNO (operands[0]);
14863 int rsrc = REGNO (operands[1]);
14864
14865 if (!reg_overlap_mentioned_p (operands[0], operands[1])
14866 || rdest < rsrc)
14867 for (i = 0; i < count; i++)
14868 emit_move_insn (gen_rtx_REG (mode, rdest + i),
14869 gen_rtx_REG (mode, rsrc + i));
14870 else
14871 for (i = 0; i < count; i++)
14872 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14873 gen_rtx_REG (mode, rsrc + count - i - 1));
14874 }
14875
14876 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14877 one of VSTRUCT modes: OI, CI, or XI. */
14878 int
14879 aarch64_simd_attr_length_rglist (machine_mode mode)
14880 {
14881 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14882 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14883 }
14884
14885 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14886 alignment of a vector to 128 bits. SVE predicates have an alignment of
14887 16 bits. */
14888 static HOST_WIDE_INT
14889 aarch64_simd_vector_alignment (const_tree type)
14890 {
14891 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14892 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14893 be set for non-predicate vectors of booleans. Modes are the most
14894 direct way we have of identifying real SVE predicate types. */
14895 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
14896 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
14897 return MIN (align, 128);
14898 }
14899
14900 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
14901 static poly_uint64
14902 aarch64_vectorize_preferred_vector_alignment (const_tree type)
14903 {
14904 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
14905 {
14906 /* If the length of the vector is fixed, try to align to that length,
14907 otherwise don't try to align at all. */
14908 HOST_WIDE_INT result;
14909 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
14910 result = TYPE_ALIGN (TREE_TYPE (type));
14911 return result;
14912 }
14913 return TYPE_ALIGN (type);
14914 }
14915
14916 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
14917 static bool
14918 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
14919 {
14920 if (is_packed)
14921 return false;
14922
14923 /* For fixed-length vectors, check that the vectorizer will aim for
14924 full-vector alignment. This isn't true for generic GCC vectors
14925 that are wider than the ABI maximum of 128 bits. */
14926 poly_uint64 preferred_alignment =
14927 aarch64_vectorize_preferred_vector_alignment (type);
14928 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
14929 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
14930 preferred_alignment))
14931 return false;
14932
14933 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
14934 return true;
14935 }
14936
14937 /* Return true if the vector misalignment factor is supported by the
14938 target. */
14939 static bool
14940 aarch64_builtin_support_vector_misalignment (machine_mode mode,
14941 const_tree type, int misalignment,
14942 bool is_packed)
14943 {
14944 if (TARGET_SIMD && STRICT_ALIGNMENT)
14945 {
14946 /* Return if movmisalign pattern is not supported for this mode. */
14947 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
14948 return false;
14949
14950 /* Misalignment factor is unknown at compile time. */
14951 if (misalignment == -1)
14952 return false;
14953 }
14954 return default_builtin_support_vector_misalignment (mode, type, misalignment,
14955 is_packed);
14956 }
14957
14958 /* If VALS is a vector constant that can be loaded into a register
14959 using DUP, generate instructions to do so and return an RTX to
14960 assign to the register. Otherwise return NULL_RTX. */
14961 static rtx
14962 aarch64_simd_dup_constant (rtx vals)
14963 {
14964 machine_mode mode = GET_MODE (vals);
14965 machine_mode inner_mode = GET_MODE_INNER (mode);
14966 rtx x;
14967
14968 if (!const_vec_duplicate_p (vals, &x))
14969 return NULL_RTX;
14970
14971 /* We can load this constant by using DUP and a constant in a
14972 single ARM register. This will be cheaper than a vector
14973 load. */
14974 x = copy_to_mode_reg (inner_mode, x);
14975 return gen_vec_duplicate (mode, x);
14976 }
14977
14978
14979 /* Generate code to load VALS, which is a PARALLEL containing only
14980 constants (for vec_init) or CONST_VECTOR, efficiently into a
14981 register. Returns an RTX to copy into the register, or NULL_RTX
14982 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
14983 static rtx
14984 aarch64_simd_make_constant (rtx vals)
14985 {
14986 machine_mode mode = GET_MODE (vals);
14987 rtx const_dup;
14988 rtx const_vec = NULL_RTX;
14989 int n_const = 0;
14990 int i;
14991
14992 if (GET_CODE (vals) == CONST_VECTOR)
14993 const_vec = vals;
14994 else if (GET_CODE (vals) == PARALLEL)
14995 {
14996 /* A CONST_VECTOR must contain only CONST_INTs and
14997 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
14998 Only store valid constants in a CONST_VECTOR. */
14999 int n_elts = XVECLEN (vals, 0);
15000 for (i = 0; i < n_elts; ++i)
15001 {
15002 rtx x = XVECEXP (vals, 0, i);
15003 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15004 n_const++;
15005 }
15006 if (n_const == n_elts)
15007 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15008 }
15009 else
15010 gcc_unreachable ();
15011
15012 if (const_vec != NULL_RTX
15013 && aarch64_simd_valid_immediate (const_vec, NULL))
15014 /* Load using MOVI/MVNI. */
15015 return const_vec;
15016 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15017 /* Loaded using DUP. */
15018 return const_dup;
15019 else if (const_vec != NULL_RTX)
15020 /* Load from constant pool. We cannot take advantage of single-cycle
15021 LD1 because we need a PC-relative addressing mode. */
15022 return const_vec;
15023 else
15024 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15025 We cannot construct an initializer. */
15026 return NULL_RTX;
15027 }
15028
15029 /* Expand a vector initialisation sequence, such that TARGET is
15030 initialised to contain VALS. */
15031
15032 void
15033 aarch64_expand_vector_init (rtx target, rtx vals)
15034 {
15035 machine_mode mode = GET_MODE (target);
15036 scalar_mode inner_mode = GET_MODE_INNER (mode);
15037 /* The number of vector elements. */
15038 int n_elts = XVECLEN (vals, 0);
15039 /* The number of vector elements which are not constant. */
15040 int n_var = 0;
15041 rtx any_const = NULL_RTX;
15042 /* The first element of vals. */
15043 rtx v0 = XVECEXP (vals, 0, 0);
15044 bool all_same = true;
15045
15046 /* Count the number of variable elements to initialise. */
15047 for (int i = 0; i < n_elts; ++i)
15048 {
15049 rtx x = XVECEXP (vals, 0, i);
15050 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15051 ++n_var;
15052 else
15053 any_const = x;
15054
15055 all_same &= rtx_equal_p (x, v0);
15056 }
15057
15058 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15059 how best to handle this. */
15060 if (n_var == 0)
15061 {
15062 rtx constant = aarch64_simd_make_constant (vals);
15063 if (constant != NULL_RTX)
15064 {
15065 emit_move_insn (target, constant);
15066 return;
15067 }
15068 }
15069
15070 /* Splat a single non-constant element if we can. */
15071 if (all_same)
15072 {
15073 rtx x = copy_to_mode_reg (inner_mode, v0);
15074 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15075 return;
15076 }
15077
15078 enum insn_code icode = optab_handler (vec_set_optab, mode);
15079 gcc_assert (icode != CODE_FOR_nothing);
15080
15081 /* If there are only variable elements, try to optimize
15082 the insertion using dup for the most common element
15083 followed by insertions. */
15084
15085 /* The algorithm will fill matches[*][0] with the earliest matching element,
15086 and matches[X][1] with the count of duplicate elements (if X is the
15087 earliest element which has duplicates). */
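/* For example (illustrative), for VALS { a, b, a, c } the loop below sets
   matches[2][0] = 0 and matches[0][1] = 2, so element 0 is duplicated
   first and only lanes 1 and 3 need separate insertions.  */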
15088
15089 if (n_var == n_elts && n_elts <= 16)
15090 {
15091 int matches[16][2] = {0};
15092 for (int i = 0; i < n_elts; i++)
15093 {
15094 for (int j = 0; j <= i; j++)
15095 {
15096 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15097 {
15098 matches[i][0] = j;
15099 matches[j][1]++;
15100 break;
15101 }
15102 }
15103 }
15104 int maxelement = 0;
15105 int maxv = 0;
15106 for (int i = 0; i < n_elts; i++)
15107 if (matches[i][1] > maxv)
15108 {
15109 maxelement = i;
15110 maxv = matches[i][1];
15111 }
15112
15113 /* Create a duplicate of the most common element, unless all elements
15114 are equally useless to us, in which case just immediately set the
15115 vector register using the first element. */
15116
15117 if (maxv == 1)
15118 {
15119 /* For vectors of two 64-bit elements, we can do even better. */
15120 if (n_elts == 2
15121 && (inner_mode == E_DImode
15122 || inner_mode == E_DFmode))
15123
15124 {
15125 rtx x0 = XVECEXP (vals, 0, 0);
15126 rtx x1 = XVECEXP (vals, 0, 1);
15127 /* Combine can pick up this case, but handling it directly
15128 here leaves clearer RTL.
15129
15130 This is load_pair_lanes<mode>, and also gives us a clean-up
15131 for store_pair_lanes<mode>. */
15132 if (memory_operand (x0, inner_mode)
15133 && memory_operand (x1, inner_mode)
15134 && !STRICT_ALIGNMENT
15135 && rtx_equal_p (XEXP (x1, 0),
15136 plus_constant (Pmode,
15137 XEXP (x0, 0),
15138 GET_MODE_SIZE (inner_mode))))
15139 {
15140 rtx t;
15141 if (inner_mode == DFmode)
15142 t = gen_load_pair_lanesdf (target, x0, x1);
15143 else
15144 t = gen_load_pair_lanesdi (target, x0, x1);
15145 emit_insn (t);
15146 return;
15147 }
15148 }
15149 /* The subreg-move sequence below will move into lane zero of the
15150 vector register. For big-endian we want that position to hold
15151 the last element of VALS. */
15152 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15153 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15154 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15155 }
15156 else
15157 {
15158 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15159 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15160 }
15161
15162 /* Insert the rest. */
15163 for (int i = 0; i < n_elts; i++)
15164 {
15165 rtx x = XVECEXP (vals, 0, i);
15166 if (matches[i][0] == maxelement)
15167 continue;
15168 x = copy_to_mode_reg (inner_mode, x);
15169 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15170 }
15171 return;
15172 }
15173
15174 /* Initialise a vector which is part-variable. We want to first try
15175 to build those lanes which are constant in the most efficient way we
15176 can. */
15177 if (n_var != n_elts)
15178 {
15179 rtx copy = copy_rtx (vals);
15180
15181 /* Load constant part of vector. We really don't care what goes into the
15182 parts we will overwrite, but we're more likely to be able to load the
15183 constant efficiently if it has fewer, larger, repeating parts
15184 (see aarch64_simd_valid_immediate). */
15185 for (int i = 0; i < n_elts; i++)
15186 {
15187 rtx x = XVECEXP (vals, 0, i);
15188 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15189 continue;
15190 rtx subst = any_const;
15191 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15192 {
15193 /* Look in the copied vector, as more elements are const. */
15194 rtx test = XVECEXP (copy, 0, i ^ bit);
15195 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15196 {
15197 subst = test;
15198 break;
15199 }
15200 }
15201 XVECEXP (copy, 0, i) = subst;
15202 }
15203 aarch64_expand_vector_init (target, copy);
15204 }
15205
15206 /* Insert the variable lanes directly. */
15207 for (int i = 0; i < n_elts; i++)
15208 {
15209 rtx x = XVECEXP (vals, 0, i);
15210 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15211 continue;
15212 x = copy_to_mode_reg (inner_mode, x);
15213 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15214 }
15215 }
15216
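/* Implement TARGET_SHIFT_TRUNCATION_MASK.  */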
15217 static unsigned HOST_WIDE_INT
15218 aarch64_shift_truncation_mask (machine_mode mode)
15219 {
15220 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15221 return 0;
15222 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15223 }
15224
15225 /* Select a format to encode pointers in exception handling data. */
15226 int
15227 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15228 {
15229 int type;
15230 switch (aarch64_cmodel)
15231 {
15232 case AARCH64_CMODEL_TINY:
15233 case AARCH64_CMODEL_TINY_PIC:
15234 case AARCH64_CMODEL_SMALL:
15235 case AARCH64_CMODEL_SMALL_PIC:
15236 case AARCH64_CMODEL_SMALL_SPIC:
15237 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15238 for everything. */
15239 type = DW_EH_PE_sdata4;
15240 break;
15241 default:
15242 /* No assumptions here. 8-byte relocs required. */
15243 type = DW_EH_PE_sdata8;
15244 break;
15245 }
15246 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15247 }
15248
15249 /* The last .arch and .tune assembly strings that we printed. */
15250 static std::string aarch64_last_printed_arch_string;
15251 static std::string aarch64_last_printed_tune_string;
15252
15253 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15254 by the function fndecl. */
15255
15256 void
15257 aarch64_declare_function_name (FILE *stream, const char* name,
15258 tree fndecl)
15259 {
15260 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15261
15262 struct cl_target_option *targ_options;
15263 if (target_parts)
15264 targ_options = TREE_TARGET_OPTION (target_parts);
15265 else
15266 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15267 gcc_assert (targ_options);
15268
15269 const struct processor *this_arch
15270 = aarch64_get_arch (targ_options->x_explicit_arch);
15271
15272 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
15273 std::string extension
15274 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15275 this_arch->flags);
15276 /* Only update the assembler .arch string if it is distinct from the last
15277 such string we printed. */
15278 std::string to_print = this_arch->name + extension;
15279 if (to_print != aarch64_last_printed_arch_string)
15280 {
15281 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15282 aarch64_last_printed_arch_string = to_print;
15283 }
15284
15285 /* Print the cpu name we're tuning for in the comments, might be
15286 useful to readers of the generated asm. Do it only when it changes
15287 from function to function and verbose assembly is requested. */
15288 const struct processor *this_tune
15289 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15290
15291 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15292 {
15293 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15294 this_tune->name);
15295 aarch64_last_printed_tune_string = this_tune->name;
15296 }
15297
15298 /* Don't forget the type directive for ELF. */
15299 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15300 ASM_OUTPUT_LABEL (stream, name);
15301 }
15302
15303 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15304
15305 static void
15306 aarch64_start_file (void)
15307 {
15308 struct cl_target_option *default_options
15309 = TREE_TARGET_OPTION (target_option_default_node);
15310
15311 const struct processor *default_arch
15312 = aarch64_get_arch (default_options->x_explicit_arch);
15313 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
15314 std::string extension
15315 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15316 default_arch->flags);
15317
15318 aarch64_last_printed_arch_string = default_arch->name + extension;
15319 aarch64_last_printed_tune_string = "";
15320 asm_fprintf (asm_out_file, "\t.arch %s\n",
15321 aarch64_last_printed_arch_string.c_str ());
15322
15323 default_file_start ();
15324 }
15325
15326 /* Emit load exclusive. */
15327
15328 static void
15329 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15330 rtx mem, rtx model_rtx)
15331 {
15332 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15333 }
15334
15335 /* Emit store exclusive. */
15336
15337 static void
15338 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15339 rtx rval, rtx mem, rtx model_rtx)
15340 {
15341 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
15342 }
15343
15344 /* Mark the previous jump instruction as unlikely. */
15345
15346 static void
15347 aarch64_emit_unlikely_jump (rtx insn)
15348 {
15349 rtx_insn *jump = emit_jump_insn (insn);
15350 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15351 }
15352
15353 /* Expand a compare and swap pattern. */
15354
15355 void
15356 aarch64_expand_compare_and_swap (rtx operands[])
15357 {
15358 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15359 machine_mode mode, r_mode;
15360
15361 bval = operands[0];
15362 rval = operands[1];
15363 mem = operands[2];
15364 oldval = operands[3];
15365 newval = operands[4];
15366 is_weak = operands[5];
15367 mod_s = operands[6];
15368 mod_f = operands[7];
15369 mode = GET_MODE (mem);
15370
15371 /* Normally the succ memory model must be stronger than fail, but in the
15372 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15373 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15374 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15375 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15376 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15377
15378 r_mode = mode;
15379 if (mode == QImode || mode == HImode)
15380 {
15381 r_mode = SImode;
15382 rval = gen_reg_rtx (r_mode);
15383 }
15384
15385 if (TARGET_LSE)
15386 {
15387 /* The CAS insn requires oldval and rval overlap, but we need to
15388 have a copy of oldval saved across the operation to tell if
15389 the operation is successful. */
15390 if (reg_overlap_mentioned_p (rval, oldval))
15391 rval = copy_to_mode_reg (r_mode, oldval);
15392 else
15393 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15394
15395 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15396 newval, mod_s));
15397 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15398 }
15399 else
15400 {
15401 /* The oldval predicate varies by mode. Test it and force to reg. */
15402 insn_code code = code_for_aarch64_compare_and_swap (mode);
15403 if (!insn_data[code].operand[2].predicate (oldval, mode))
15404 oldval = force_reg (mode, oldval);
15405
15406 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15407 is_weak, mod_s, mod_f));
15408 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15409 }
15410
15411 if (r_mode != mode)
15412 rval = gen_lowpart (mode, rval);
15413 emit_move_insn (operands[1], rval);
15414
15415 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15416 emit_insn (gen_rtx_SET (bval, x));
15417 }
15418
15419 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15420 sequence implementing an atomic operation. */
15421
15422 static void
15423 aarch64_emit_post_barrier (enum memmodel model)
15424 {
15425 const enum memmodel base_model = memmodel_base (model);
15426
15427 if (is_mm_sync (model)
15428 && (base_model == MEMMODEL_ACQUIRE
15429 || base_model == MEMMODEL_ACQ_REL
15430 || base_model == MEMMODEL_SEQ_CST))
15431 {
15432 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15433 }
15434 }
15435
15436 /* Split a compare and swap pattern. */
15437
15438 void
15439 aarch64_split_compare_and_swap (rtx operands[])
15440 {
15441 rtx rval, mem, oldval, newval, scratch;
15442 machine_mode mode;
15443 bool is_weak;
15444 rtx_code_label *label1, *label2;
15445 rtx x, cond;
15446 enum memmodel model;
15447 rtx model_rtx;
15448
15449 rval = operands[0];
15450 mem = operands[1];
15451 oldval = operands[2];
15452 newval = operands[3];
15453 is_weak = (operands[4] != const0_rtx);
15454 model_rtx = operands[5];
15455 scratch = operands[7];
15456 mode = GET_MODE (mem);
15457 model = memmodel_from_int (INTVAL (model_rtx));
15458
15459 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15460 loop:
15461 .label1:
15462 LD[A]XR rval, [mem]
15463 CBNZ rval, .label2
15464 ST[L]XR scratch, newval, [mem]
15465 CBNZ scratch, .label1
15466 .label2:
15467 CMP rval, 0. */
15468 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15469
15470 label1 = NULL;
15471 if (!is_weak)
15472 {
15473 label1 = gen_label_rtx ();
15474 emit_label (label1);
15475 }
15476 label2 = gen_label_rtx ();
15477
15478 /* The initial load can be relaxed for a __sync operation since a final
15479 barrier will be emitted to stop code hoisting. */
15480 if (is_mm_sync (model))
15481 aarch64_emit_load_exclusive (mode, rval, mem,
15482 GEN_INT (MEMMODEL_RELAXED));
15483 else
15484 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15485
15486 if (strong_zero_p)
15487 {
15488 if (aarch64_track_speculation)
15489 {
15490 /* Emit an explicit compare instruction, so that we can correctly
15491 track the condition codes. */
15492 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15493 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15494 }
15495 else
15496 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15497
15498 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15499 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15500 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15501 }
15502 else
15503 {
15504 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15505 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15506 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15507 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15508 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15509 }
15510
15511 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15512
15513 if (!is_weak)
15514 {
15515 if (aarch64_track_speculation)
15516 {
15517 /* Emit an explicit compare instruction, so that we can correctly
15518 track the condition codes. */
15519 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15520 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15521 }
15522 else
15523 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15524
15525 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15526 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15527 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15528 }
15529 else
15530 {
15531 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15532 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15533 emit_insn (gen_rtx_SET (cond, x));
15534 }
15535
15536 emit_label (label2);
15537 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
15538 to set the condition flags. If this is not used it will be removed by
15539 later passes. */
15540 if (strong_zero_p)
15541 {
15542 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15543 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15544 emit_insn (gen_rtx_SET (cond, x));
15545 }
15546 /* Emit any final barrier needed for a __sync operation. */
15547 if (is_mm_sync (model))
15548 aarch64_emit_post_barrier (model);
15549 }
15550
15551 /* Split an atomic operation. */
15552
15553 void
15554 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
15555 rtx value, rtx model_rtx, rtx cond)
15556 {
15557 machine_mode mode = GET_MODE (mem);
15558 machine_mode wmode = (mode == DImode ? DImode : SImode);
15559 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
15560 const bool is_sync = is_mm_sync (model);
15561 rtx_code_label *label;
15562 rtx x;
15563
15564 /* Split the atomic operation into a sequence. */
15565 label = gen_label_rtx ();
15566 emit_label (label);
15567
15568 if (new_out)
15569 new_out = gen_lowpart (wmode, new_out);
15570 if (old_out)
15571 old_out = gen_lowpart (wmode, old_out);
15572 else
15573 old_out = new_out;
15574 value = simplify_gen_subreg (wmode, value, mode, 0);
15575
15576 /* The initial load can be relaxed for a __sync operation since a final
15577 barrier will be emitted to stop code hoisting. */
15578 if (is_sync)
15579 aarch64_emit_load_exclusive (mode, old_out, mem,
15580 GEN_INT (MEMMODEL_RELAXED));
15581 else
15582 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
15583
15584 switch (code)
15585 {
15586 case SET:
15587 new_out = value;
15588 break;
15589
15590 case NOT:
15591 x = gen_rtx_AND (wmode, old_out, value);
15592 emit_insn (gen_rtx_SET (new_out, x));
15593 x = gen_rtx_NOT (wmode, new_out);
15594 emit_insn (gen_rtx_SET (new_out, x));
15595 break;
15596
15597 case MINUS:
15598 if (CONST_INT_P (value))
15599 {
15600 value = GEN_INT (-INTVAL (value));
15601 code = PLUS;
15602 }
15603 /* Fall through. */
15604
15605 default:
15606 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
15607 emit_insn (gen_rtx_SET (new_out, x));
15608 break;
15609 }
15610
15611 aarch64_emit_store_exclusive (mode, cond, mem,
15612 gen_lowpart (mode, new_out), model_rtx);
15613
15614 if (aarch64_track_speculation)
15615 {
15616 /* Emit an explicit compare instruction, so that we can correctly
15617 track the condition codes. */
15618 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
15619 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15620 }
15621 else
15622 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15623
15624 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15625 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
15626 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15627
15628 /* Emit any final barrier needed for a __sync operation. */
15629 if (is_sync)
15630 aarch64_emit_post_barrier (model);
15631 }
15632
15633 static void
15634 aarch64_init_libfuncs (void)
15635 {
15636 /* Half-precision float operations. The compiler handles all operations
15637 with NULL libfuncs by converting to SFmode. */
15638
15639 /* Conversions. */
15640 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
15641 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
15642
15643 /* Arithmetic. */
15644 set_optab_libfunc (add_optab, HFmode, NULL);
15645 set_optab_libfunc (sdiv_optab, HFmode, NULL);
15646 set_optab_libfunc (smul_optab, HFmode, NULL);
15647 set_optab_libfunc (neg_optab, HFmode, NULL);
15648 set_optab_libfunc (sub_optab, HFmode, NULL);
15649
15650 /* Comparisons. */
15651 set_optab_libfunc (eq_optab, HFmode, NULL);
15652 set_optab_libfunc (ne_optab, HFmode, NULL);
15653 set_optab_libfunc (lt_optab, HFmode, NULL);
15654 set_optab_libfunc (le_optab, HFmode, NULL);
15655 set_optab_libfunc (ge_optab, HFmode, NULL);
15656 set_optab_libfunc (gt_optab, HFmode, NULL);
15657 set_optab_libfunc (unord_optab, HFmode, NULL);
15658 }
15659
15660 /* Target hook for c_mode_for_suffix. */
15661 static machine_mode
15662 aarch64_c_mode_for_suffix (char suffix)
15663 {
15664 if (suffix == 'q')
15665 return TFmode;
15666
15667 return VOIDmode;
15668 }
15669
15670 /* We can only represent floating point constants which will fit in
15671 "quarter-precision" values. These values are characterised by
15672 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
15673 by:
15674
15675 (-1)^s * (n/16) * 2^r
15676
15677 Where:
15678 's' is the sign bit.
15679 'n' is an integer in the range 16 <= n <= 31.
15680 'r' is an integer in the range -3 <= r <= 4. */
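/* For example (illustrative): 1.0 is n = 16, r = 0 and 0.5 is n = 16,
   r = -1; the smallest representable magnitude is 16/16 * 2^-3 = 0.125
   and the largest is 31/16 * 2^4 = 31.0.  */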
15681
15682 /* Return true iff X can be represented by a quarter-precision
15683 floating point immediate operand. Note that we cannot represent 0.0. */
15684 bool
15685 aarch64_float_const_representable_p (rtx x)
15686 {
15687 /* This represents our current view of how many bits
15688 make up the mantissa. */
15689 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
15690 int exponent;
15691 unsigned HOST_WIDE_INT mantissa, mask;
15692 REAL_VALUE_TYPE r, m;
15693 bool fail;
15694
15695 if (!CONST_DOUBLE_P (x))
15696 return false;
15697
15698 if (GET_MODE (x) == VOIDmode
15699 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
15700 return false;
15701
15702 r = *CONST_DOUBLE_REAL_VALUE (x);
15703
15704 /* We cannot represent infinities, NaNs or +/-zero. We won't
15705 know if we have +zero until we analyse the mantissa, but we
15706 can reject the other invalid values. */
15707 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
15708 || REAL_VALUE_MINUS_ZERO (r))
15709 return false;
15710
15711 /* Extract exponent. */
15712 r = real_value_abs (&r);
15713 exponent = REAL_EXP (&r);
15714
15715 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
15716 highest (sign) bit, with a fixed binary point at bit point_pos.
15717 The low and high parts are accessed below via w.ulow () and w.elt (1).
15718 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
15719 bits for the mantissa, this can fail (low bits will be lost). */
15720 real_ldexp (&m, &r, point_pos - exponent);
15721 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
15722
15723 /* If the low part of the mantissa has bits set we cannot represent
15724 the value. */
15725 if (w.ulow () != 0)
15726 return false;
15727 /* We have rejected the lower HOST_WIDE_INT, so update our
15728 understanding of how many bits lie in the mantissa and
15729 look only at the high HOST_WIDE_INT. */
15730 mantissa = w.elt (1);
15731 point_pos -= HOST_BITS_PER_WIDE_INT;
15732
15733 /* We can only represent values with a mantissa of the form 1.xxxx. */
15734 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
15735 if ((mantissa & mask) != 0)
15736 return false;
15737
15738 /* Having filtered unrepresentable values, we may now remove all
15739 but the highest 5 bits. */
15740 mantissa >>= point_pos - 5;
15741
15742 /* We cannot represent the value 0.0, so reject it. This is handled
15743 elsewhere. */
15744 if (mantissa == 0)
15745 return false;
15746
15747 /* Then, as bit 4 is always set, we can mask it off, leaving
15748 the mantissa in the range [0, 15]. */
15749 mantissa &= ~(1 << 4);
15750 gcc_assert (mantissa <= 15);
15751
15752 /* GCC internally does not use IEEE754-like encoding (where normalized
15753 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
15754 Our mantissa values are shifted 4 places to the left relative to
15755 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
15756 by 5 places to correct for GCC's representation. */
15757 exponent = 5 - exponent;
15758
15759 return (exponent >= 0 && exponent <= 7);
15760 }
15761
15762 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
15763 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
15764 output MOVI/MVNI, ORR or BIC immediate. */
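/* For example (illustrative), a V4SI vector of four copies of 0x00ab0000
   checked with AARCH64_CHECK_MOV produces something like
   "movi\t%0.4s, 0xab, lsl 16".  */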
15765 char*
15766 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
15767 enum simd_immediate_check which)
15768 {
15769 bool is_valid;
15770 static char templ[40];
15771 const char *mnemonic;
15772 const char *shift_op;
15773 unsigned int lane_count = 0;
15774 char element_char;
15775
15776 struct simd_immediate_info info;
15777
15778 /* This will return true to show const_vector is legal for use as either
15779 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
15780 It will also update INFO to show how the immediate should be generated.
15781 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
15782 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
15783 gcc_assert (is_valid);
15784
15785 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15786 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
15787
15788 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15789 {
15790 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15791 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15792 move immediate path. */
15793 if (aarch64_float_const_zero_rtx_p (info.value))
15794 info.value = GEN_INT (0);
15795 else
15796 {
15797 const unsigned int buf_size = 20;
15798 char float_buf[buf_size] = {'\0'};
15799 real_to_decimal_for_mode (float_buf,
15800 CONST_DOUBLE_REAL_VALUE (info.value),
15801 buf_size, buf_size, 1, info.elt_mode);
15802
15803 if (lane_count == 1)
15804 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15805 else
15806 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15807 lane_count, element_char, float_buf);
15808 return templ;
15809 }
15810 }
15811
15812 gcc_assert (CONST_INT_P (info.value));
15813
15814 if (which == AARCH64_CHECK_MOV)
15815 {
15816 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15817 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15818 if (lane_count == 1)
15819 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15820 mnemonic, UINTVAL (info.value));
15821 else if (info.shift)
15822 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15823 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15824 element_char, UINTVAL (info.value), shift_op, info.shift);
15825 else
15826 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15827 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15828 element_char, UINTVAL (info.value));
15829 }
15830 else
15831 {
15832 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15833 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15834 if (info.shift)
15835 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15836 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15837 element_char, UINTVAL (info.value), "lsl", info.shift);
15838 else
15839 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15840 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15841 element_char, UINTVAL (info.value));
15842 }
15843 return templ;
15844 }
15845
15846 char*
15847 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15848 {
15849
15850 /* If a floating point number was passed and we desire to use it in an
15851 integer mode, do the conversion to integer. */
15852 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15853 {
15854 unsigned HOST_WIDE_INT ival;
15855 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15856 gcc_unreachable ();
15857 immediate = gen_int_mode (ival, mode);
15858 }
15859
15860 machine_mode vmode;
15861 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
15862 a 128-bit vector mode. */
15863 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15864
15865 vmode = aarch64_simd_container_mode (mode, width);
15866 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15867 return aarch64_output_simd_mov_immediate (v_op, width);
15868 }
15869
15870 /* Return the output string to use for moving immediate CONST_VECTOR
15871 into an SVE register. */
15872
15873 char *
15874 aarch64_output_sve_mov_immediate (rtx const_vector)
15875 {
15876 static char templ[40];
15877 struct simd_immediate_info info;
15878 char element_char;
15879
15880 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15881 gcc_assert (is_valid);
15882
15883 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15884
15885 if (info.step)
15886 {
15887 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15888 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15889 element_char, INTVAL (info.value), INTVAL (info.step));
15890 return templ;
15891 }
15892
15893 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15894 {
15895 if (aarch64_float_const_zero_rtx_p (info.value))
15896 info.value = GEN_INT (0);
15897 else
15898 {
15899 const int buf_size = 20;
15900 char float_buf[buf_size] = {};
15901 real_to_decimal_for_mode (float_buf,
15902 CONST_DOUBLE_REAL_VALUE (info.value),
15903 buf_size, buf_size, 1, info.elt_mode);
15904
15905 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15906 element_char, float_buf);
15907 return templ;
15908 }
15909 }
15910
15911 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15912 element_char, INTVAL (info.value));
15913 return templ;
15914 }
15915
15916 /* Return the asm format for a PTRUE instruction whose destination has
15917 mode MODE. SUFFIX is the element size suffix. */
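/* For example (illustrative): a fixed-length predicate mode with 16
   elements and SUFFIX 'b' gives "ptrue\t%0.b, vl16", whereas a scalable
   mode gives "ptrue\t%0.b, all".  */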
15918
15919 char *
15920 aarch64_output_ptrue (machine_mode mode, char suffix)
15921 {
15922 unsigned int nunits;
15923 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15924 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15925 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15926 else
15927 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15928 return buf;
15929 }
15930
15931 /* Split operands into moves from op[1] + op[2] into op[0]. */
15932
15933 void
15934 aarch64_split_combinev16qi (rtx operands[3])
15935 {
15936 unsigned int dest = REGNO (operands[0]);
15937 unsigned int src1 = REGNO (operands[1]);
15938 unsigned int src2 = REGNO (operands[2]);
15939 machine_mode halfmode = GET_MODE (operands[1]);
15940 unsigned int halfregs = REG_NREGS (operands[1]);
15941 rtx destlo, desthi;
15942
15943 gcc_assert (halfmode == V16QImode);
15944
15945 if (src1 == dest && src2 == dest + halfregs)
15946 {
15947 /* No-op move. Can't split to nothing; emit something. */
15948 emit_note (NOTE_INSN_DELETED);
15949 return;
15950 }
15951
15952 /* Preserve register attributes for variable tracking. */
15953 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15954 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15955 GET_MODE_SIZE (halfmode));
15956
15957 /* Special case of reversed high/low parts. */
15958 if (reg_overlap_mentioned_p (operands[2], destlo)
15959 && reg_overlap_mentioned_p (operands[1], desthi))
15960 {
15961 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15962 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15963 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15964 }
15965 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15966 {
15967 /* Try to avoid unnecessary moves if part of the result
15968 is in the right place already. */
15969 if (src1 != dest)
15970 emit_move_insn (destlo, operands[1]);
15971 if (src2 != dest + halfregs)
15972 emit_move_insn (desthi, operands[2]);
15973 }
15974 else
15975 {
15976 if (src2 != dest + halfregs)
15977 emit_move_insn (desthi, operands[2]);
15978 if (src1 != dest)
15979 emit_move_insn (destlo, operands[1]);
15980 }
15981 }
15982
15983 /* vec_perm support. */
15984
15985 struct expand_vec_perm_d
15986 {
15987 rtx target, op0, op1;
15988 vec_perm_indices perm;
15989 machine_mode vmode;
15990 unsigned int vec_flags;
15991 bool one_vector_p;
15992 bool testing_p;
15993 };
15994
15995 /* Generate a variable permutation. */
15996
15997 static void
15998 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15999 {
16000 machine_mode vmode = GET_MODE (target);
16001 bool one_vector_p = rtx_equal_p (op0, op1);
16002
16003 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16004 gcc_checking_assert (GET_MODE (op0) == vmode);
16005 gcc_checking_assert (GET_MODE (op1) == vmode);
16006 gcc_checking_assert (GET_MODE (sel) == vmode);
16007 gcc_checking_assert (TARGET_SIMD);
16008
16009 if (one_vector_p)
16010 {
16011 if (vmode == V8QImode)
16012 {
16013 /* Expand the argument to a V16QI mode by duplicating it. */
16014 rtx pair = gen_reg_rtx (V16QImode);
16015 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16016 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16017 }
16018 else
16019 {
16020 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16021 }
16022 }
16023 else
16024 {
16025 rtx pair;
16026
16027 if (vmode == V8QImode)
16028 {
16029 pair = gen_reg_rtx (V16QImode);
16030 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16031 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16032 }
16033 else
16034 {
16035 pair = gen_reg_rtx (OImode);
16036 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16037 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16038 }
16039 }
16040 }
16041
16042 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16043 NELT is the number of elements in the vector. */
16044
16045 void
16046 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16047 unsigned int nelt)
16048 {
16049 machine_mode vmode = GET_MODE (target);
16050 bool one_vector_p = rtx_equal_p (op0, op1);
16051 rtx mask;
16052
16053 /* The TBL instruction does not use a modulo index, so we must take care
16054 of that ourselves. */
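/* For example (illustrative only): for a two-vector V16QImode permute
   NELT is 16, so each selector byte is ANDed with 31 to emulate the
   modulo behaviour that vec_perm expects but TBL does not provide.  */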
16055 mask = aarch64_simd_gen_const_vector_dup (vmode,
16056 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16057 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16058
16059 /* For big-endian, we also need to reverse the index within the vector
16060 (but not which vector). */
16061 if (BYTES_BIG_ENDIAN)
16062 {
16063 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16064 if (!one_vector_p)
16065 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16066 sel = expand_simple_binop (vmode, XOR, sel, mask,
16067 NULL, 0, OPTAB_LIB_WIDEN);
16068 }
16069 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16070 }
16071
16072 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
16073
16074 static void
16075 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16076 {
16077 emit_insn (gen_rtx_SET (target,
16078 gen_rtx_UNSPEC (GET_MODE (target),
16079 gen_rtvec (2, op0, op1), code)));
16080 }
16081
16082 /* Expand an SVE vec_perm with the given operands. */
16083
16084 void
16085 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16086 {
16087 machine_mode data_mode = GET_MODE (target);
16088 machine_mode sel_mode = GET_MODE (sel);
16089 /* Enforced by the pattern condition. */
16090 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16091
16092 /* Note: vec_perm indices are supposed to wrap when they go beyond the
16093 size of the two value vectors, i.e. the upper bits of the indices
16094 are effectively ignored. SVE TBL instead produces 0 for any
16095 out-of-range indices, so we need to modulo all the vec_perm indices
16096 to ensure they are all in range. */
16097 rtx sel_reg = force_reg (sel_mode, sel);
16098
16099 /* Check if the sel only references the first values vector. */
16100 if (GET_CODE (sel) == CONST_VECTOR
16101 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16102 {
16103 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16104 return;
16105 }
16106
16107 /* Check if the two values vectors are the same. */
16108 if (rtx_equal_p (op0, op1))
16109 {
16110 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16111 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16112 NULL, 0, OPTAB_DIRECT);
16113 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16114 return;
16115 }
16116
16117 /* Run TBL on each value vector and combine the results. */
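/* Illustrative sketch with 4-element vectors and selector { 1, 6, 2, 5 }:
   TBL (op0, { 1, 6, 2, 5 }) gives { op0[1], 0, op0[2], 0 }, since 6 and 5
   are out of range, and TBL (op1, { 1, 6, 2, 5 } - 4) gives
   { 0, op1[2], 0, op1[1] }; ORing the two gives the required result.  */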
16118
16119 rtx res0 = gen_reg_rtx (data_mode);
16120 rtx res1 = gen_reg_rtx (data_mode);
16121 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16122 if (GET_CODE (sel) != CONST_VECTOR
16123 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16124 {
16125 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16126 2 * nunits - 1);
16127 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16128 NULL, 0, OPTAB_DIRECT);
16129 }
16130 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16131 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16132 NULL, 0, OPTAB_DIRECT);
16133 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16134 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16135 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16136 else
16137 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16138 }
16139
16140 /* Recognize patterns suitable for the TRN instructions. */
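/* For example (little-endian element numbering): with four elements per
   vector, the selector { 0, 4, 2, 6 } matches TRN1 and { 1, 5, 3, 7 }
   matches TRN2.  */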
16141 static bool
16142 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16143 {
16144 HOST_WIDE_INT odd;
16145 poly_uint64 nelt = d->perm.length ();
16146 rtx out, in0, in1, x;
16147 machine_mode vmode = d->vmode;
16148
16149 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16150 return false;
16151
16152 /* Note that these are little-endian tests.
16153 We correct for big-endian later. */
16154 if (!d->perm[0].is_constant (&odd)
16155 || (odd != 0 && odd != 1)
16156 || !d->perm.series_p (0, 2, odd, 2)
16157 || !d->perm.series_p (1, 2, nelt + odd, 2))
16158 return false;
16159
16160 /* Success! */
16161 if (d->testing_p)
16162 return true;
16163
16164 in0 = d->op0;
16165 in1 = d->op1;
16166 /* We don't need a big-endian lane correction for SVE; see the comment
16167 at the head of aarch64-sve.md for details. */
16168 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16169 {
16170 x = in0, in0 = in1, in1 = x;
16171 odd = !odd;
16172 }
16173 out = d->target;
16174
16175 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16176 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16177 return true;
16178 }
16179
16180 /* Recognize patterns suitable for the UZP instructions. */
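/* For example (little-endian element numbering): with four elements per
   vector, the selector { 0, 2, 4, 6 } matches UZP1 and { 1, 3, 5, 7 }
   matches UZP2.  */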
16181 static bool
16182 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16183 {
16184 HOST_WIDE_INT odd;
16185 rtx out, in0, in1, x;
16186 machine_mode vmode = d->vmode;
16187
16188 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16189 return false;
16190
16191 /* Note that these are little-endian tests.
16192 We correct for big-endian later. */
16193 if (!d->perm[0].is_constant (&odd)
16194 || (odd != 0 && odd != 1)
16195 || !d->perm.series_p (0, 1, odd, 2))
16196 return false;
16197
16198 /* Success! */
16199 if (d->testing_p)
16200 return true;
16201
16202 in0 = d->op0;
16203 in1 = d->op1;
16204 /* We don't need a big-endian lane correction for SVE; see the comment
16205 at the head of aarch64-sve.md for details. */
16206 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16207 {
16208 x = in0, in0 = in1, in1 = x;
16209 odd = !odd;
16210 }
16211 out = d->target;
16212
16213 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16214 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16215 return true;
16216 }
16217
16218 /* Recognize patterns suitable for the ZIP instructions. */
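/* For example (little-endian element numbering): with four elements per
   vector, the selector { 0, 4, 1, 5 } matches ZIP1 and { 2, 6, 3, 7 }
   matches ZIP2.  */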
16219 static bool
16220 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16221 {
16222 unsigned int high;
16223 poly_uint64 nelt = d->perm.length ();
16224 rtx out, in0, in1, x;
16225 machine_mode vmode = d->vmode;
16226
16227 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16228 return false;
16229
16230 /* Note that these are little-endian tests.
16231 We correct for big-endian later. */
16232 poly_uint64 first = d->perm[0];
16233 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16234 || !d->perm.series_p (0, 2, first, 1)
16235 || !d->perm.series_p (1, 2, first + nelt, 1))
16236 return false;
16237 high = maybe_ne (first, 0U);
16238
16239 /* Success! */
16240 if (d->testing_p)
16241 return true;
16242
16243 in0 = d->op0;
16244 in1 = d->op1;
16245 /* We don't need a big-endian lane correction for SVE; see the comment
16246 at the head of aarch64-sve.md for details. */
16247 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16248 {
16249 x = in0, in0 = in1, in1 = x;
16250 high = !high;
16251 }
16252 out = d->target;
16253
16254 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16255 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16256 return true;
16257 }
16258
16259 /* Recognize patterns for the EXT insn. */
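/* For example (little-endian element numbering): with four elements per
   vector, the selector { 1, 2, 3, 4 } matches EXT with an element offset
   of 1, taking the top three elements of the first vector followed by
   the first element of the second.  */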
16260
16261 static bool
16262 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16263 {
16264 HOST_WIDE_INT location;
16265 rtx offset;
16266
16267 /* The first element always refers to the first vector.
16268 Check if the extracted indices are increasing by one. */
16269 if (d->vec_flags == VEC_SVE_PRED
16270 || !d->perm[0].is_constant (&location)
16271 || !d->perm.series_p (0, 1, location, 1))
16272 return false;
16273
16274 /* Success! */
16275 if (d->testing_p)
16276 return true;
16277
16278 /* The case where (location == 0) is a no-op for both big- and little-endian,
16279 and is removed by the mid-end at optimization levels -O1 and higher.
16280
16281 We don't need a big-endian lane correction for SVE; see the comment
16282 at the head of aarch64-sve.md for details. */
16283 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16284 {
16285 /* After setup, we want the high elements of the first vector (stored
16286 at the LSB end of the register), and the low elements of the second
16287 vector (stored at the MSB end of the register). So swap. */
16288 std::swap (d->op0, d->op1);
16289 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16290 to_constant () is safe since this is restricted to Advanced SIMD
16291 vectors. */
16292 location = d->perm.length ().to_constant () - location;
16293 }
16294
16295 offset = GEN_INT (location);
16296 emit_set_insn (d->target,
16297 gen_rtx_UNSPEC (d->vmode,
16298 gen_rtvec (3, d->op0, d->op1, offset),
16299 UNSPEC_EXT));
16300 return true;
16301 }
16302
16303 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16304 within each 64-bit, 32-bit or 16-bit granule. */
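/* For example (illustrative only): for V16QImode the selector
   { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 } reverses the
   bytes within each 64-bit granule and so maps to REV64.  */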
16305
16306 static bool
16307 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16308 {
16309 HOST_WIDE_INT diff;
16310 unsigned int i, size, unspec;
16311 machine_mode pred_mode;
16312
16313 if (d->vec_flags == VEC_SVE_PRED
16314 || !d->one_vector_p
16315 || !d->perm[0].is_constant (&diff))
16316 return false;
16317
16318 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16319 if (size == 8)
16320 {
16321 unspec = UNSPEC_REV64;
16322 pred_mode = VNx2BImode;
16323 }
16324 else if (size == 4)
16325 {
16326 unspec = UNSPEC_REV32;
16327 pred_mode = VNx4BImode;
16328 }
16329 else if (size == 2)
16330 {
16331 unspec = UNSPEC_REV16;
16332 pred_mode = VNx8BImode;
16333 }
16334 else
16335 return false;
16336
16337 unsigned int step = diff + 1;
16338 for (i = 0; i < step; ++i)
16339 if (!d->perm.series_p (i, step, diff - i, step))
16340 return false;
16341
16342 /* Success! */
16343 if (d->testing_p)
16344 return true;
16345
16346 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16347 if (d->vec_flags == VEC_SVE_DATA)
16348 {
16349 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16350 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16351 UNSPEC_MERGE_PTRUE);
16352 }
16353 emit_set_insn (d->target, src);
16354 return true;
16355 }
16356
16357 /* Recognize patterns for the REV insn, which reverses elements within
16358 a full vector. */
16359
16360 static bool
16361 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16362 {
16363 poly_uint64 nelt = d->perm.length ();
16364
16365 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16366 return false;
16367
16368 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16369 return false;
16370
16371 /* Success! */
16372 if (d->testing_p)
16373 return true;
16374
16375 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16376 emit_set_insn (d->target, src);
16377 return true;
16378 }
16379
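/* Recognize broadcast permutations, in which every element of the result
   is a copy of a single element of the first input, and implement them
   as a vec_duplicate (DUP) of the selected lane.  */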
16380 static bool
16381 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16382 {
16383 rtx out = d->target;
16384 rtx in0;
16385 HOST_WIDE_INT elt;
16386 machine_mode vmode = d->vmode;
16387 rtx lane;
16388
16389 if (d->vec_flags == VEC_SVE_PRED
16390 || d->perm.encoding ().encoded_nelts () != 1
16391 || !d->perm[0].is_constant (&elt))
16392 return false;
16393
16394 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16395 return false;
16396
16397 /* Success! */
16398 if (d->testing_p)
16399 return true;
16400
16401 /* The generic preparation in aarch64_expand_vec_perm_const_1
16402 swaps the operand order and the permute indices if it finds
16403 d->perm[0] to be in the second operand. Thus, we can always
16404 use d->op0 and need not do any extra arithmetic to get the
16405 correct lane number. */
16406 in0 = d->op0;
16407 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16408
16409 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16410 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16411 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16412 return true;
16413 }
16414
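/* Try to implement D using an Advanced SIMD TBL instruction.  */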
16415 static bool
16416 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16417 {
16418 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16419 machine_mode vmode = d->vmode;
16420
16421 /* Make sure that the indices are constant. */
16422 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16423 for (unsigned int i = 0; i < encoded_nelts; ++i)
16424 if (!d->perm[i].is_constant ())
16425 return false;
16426
16427 if (d->testing_p)
16428 return true;
16429
16430 /* Generic code will try constant permutation twice: once with the
16431 original mode and again with the elements lowered to QImode.
16432 So wait and don't do the selector expansion ourselves. */
16433 if (vmode != V8QImode && vmode != V16QImode)
16434 return false;
16435
16436 /* to_constant is safe since this routine is specific to Advanced SIMD
16437 vectors. */
16438 unsigned int nelt = d->perm.length ().to_constant ();
16439 for (unsigned int i = 0; i < nelt; ++i)
16440 /* If big-endian and two vectors we end up with a weird mixed-endian
16441 mode on NEON. Reverse the index within each word but not the word
16442 itself. to_constant is safe because we checked is_constant above. */
16443 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16444 ? d->perm[i].to_constant () ^ (nelt - 1)
16445 : d->perm[i].to_constant ());
16446
16447 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16448 sel = force_reg (vmode, sel);
16449
16450 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16451 return true;
16452 }
16453
16454 /* Try to implement D using an SVE TBL instruction. */
16455
16456 static bool
16457 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16458 {
16459 unsigned HOST_WIDE_INT nelt;
16460
16461 /* Permuting two variable-length vectors could overflow the
16462 index range. */
16463 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16464 return false;
16465
16466 if (d->testing_p)
16467 return true;
16468
16469 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16470 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16471 if (d->one_vector_p)
16472 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16473 else
16474 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16475 return true;
16476 }
16477
16478 static bool
16479 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16480 {
16481 /* The pattern matching functions above are written to look for a small
16482 number to begin the sequence (0, 1, N/2). If we begin with an index
16483 from the second operand, we can swap the operands. */
16484 poly_int64 nelt = d->perm.length ();
16485 if (known_ge (d->perm[0], nelt))
16486 {
16487 d->perm.rotate_inputs (1);
16488 std::swap (d->op0, d->op1);
16489 }
16490
16491 if ((d->vec_flags == VEC_ADVSIMD
16492 || d->vec_flags == VEC_SVE_DATA
16493 || d->vec_flags == VEC_SVE_PRED)
16494 && known_gt (nelt, 1))
16495 {
16496 if (aarch64_evpc_rev_local (d))
16497 return true;
16498 else if (aarch64_evpc_rev_global (d))
16499 return true;
16500 else if (aarch64_evpc_ext (d))
16501 return true;
16502 else if (aarch64_evpc_dup (d))
16503 return true;
16504 else if (aarch64_evpc_zip (d))
16505 return true;
16506 else if (aarch64_evpc_uzp (d))
16507 return true;
16508 else if (aarch64_evpc_trn (d))
16509 return true;
16510 if (d->vec_flags == VEC_SVE_DATA)
16511 return aarch64_evpc_sve_tbl (d);
16512 else if (d->vec_flags == VEC_ADVSIMD)
16513 return aarch64_evpc_tbl (d);
16514 }
16515 return false;
16516 }
16517
16518 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16519
16520 static bool
16521 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16522 rtx op1, const vec_perm_indices &sel)
16523 {
16524 struct expand_vec_perm_d d;
16525
16526 /* Check whether the mask can be applied to a single vector. */
16527 if (sel.ninputs () == 1
16528 || (op0 && rtx_equal_p (op0, op1)))
16529 d.one_vector_p = true;
16530 else if (sel.all_from_input_p (0))
16531 {
16532 d.one_vector_p = true;
16533 op1 = op0;
16534 }
16535 else if (sel.all_from_input_p (1))
16536 {
16537 d.one_vector_p = true;
16538 op0 = op1;
16539 }
16540 else
16541 d.one_vector_p = false;
16542
16543 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16544 sel.nelts_per_input ());
16545 d.vmode = vmode;
16546 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16547 d.target = target;
16548 d.op0 = op0;
16549 d.op1 = op1;
16550 d.testing_p = !target;
16551
16552 if (!d.testing_p)
16553 return aarch64_expand_vec_perm_const_1 (&d);
16554
16555 rtx_insn *last = get_last_insn ();
16556 bool ret = aarch64_expand_vec_perm_const_1 (&d);
16557 gcc_assert (last == get_last_insn ());
16558
16559 return ret;
16560 }
16561
16562 /* Generate a byte permute mask for a register of mode MODE,
16563 which has NUNITS units. */
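/* For example (illustrative only): for V8HImode (NUNITS == 8, 2-byte
   units) the generated byte indices are { 1, 0, 3, 2, 5, 4, 7, 6,
   9, 8, 11, 10, 13, 12, 15, 14 }, i.e. the bytes are swapped within
   each 16-bit unit.  */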
16564
16565 rtx
16566 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
16567 {
16568 /* We have to reverse each vector because we don't have
16569 a permuted load that can reverse-load according to ABI rules. */
16570 rtx mask;
16571 rtvec v = rtvec_alloc (16);
16572 unsigned int i, j;
16573 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
16574
16575 gcc_assert (BYTES_BIG_ENDIAN);
16576 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
16577
16578 for (i = 0; i < nunits; i++)
16579 for (j = 0; j < usize; j++)
16580 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
16581 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
16582 return force_reg (V16QImode, mask);
16583 }
16584
16585 /* Return true if X is a valid second operand for the SVE instruction
16586 that implements integer comparison OP_CODE. */
16587
16588 static bool
16589 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
16590 {
16591 if (register_operand (x, VOIDmode))
16592 return true;
16593
16594 switch (op_code)
16595 {
16596 case LTU:
16597 case LEU:
16598 case GEU:
16599 case GTU:
16600 return aarch64_sve_cmp_immediate_p (x, false);
16601 case LT:
16602 case LE:
16603 case GE:
16604 case GT:
16605 case NE:
16606 case EQ:
16607 return aarch64_sve_cmp_immediate_p (x, true);
16608 default:
16609 gcc_unreachable ();
16610 }
16611 }
16612
16613 /* Use predicated SVE instructions to implement the equivalent of:
16614
16615 (set TARGET OP)
16616
16617 given that PTRUE is an all-true predicate of the appropriate mode. */
16618
16619 static void
16620 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
16621 {
16622 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16623 gen_rtvec (2, ptrue, op),
16624 UNSPEC_MERGE_PTRUE);
16625 rtx_insn *insn = emit_set_insn (target, unspec);
16626 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16627 }
16628
16629 /* Likewise, but also clobber the condition codes. */
16630
16631 static void
16632 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
16633 {
16634 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16635 gen_rtvec (2, ptrue, op),
16636 UNSPEC_MERGE_PTRUE);
16637 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
16638 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16639 }
16640
16641 /* Return the UNSPEC_COND_* code for comparison CODE. */
16642
16643 static unsigned int
16644 aarch64_unspec_cond_code (rtx_code code)
16645 {
16646 switch (code)
16647 {
16648 case NE:
16649 return UNSPEC_COND_NE;
16650 case EQ:
16651 return UNSPEC_COND_EQ;
16652 case LT:
16653 return UNSPEC_COND_LT;
16654 case GT:
16655 return UNSPEC_COND_GT;
16656 case LE:
16657 return UNSPEC_COND_LE;
16658 case GE:
16659 return UNSPEC_COND_GE;
16660 default:
16661 gcc_unreachable ();
16662 }
16663 }
16664
16665 /* Emit:
16666
16667 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
16668
16669 where <X> is the operation associated with comparison CODE. This form
16670 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
16671 semantics, such as when PRED might not be all-true and when comparing
16672 inactive lanes could have side effects. */
16673
16674 static void
16675 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
16676 rtx pred, rtx op0, rtx op1)
16677 {
16678 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
16679 gen_rtvec (3, pred, op0, op1),
16680 aarch64_unspec_cond_code (code));
16681 emit_set_insn (target, unspec);
16682 }
16683
16684 /* Expand an SVE integer comparison using the SVE equivalent of:
16685
16686 (set TARGET (CODE OP0 OP1)). */
16687
16688 void
16689 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
16690 {
16691 machine_mode pred_mode = GET_MODE (target);
16692 machine_mode data_mode = GET_MODE (op0);
16693
16694 if (!aarch64_sve_cmp_operand_p (code, op1))
16695 op1 = force_reg (data_mode, op1);
16696
16697 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16698 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16699 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
16700 }
16701
16702 /* Emit the SVE equivalent of:
16703
16704 (set TMP1 (CODE1 OP0 OP1))
16705 (set TMP2 (CODE2 OP0 OP1))
16706 (set TARGET (ior:PRED_MODE TMP1 TMP2))
16707
16708 PTRUE is an all-true predicate with the same mode as TARGET. */
16709
16710 static void
16711 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
16712 rtx ptrue, rtx op0, rtx op1)
16713 {
16714 machine_mode pred_mode = GET_MODE (ptrue);
16715 rtx tmp1 = gen_reg_rtx (pred_mode);
16716 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
16717 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
16718 rtx tmp2 = gen_reg_rtx (pred_mode);
16719 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
16720 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
16721 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
16722 }
16723
16724 /* Emit the SVE equivalent of:
16725
16726 (set TMP (CODE OP0 OP1))
16727 (set TARGET (not TMP))
16728
16729 PTRUE is an all-true predicate with the same mode as TARGET. */
16730
16731 static void
16732 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
16733 rtx op0, rtx op1)
16734 {
16735 machine_mode pred_mode = GET_MODE (ptrue);
16736 rtx tmp = gen_reg_rtx (pred_mode);
16737 aarch64_emit_sve_ptrue_op (tmp, ptrue,
16738 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
16739 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16740 }
16741
16742 /* Expand an SVE floating-point comparison using the SVE equivalent of:
16743
16744 (set TARGET (CODE OP0 OP1))
16745
16746 If CAN_INVERT_P is true, the caller can also handle inverted results;
16747 return true if the result is in fact inverted. */
16748
16749 bool
16750 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
16751 rtx op0, rtx op1, bool can_invert_p)
16752 {
16753 machine_mode pred_mode = GET_MODE (target);
16754 machine_mode data_mode = GET_MODE (op0);
16755
16756 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16757 switch (code)
16758 {
16759 case UNORDERED:
16760 /* UNORDERED has no immediate form. */
16761 op1 = force_reg (data_mode, op1);
16762 /* fall through */
16763 case LT:
16764 case LE:
16765 case GT:
16766 case GE:
16767 case EQ:
16768 case NE:
16769 {
16770 /* There is native support for the comparison. */
16771 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16772 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16773 return false;
16774 }
16775
16776 case LTGT:
16777 /* This is a trapping operation (LT or GT). */
16778 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
16779 return false;
16780
16781 case UNEQ:
16782 if (!flag_trapping_math)
16783 {
16784 /* This would trap for signaling NaNs. */
16785 op1 = force_reg (data_mode, op1);
16786 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
16787 return false;
16788 }
16789 /* fall through */
16790 case UNLT:
16791 case UNLE:
16792 case UNGT:
16793 case UNGE:
16794 if (flag_trapping_math)
16795 {
16796 /* Work out which elements are ordered. */
16797 rtx ordered = gen_reg_rtx (pred_mode);
16798 op1 = force_reg (data_mode, op1);
16799 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16800
16801 /* Test the opposite condition for the ordered elements,
16802 then invert the result. */
16803 if (code == UNEQ)
16804 code = NE;
16805 else
16806 code = reverse_condition_maybe_unordered (code);
16807 if (can_invert_p)
16808 {
16809 aarch64_emit_sve_predicated_cond (target, code,
16810 ordered, op0, op1);
16811 return true;
16812 }
16813 rtx tmp = gen_reg_rtx (pred_mode);
16814 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16815 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16816 return false;
16817 }
16818 break;
16819
16820 case ORDERED:
16821 /* ORDERED has no immediate form. */
16822 op1 = force_reg (data_mode, op1);
16823 break;
16824
16825 default:
16826 gcc_unreachable ();
16827 }
16828
16829 /* There is native support for the inverse comparison. */
16830 code = reverse_condition_maybe_unordered (code);
16831 if (can_invert_p)
16832 {
16833 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16834 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16835 return true;
16836 }
16837 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16838 return false;
16839 }
16840
16841 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16842 of the data being selected and CMP_MODE is the mode of the values being
16843 compared. */
16844
16845 void
16846 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16847 rtx *ops)
16848 {
16849 machine_mode pred_mode
16850 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16851 GET_MODE_SIZE (cmp_mode)).require ();
16852 rtx pred = gen_reg_rtx (pred_mode);
16853 if (FLOAT_MODE_P (cmp_mode))
16854 {
16855 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16856 ops[4], ops[5], true))
16857 std::swap (ops[1], ops[2]);
16858 }
16859 else
16860 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16861
16862 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16863 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16864 }
16865
16866 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16867 true. However due to issues with register allocation it is preferable
16868 to avoid tying integer scalar and FP scalar modes. Executing integer
16869 operations in general registers is better than treating them as scalar
16870 vector operations. This reduces latency and avoids redundant int<->FP
16871 moves. So tie modes if they are either the same class, or vector modes
16872 with other vector modes, vector structs or any scalar mode. */
16873
16874 static bool
16875 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16876 {
16877 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16878 return true;
16879
16880 /* We specifically want to allow elements of "structure" modes to
16881 be tieable to the structure. This more general condition allows
16882 other rarer situations too. The reason we don't extend this to
16883 predicate modes is that there are no predicate structure modes
16884 nor any specific instructions for extracting part of a predicate
16885 register. */
16886 if (aarch64_vector_data_mode_p (mode1)
16887 && aarch64_vector_data_mode_p (mode2))
16888 return true;
16889
16890 /* Also allow any scalar modes with vectors. */
16891 if (aarch64_vector_mode_supported_p (mode1)
16892 || aarch64_vector_mode_supported_p (mode2))
16893 return true;
16894
16895 return false;
16896 }
16897
16898 /* Return a new RTX holding the result of moving POINTER forward by
16899 AMOUNT bytes. */
16900
16901 static rtx
16902 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16903 {
16904 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16905
16906 return adjust_automodify_address (pointer, GET_MODE (pointer),
16907 next, amount);
16908 }
16909
16910 /* Return a new RTX holding the result of moving POINTER forward by the
16911 size of the mode it points to. */
16912
16913 static rtx
16914 aarch64_progress_pointer (rtx pointer)
16915 {
16916 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16917 }
16918
16919 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16920 MODE bytes. */
16921
16922 static void
16923 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16924 machine_mode mode)
16925 {
16926 rtx reg = gen_reg_rtx (mode);
16927
16928 /* "Cast" the pointers to the correct mode. */
16929 *src = adjust_address (*src, mode, 0);
16930 *dst = adjust_address (*dst, mode, 0);
16931 /* Emit the memcpy. */
16932 emit_move_insn (reg, *src);
16933 emit_move_insn (*dst, reg);
16934 /* Move the pointers forward. */
16935 *src = aarch64_progress_pointer (*src);
16936 *dst = aarch64_progress_pointer (*dst);
16937 }
16938
16939 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16940 we succeed, otherwise return false. */
16941
16942 bool
16943 aarch64_expand_movmem (rtx *operands)
16944 {
16945 int n, mode_bits;
16946 rtx dst = operands[0];
16947 rtx src = operands[1];
16948 rtx base;
16949 machine_mode cur_mode = BLKmode, next_mode;
16950 bool speed_p = !optimize_function_for_size_p (cfun);
16951
16952 /* When optimizing for size, give a better estimate of the length of a
16953 memcpy call, but use the default otherwise. Moves larger than 8 bytes
16954 will always require an even number of instructions to do. And each
16955 operation requires both a load and a store, so divide the max number by 2.
16956 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16957
16958 /* We can't do anything smart if the amount to copy is not constant. */
16959 if (!CONST_INT_P (operands[2]))
16960 return false;
16961
16962 n = INTVAL (operands[2]);
16963
16964 /* Try to keep the number of instructions low. For all cases we will do at
16965 most two moves for the residual amount, since we'll always overlap the
16966 remainder. */
16967 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16968 return false;
16969
16970 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16971 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16972
16973 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16974 src = adjust_automodify_address (src, VOIDmode, base, 0);
16975
16976 /* Convert n to bits to make the rest of the code simpler. */
16977 n = n * BITS_PER_UNIT;
16978
16979 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
16980 larger than TImode, but we should not use them for loads/stores here. */
16981 const int copy_limit = GET_MODE_BITSIZE (TImode);
16982
16983 while (n > 0)
16984 {
16985 /* Find the largest mode in which to do the copy without over-reading
16986 or writing. */
16987 opt_scalar_int_mode mode_iter;
16988 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16989 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
16990 cur_mode = mode_iter.require ();
16991
16992 gcc_assert (cur_mode != BLKmode);
16993
16994 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16995 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16996
16997 n -= mode_bits;
16998
16999 /* Do certain trailing copies as overlapping if it's going to be
17000 cheaper, i.e. fewer instructions to do so. For instance, for a 15
17001 byte copy it's more efficient to do two overlapping 8 byte copies than
17002 8 + 6 + 1. */
17003 if (n > 0 && n <= 8 * BITS_PER_UNIT)
17004 {
17005 next_mode = smallest_mode_for_size (n, MODE_INT);
17006 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17007 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17008 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17009 n = n_bits;
17010 }
17011 }
17012
17013 return true;
17014 }
17015
17016 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17017 SImode stores. Handle the case when the constant has identical
17018 bottom and top halves. This is beneficial when the two stores can be
17019 merged into an STP and we avoid synthesising potentially expensive
17020 immediates twice. Return true if such a split is possible. */
17021
17022 bool
17023 aarch64_split_dimode_const_store (rtx dst, rtx src)
17024 {
17025 rtx lo = gen_lowpart (SImode, src);
17026 rtx hi = gen_highpart_mode (SImode, DImode, src);
17027
17028 bool size_p = optimize_function_for_size_p (cfun);
17029
17030 if (!rtx_equal_p (lo, hi))
17031 return false;
17032
17033 unsigned int orig_cost
17034 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17035 unsigned int lo_cost
17036 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17037
17038 /* We want to transform:
17039 MOV x1, 49370
17040 MOVK x1, 0x140, lsl 16
17041 MOVK x1, 0xc0da, lsl 32
17042 MOVK x1, 0x140, lsl 48
17043 STR x1, [x0]
17044 into:
17045 MOV w1, 49370
17046 MOVK w1, 0x140, lsl 16
17047 STP w1, w1, [x0]
17048 So we want to perform this only when we save two instructions
17049 or more. When optimizing for size, however, accept any code size
17050 savings we can. */
17051 if (size_p && orig_cost <= lo_cost)
17052 return false;
17053
17054 if (!size_p
17055 && (orig_cost <= lo_cost + 1))
17056 return false;
17057
17058 rtx mem_lo = adjust_address (dst, SImode, 0);
17059 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17060 return false;
17061
17062 rtx tmp_reg = gen_reg_rtx (SImode);
17063 aarch64_expand_mov_immediate (tmp_reg, lo);
17064 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17065 /* Don't emit an explicit store pair as this may not always be profitable.
17066 Let the sched-fusion logic decide whether to merge them. */
17067 emit_move_insn (mem_lo, tmp_reg);
17068 emit_move_insn (mem_hi, tmp_reg);
17069
17070 return true;
17071 }
17072
17073 /* Generate RTL for a conditional branch with rtx comparison CODE in
17074 mode CC_MODE. The destination of the unlikely conditional branch
17075 is LABEL_REF. */
17076
17077 void
17078 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17079 rtx label_ref)
17080 {
17081 rtx x;
17082 x = gen_rtx_fmt_ee (code, VOIDmode,
17083 gen_rtx_REG (cc_mode, CC_REGNUM),
17084 const0_rtx);
17085
17086 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17087 gen_rtx_LABEL_REF (VOIDmode, label_ref),
17088 pc_rtx);
17089 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17090 }
17091
17092 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17093
17094 OP1 represents the TImode destination operand 1
17095 OP2 represents the TImode destination operand 2
17096 LOW_DEST represents the low half (DImode) of TImode operand 0
17097 LOW_IN1 represents the low half (DImode) of TImode operand 1
17098 LOW_IN2 represents the low half (DImode) of TImode operand 2
17099 HIGH_DEST represents the high half (DImode) of TImode operand 0
17100 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17101 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17102
17103 void
17104 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17105 rtx *low_in1, rtx *low_in2,
17106 rtx *high_dest, rtx *high_in1,
17107 rtx *high_in2)
17108 {
17109 *low_dest = gen_reg_rtx (DImode);
17110 *low_in1 = gen_lowpart (DImode, op1);
17111 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17112 subreg_lowpart_offset (DImode, TImode));
17113 *high_dest = gen_reg_rtx (DImode);
17114 *high_in1 = gen_highpart (DImode, op1);
17115 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17116 subreg_highpart_offset (DImode, TImode));
17117 }
17118
17119 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17120
17121 This function differs from 'aarch64_addti_scratch_regs' in that
17122 OP1 can be an immediate constant (zero). We must call
17123 subreg_highpart_offset with DImode and TImode arguments, otherwise
17124 VOIDmode will be used for the const_int which generates an internal
17125 error from subreg_size_highpart_offset which does not expect a size of zero.
17126
17127 OP1 represents the TImode destination operand 1
17128 OP2 represents the TImode destination operand 2
17129 LOW_DEST represents the low half (DImode) of TImode operand 0
17130 LOW_IN1 represents the low half (DImode) of TImode operand 1
17131 LOW_IN2 represents the low half (DImode) of TImode operand 2
17132 HIGH_DEST represents the high half (DImode) of TImode operand 0
17133 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17134 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17135
17136
17137 void
17138 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17139 rtx *low_in1, rtx *low_in2,
17140 rtx *high_dest, rtx *high_in1,
17141 rtx *high_in2)
17142 {
17143 *low_dest = gen_reg_rtx (DImode);
17144 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17145 subreg_lowpart_offset (DImode, TImode));
17146
17147 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17148 subreg_lowpart_offset (DImode, TImode));
17149 *high_dest = gen_reg_rtx (DImode);
17150
17151 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17152 subreg_highpart_offset (DImode, TImode));
17153 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17154 subreg_highpart_offset (DImode, TImode));
17155 }
17156
17157 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17158
17159 OP0 represents the TImode destination operand 0
17160 LOW_DEST represents the low half (DImode) of TImode operand 0
17161 LOW_IN1 represents the low half (DImode) of TImode operand 1
17162 LOW_IN2 represents the low half (DImode) of TImode operand 2
17163 HIGH_DEST represents the high half (DImode) of TImode operand 0
17164 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17165 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17166 UNSIGNED_P is true if the operation is being performed on unsigned
17167 values. */
17168 void
17169 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17170 rtx low_in2, rtx high_dest, rtx high_in1,
17171 rtx high_in2, bool unsigned_p)
17172 {
17173 if (low_in2 == const0_rtx)
17174 {
17175 low_dest = low_in1;
17176 high_in2 = force_reg (DImode, high_in2);
17177 if (unsigned_p)
17178 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17179 else
17180 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17181 }
17182 else
17183 {
17184 if (CONST_INT_P (low_in2))
17185 {
17186 high_in2 = force_reg (DImode, high_in2);
17187 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17188 GEN_INT (-INTVAL (low_in2))));
17189 }
17190 else
17191 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17192
17193 if (unsigned_p)
17194 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17195 else
17196 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17197 }
17198
17199 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17200 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17201
17202 }
17203
17204 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17205
17206 static unsigned HOST_WIDE_INT
17207 aarch64_asan_shadow_offset (void)
17208 {
17209 return (HOST_WIDE_INT_1 << 36);
17210 }
17211
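/* Implement TARGET_GEN_CCMP_FIRST. Expand the first comparison, of
   TREEOP0 and TREEOP1 with code CODE, in a conditional-compare sequence.
   The operand set-up is emitted into *PREP_SEQ and the comparison itself
   into *GEN_SEQ. Return an rtx that compares the CC register against
   zero, or NULL_RTX if the comparison cannot be handled.  */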
17212 static rtx
17213 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17214 int code, tree treeop0, tree treeop1)
17215 {
17216 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17217 rtx op0, op1;
17218 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17219 insn_code icode;
17220 struct expand_operand ops[4];
17221
17222 start_sequence ();
17223 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17224
17225 op_mode = GET_MODE (op0);
17226 if (op_mode == VOIDmode)
17227 op_mode = GET_MODE (op1);
17228
17229 switch (op_mode)
17230 {
17231 case E_QImode:
17232 case E_HImode:
17233 case E_SImode:
17234 cmp_mode = SImode;
17235 icode = CODE_FOR_cmpsi;
17236 break;
17237
17238 case E_DImode:
17239 cmp_mode = DImode;
17240 icode = CODE_FOR_cmpdi;
17241 break;
17242
17243 case E_SFmode:
17244 cmp_mode = SFmode;
17245 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17246 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17247 break;
17248
17249 case E_DFmode:
17250 cmp_mode = DFmode;
17251 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17252 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17253 break;
17254
17255 default:
17256 end_sequence ();
17257 return NULL_RTX;
17258 }
17259
17260 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17261 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17262 if (!op0 || !op1)
17263 {
17264 end_sequence ();
17265 return NULL_RTX;
17266 }
17267 *prep_seq = get_insns ();
17268 end_sequence ();
17269
17270 create_fixed_operand (&ops[0], op0);
17271 create_fixed_operand (&ops[1], op1);
17272
17273 start_sequence ();
17274 if (!maybe_expand_insn (icode, 2, ops))
17275 {
17276 end_sequence ();
17277 return NULL_RTX;
17278 }
17279 *gen_seq = get_insns ();
17280 end_sequence ();
17281
17282 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17283 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17284 }
17285
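/* Implement TARGET_GEN_CCMP_NEXT. Expand a subsequent comparison, of
   TREEOP0 and TREEOP1 with code CMP_CODE, in a conditional-compare
   sequence whose previous comparison is PREV; BIT_CODE says how the two
   results are combined. Instructions are appended to *PREP_SEQ and
   *GEN_SEQ as for aarch64_gen_ccmp_first. Return an rtx that compares
   the resulting CC value against zero, or NULL_RTX if the comparison
   cannot be handled.  */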
17286 static rtx
17287 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17288 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17289 {
17290 rtx op0, op1, target;
17291 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17292 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17293 insn_code icode;
17294 struct expand_operand ops[6];
17295 int aarch64_cond;
17296
17297 push_to_sequence (*prep_seq);
17298 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17299
17300 op_mode = GET_MODE (op0);
17301 if (op_mode == VOIDmode)
17302 op_mode = GET_MODE (op1);
17303
17304 switch (op_mode)
17305 {
17306 case E_QImode:
17307 case E_HImode:
17308 case E_SImode:
17309 cmp_mode = SImode;
17310 icode = CODE_FOR_ccmpsi;
17311 break;
17312
17313 case E_DImode:
17314 cmp_mode = DImode;
17315 icode = CODE_FOR_ccmpdi;
17316 break;
17317
17318 case E_SFmode:
17319 cmp_mode = SFmode;
17320 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17321 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17322 break;
17323
17324 case E_DFmode:
17325 cmp_mode = DFmode;
17326 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17327 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17328 break;
17329
17330 default:
17331 end_sequence ();
17332 return NULL_RTX;
17333 }
17334
17335 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17336 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17337 if (!op0 || !op1)
17338 {
17339 end_sequence ();
17340 return NULL_RTX;
17341 }
17342 *prep_seq = get_insns ();
17343 end_sequence ();
17344
17345 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17346 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17347
17348 if (bit_code != AND)
17349 {
17350 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17351 GET_MODE (XEXP (prev, 0))),
17352 VOIDmode, XEXP (prev, 0), const0_rtx);
17353 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17354 }
17355
17356 create_fixed_operand (&ops[0], XEXP (prev, 0));
17357 create_fixed_operand (&ops[1], target);
17358 create_fixed_operand (&ops[2], op0);
17359 create_fixed_operand (&ops[3], op1);
17360 create_fixed_operand (&ops[4], prev);
17361 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17362
17363 push_to_sequence (*gen_seq);
17364 if (!maybe_expand_insn (icode, 6, ops))
17365 {
17366 end_sequence ();
17367 return NULL_RTX;
17368 }
17369
17370 *gen_seq = get_insns ();
17371 end_sequence ();
17372
17373 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17374 }
17375
17376 #undef TARGET_GEN_CCMP_FIRST
17377 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17378
17379 #undef TARGET_GEN_CCMP_NEXT
17380 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17381
17382 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17383 instruction fusion of some sort. */
17384
17385 static bool
17386 aarch64_macro_fusion_p (void)
17387 {
17388 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17389 }
17390
17391
17392 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17393 should be kept together during scheduling. */
17394
17395 static bool
17396 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17397 {
17398 rtx set_dest;
17399 rtx prev_set = single_set (prev);
17400 rtx curr_set = single_set (curr);
17401 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
17402 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17403
17404 if (!aarch64_macro_fusion_p ())
17405 return false;
17406
17407 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17408 {
17409 /* We are trying to match:
17410 prev (mov) == (set (reg r0) (const_int imm16))
17411 curr (movk) == (set (zero_extract (reg r0)
17412 (const_int 16)
17413 (const_int 16))
17414 (const_int imm16_1)) */
17415
17416 set_dest = SET_DEST (curr_set);
17417
17418 if (GET_CODE (set_dest) == ZERO_EXTRACT
17419 && CONST_INT_P (SET_SRC (curr_set))
17420 && CONST_INT_P (SET_SRC (prev_set))
17421 && CONST_INT_P (XEXP (set_dest, 2))
17422 && INTVAL (XEXP (set_dest, 2)) == 16
17423 && REG_P (XEXP (set_dest, 0))
17424 && REG_P (SET_DEST (prev_set))
17425 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17426 {
17427 return true;
17428 }
17429 }
17430
17431 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17432 {
17433
17434 /* We're trying to match:
17435 prev (adrp) == (set (reg r1)
17436 (high (symbol_ref ("SYM"))))
17437 curr (add) == (set (reg r0)
17438 (lo_sum (reg r1)
17439 (symbol_ref ("SYM"))))
17440 Note that r0 need not necessarily be the same as r1, especially
17441 during pre-regalloc scheduling. */
17442
17443 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17444 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17445 {
17446 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17447 && REG_P (XEXP (SET_SRC (curr_set), 0))
17448 && REGNO (XEXP (SET_SRC (curr_set), 0))
17449 == REGNO (SET_DEST (prev_set))
17450 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17451 XEXP (SET_SRC (curr_set), 1)))
17452 return true;
17453 }
17454 }
17455
17456 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17457 {
17458
17459 /* We're trying to match:
17460 prev (movk) == (set (zero_extract (reg r0)
17461 (const_int 16)
17462 (const_int 32))
17463 (const_int imm16_1))
17464 curr (movk) == (set (zero_extract (reg r0)
17465 (const_int 16)
17466 (const_int 48))
17467 (const_int imm16_2)) */
17468
17469 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17470 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17471 && REG_P (XEXP (SET_DEST (prev_set), 0))
17472 && REG_P (XEXP (SET_DEST (curr_set), 0))
17473 && REGNO (XEXP (SET_DEST (prev_set), 0))
17474 == REGNO (XEXP (SET_DEST (curr_set), 0))
17475 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17476 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17477 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17478 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17479 && CONST_INT_P (SET_SRC (prev_set))
17480 && CONST_INT_P (SET_SRC (curr_set)))
17481 return true;
17482
17483 }
17484 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17485 {
17486 /* We're trying to match:
17487 prev (adrp) == (set (reg r0)
17488 (high (symbol_ref ("SYM"))))
17489 curr (ldr) == (set (reg r1)
17490 (mem (lo_sum (reg r0)
17491 (symbol_ref ("SYM")))))
17492 or
17493 curr (ldr) == (set (reg r1)
17494 (zero_extend (mem
17495 (lo_sum (reg r0)
17496 (symbol_ref ("SYM")))))) */
17497 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17498 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17499 {
17500 rtx curr_src = SET_SRC (curr_set);
17501
17502 if (GET_CODE (curr_src) == ZERO_EXTEND)
17503 curr_src = XEXP (curr_src, 0);
17504
17505 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17506 && REG_P (XEXP (XEXP (curr_src, 0), 0))
17507 && REGNO (XEXP (XEXP (curr_src, 0), 0))
17508 == REGNO (SET_DEST (prev_set))
17509 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17510 XEXP (SET_SRC (prev_set), 0)))
17511 return true;
17512 }
17513 }
17514
17515 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
17516 && aarch_crypto_can_dual_issue (prev, curr))
17517 return true;
17518
17519 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17520 && any_condjump_p (curr))
17521 {
17522 unsigned int condreg1, condreg2;
17523 rtx cc_reg_1;
17524 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17525 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17526
17527 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17528 && prev
17529 && modified_in_p (cc_reg_1, prev))
17530 {
17531 enum attr_type prev_type = get_attr_type (prev);
17532
17533 /* FIXME: this misses some instructions which are considered simple
17534 arithmetic for ThunderX. Simple shifts are missed here. */
17535 if (prev_type == TYPE_ALUS_SREG
17536 || prev_type == TYPE_ALUS_IMM
17537 || prev_type == TYPE_LOGICS_REG
17538 || prev_type == TYPE_LOGICS_IMM)
17539 return true;
17540 }
17541 }
17542
17543 if (prev_set
17544 && curr_set
17545 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17546 && any_condjump_p (curr))
17547 {
17548 /* We're trying to match:
17549 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17550 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
17551 (const_int 0))
17552 (label_ref ("SYM"))
17553 (pc)) */
17554 if (SET_DEST (curr_set) == (pc_rtx)
17555 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
17556 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
17557 && REG_P (SET_DEST (prev_set))
17558 && REGNO (SET_DEST (prev_set))
17559 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
17560 {
17561 /* Fuse ALU operations followed by conditional branch instruction. */
17562 switch (get_attr_type (prev))
17563 {
17564 case TYPE_ALU_IMM:
17565 case TYPE_ALU_SREG:
17566 case TYPE_ADC_REG:
17567 case TYPE_ADC_IMM:
17568 case TYPE_ADCS_REG:
17569 case TYPE_ADCS_IMM:
17570 case TYPE_LOGIC_REG:
17571 case TYPE_LOGIC_IMM:
17572 case TYPE_CSEL:
17573 case TYPE_ADR:
17574 case TYPE_MOV_IMM:
17575 case TYPE_SHIFT_REG:
17576 case TYPE_SHIFT_IMM:
17577 case TYPE_BFM:
17578 case TYPE_RBIT:
17579 case TYPE_REV:
17580 case TYPE_EXTEND:
17581 return true;
17582
17583 default:;
17584 }
17585 }
17586 }
17587
17588 return false;
17589 }
17590
17591 /* Return true iff the instruction fusion described by OP is enabled. */
17592
17593 bool
17594 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
17595 {
17596 return (aarch64_tune_params.fusible_ops & op) != 0;
17597 }
17598
17599 /* If MEM is in the form of [base+offset], extract the two parts
17600 of the address into BASE and OFFSET, otherwise return false
17601 after clearing BASE and OFFSET. */
17602
17603 bool
17604 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
17605 {
17606 rtx addr;
17607
17608 gcc_assert (MEM_P (mem));
17609
17610 addr = XEXP (mem, 0);
17611
17612 if (REG_P (addr))
17613 {
17614 *base = addr;
17615 *offset = const0_rtx;
17616 return true;
17617 }
17618
17619 if (GET_CODE (addr) == PLUS
17620 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
17621 {
17622 *base = XEXP (addr, 0);
17623 *offset = XEXP (addr, 1);
17624 return true;
17625 }
17626
17627 *base = NULL_RTX;
17628 *offset = NULL_RTX;
17629
17630 return false;
17631 }
17632
17633 /* Types for scheduling fusion. */
17634 enum sched_fusion_type
17635 {
17636 SCHED_FUSION_NONE = 0,
17637 SCHED_FUSION_LD_SIGN_EXTEND,
17638 SCHED_FUSION_LD_ZERO_EXTEND,
17639 SCHED_FUSION_LD,
17640 SCHED_FUSION_ST,
17641 SCHED_FUSION_NUM
17642 };
17643
17644 /* If INSN is a load or store whose address is in the form [base+offset],
17645 extract the two parts into BASE and OFFSET. Return the scheduling
17646 fusion type of INSN. */
17647
17648 static enum sched_fusion_type
17649 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
17650 {
17651 rtx x, dest, src;
17652 enum sched_fusion_type fusion = SCHED_FUSION_LD;
17653
17654 gcc_assert (INSN_P (insn));
17655 x = PATTERN (insn);
17656 if (GET_CODE (x) != SET)
17657 return SCHED_FUSION_NONE;
17658
17659 src = SET_SRC (x);
17660 dest = SET_DEST (x);
17661
17662 machine_mode dest_mode = GET_MODE (dest);
17663
17664 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
17665 return SCHED_FUSION_NONE;
17666
17667 if (GET_CODE (src) == SIGN_EXTEND)
17668 {
17669 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
17670 src = XEXP (src, 0);
17671 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17672 return SCHED_FUSION_NONE;
17673 }
17674 else if (GET_CODE (src) == ZERO_EXTEND)
17675 {
17676 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
17677 src = XEXP (src, 0);
17678 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17679 return SCHED_FUSION_NONE;
17680 }
17681
17682 if (GET_CODE (src) == MEM && REG_P (dest))
17683 extract_base_offset_in_addr (src, base, offset);
17684 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
17685 {
17686 fusion = SCHED_FUSION_ST;
17687 extract_base_offset_in_addr (dest, base, offset);
17688 }
17689 else
17690 return SCHED_FUSION_NONE;
17691
17692 if (*base == NULL_RTX || *offset == NULL_RTX)
17693 fusion = SCHED_FUSION_NONE;
17694
17695 return fusion;
17696 }
17697
17698 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
17699
17700 Currently we only support fusing ldr or str instructions, so FUSION_PRI
17701 and PRI are only calculated for these instructions. For other instructions,
17702 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
17703 types of instruction fusion can be added by returning different priorities.
17704
17705 It's important that irrelevant instructions get the largest FUSION_PRI. */
17706
17707 static void
17708 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
17709 int *fusion_pri, int *pri)
17710 {
17711 int tmp, off_val;
17712 rtx base, offset;
17713 enum sched_fusion_type fusion;
17714
17715 gcc_assert (INSN_P (insn));
17716
17717 tmp = max_pri - 1;
17718 fusion = fusion_load_store (insn, &base, &offset);
17719 if (fusion == SCHED_FUSION_NONE)
17720 {
17721 *pri = tmp;
17722 *fusion_pri = tmp;
17723 return;
17724 }
17725
17726 /* Set FUSION_PRI according to fusion type and base register. */
17727 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
17728
17729 /* Calculate PRI. */
17730 tmp /= 2;
17731
17732 /* INSN with smaller offset goes first. */
17733 off_val = (int)(INTVAL (offset));
17734 if (off_val >= 0)
17735 tmp -= (off_val & 0xfffff);
17736 else
17737 tmp += ((- off_val) & 0xfffff);
17738
17739 *pri = tmp;
17740 return;
17741 }
17742
17743 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
17744 Adjust priority of sha1h instructions so they are scheduled before
17745 other SHA1 instructions. */
17746
17747 static int
17748 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
17749 {
17750 rtx x = PATTERN (insn);
17751
17752 if (GET_CODE (x) == SET)
17753 {
17754 x = SET_SRC (x);
17755
17756 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
17757 return priority + 10;
17758 }
17759
17760 return priority;
17761 }
17762
17763 /* Given OPERANDS of consecutive load/store, check if we can merge
17764 them into ldp/stp. LOAD is true if they are load instructions.
17765 MODE is the mode of memory operands. */
17766
17767 bool
17768 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
17769 machine_mode mode)
17770 {
17771 HOST_WIDE_INT offval_1, offval_2, msize;
17772 enum reg_class rclass_1, rclass_2;
17773 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
17774
17775 if (load)
17776 {
17777 mem_1 = operands[1];
17778 mem_2 = operands[3];
17779 reg_1 = operands[0];
17780 reg_2 = operands[2];
17781 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
17782 if (REGNO (reg_1) == REGNO (reg_2))
17783 return false;
17784 }
17785 else
17786 {
17787 mem_1 = operands[0];
17788 mem_2 = operands[2];
17789 reg_1 = operands[1];
17790 reg_2 = operands[3];
17791 }
17792
17793 /* The mems cannot be volatile. */
17794 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
17795 return false;
17796
17797 /* If we have SImode and slow unaligned ldp,
17798      check that the alignment is at least 8 bytes. */
17799 if (mode == SImode
17800 && (aarch64_tune_params.extra_tuning_flags
17801 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17802 && !optimize_size
17803 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17804 return false;
17805
17806 /* Check if the addresses are in the form of [base+offset]. */
17807 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17808 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17809 return false;
17810 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17811 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17812 return false;
17813
17814   /* Check if the bases are the same. */
17815 if (!rtx_equal_p (base_1, base_2))
17816 return false;
17817
17818 /* The operands must be of the same size. */
17819 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
17820 GET_MODE_SIZE (GET_MODE (mem_2))));
17821
17822 offval_1 = INTVAL (offset_1);
17823 offval_2 = INTVAL (offset_2);
17824 /* We should only be trying this for fixed-sized modes. There is no
17825 SVE LDP/STP instruction. */
17826 msize = GET_MODE_SIZE (mode).to_constant ();
17827 /* Check if the offsets are consecutive. */
17828 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
17829 return false;
17830
17831 /* Check if the addresses are clobbered by load. */
17832 if (load)
17833 {
17834 if (reg_mentioned_p (reg_1, mem_1))
17835 return false;
17836
17837 /* In increasing order, the last load can clobber the address. */
17838 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
17839 return false;
17840 }
17841
17842 /* One of the memory accesses must be a mempair operand.
17843 If it is not the first one, they need to be swapped by the
17844 peephole. */
17845 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
17846 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
17847 return false;
17848
17849 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17850 rclass_1 = FP_REGS;
17851 else
17852 rclass_1 = GENERAL_REGS;
17853
17854 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17855 rclass_2 = FP_REGS;
17856 else
17857 rclass_2 = GENERAL_REGS;
17858
17859   /* Check if the registers are of the same class. */
17860 if (rclass_1 != rclass_2)
17861 return false;
17862
17863 return true;
17864 }
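
/* For example (illustrative), the pair

     ldr x0, [x3, 8]
     ldr x1, [x3, 16]

   passes the checks above: distinct destination registers of the same
   class, the same non-volatile base X3, and consecutive offsets (8 and 16
   for an 8-byte mode), so the two loads can be merged into
   "ldp x0, x1, [x3, 8]".  */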
17865
17866 /* Given OPERANDS of consecutive load/store that can be merged,
17867 swap them if they are not in ascending order. */
17868 void
17869 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
17870 {
17871 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
17872 HOST_WIDE_INT offval_1, offval_2;
17873
17874 if (load)
17875 {
17876 mem_1 = operands[1];
17877 mem_2 = operands[3];
17878 }
17879 else
17880 {
17881 mem_1 = operands[0];
17882 mem_2 = operands[2];
17883 }
17884
17885 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17886 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17887
17888 offval_1 = INTVAL (offset_1);
17889 offval_2 = INTVAL (offset_2);
17890
17891 if (offval_1 > offval_2)
17892 {
17893 /* Irrespective of whether this is a load or a store,
17894 we do the same swap. */
17895 std::swap (operands[0], operands[2]);
17896 std::swap (operands[1], operands[3]);
17897 }
17898 }
17899
17900 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17901 comparison between the two. */
17902 int
17903 aarch64_host_wide_int_compare (const void *x, const void *y)
17904 {
17905 return wi::cmps (* ((const HOST_WIDE_INT *) x),
17906 * ((const HOST_WIDE_INT *) y));
17907 }
17908
17909 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
17910 other pointing to a REG rtx containing an offset, compare the offsets
17911 of the two pairs.
17912
17913 Return:
17914
17915 1 iff offset (X) > offset (Y)
17916 0 iff offset (X) == offset (Y)
17917 -1 iff offset (X) < offset (Y) */
17918 int
17919 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17920 {
17921 const rtx * operands_1 = (const rtx *) x;
17922 const rtx * operands_2 = (const rtx *) y;
17923 rtx mem_1, mem_2, base, offset_1, offset_2;
17924
17925 if (MEM_P (operands_1[0]))
17926 mem_1 = operands_1[0];
17927 else
17928 mem_1 = operands_1[1];
17929
17930 if (MEM_P (operands_2[0]))
17931 mem_2 = operands_2[0];
17932 else
17933 mem_2 = operands_2[1];
17934
17935 /* Extract the offsets. */
17936 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17937 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17938
17939 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17940
17941 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17942 }
17943
17944 /* Given OPERANDS of consecutive load/store, check if we can merge
17945 them into ldp/stp by adjusting the offset. LOAD is true if they
17946 are load instructions. MODE is the mode of memory operands.
17947
17948    Given the four consecutive stores below:
17949
17950 str w1, [xb, 0x100]
17951 str w1, [xb, 0x104]
17952 str w1, [xb, 0x108]
17953 str w1, [xb, 0x10c]
17954
17955 Though the offsets are out of the range supported by stp, we can
17956 still pair them after adjusting the offset, like:
17957
17958 add scratch, xb, 0x100
17959 stp w1, w1, [scratch]
17960 stp w1, w1, [scratch, 0x8]
17961
17962 The peephole patterns detecting this opportunity should guarantee
17963    the scratch register is available. */
17964
17965 bool
17966 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17967 scalar_mode mode)
17968 {
17969 const int num_insns = 4;
17970 enum reg_class rclass;
17971 HOST_WIDE_INT offvals[num_insns], msize;
17972 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
17973
17974 if (load)
17975 {
17976 for (int i = 0; i < num_insns; i++)
17977 {
17978 reg[i] = operands[2 * i];
17979 mem[i] = operands[2 * i + 1];
17980
17981 gcc_assert (REG_P (reg[i]));
17982 }
17983
17984 /* Do not attempt to merge the loads if the loads clobber each other. */
17985 for (int i = 0; i < 8; i += 2)
17986 for (int j = i + 2; j < 8; j += 2)
17987 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17988 return false;
17989 }
17990 else
17991 for (int i = 0; i < num_insns; i++)
17992 {
17993 mem[i] = operands[2 * i];
17994 reg[i] = operands[2 * i + 1];
17995 }
17996
17997   /* Skip if the memory operand is already valid for ldp/stp on its own. */
17998 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
17999 return false;
18000
18001 for (int i = 0; i < num_insns; i++)
18002 {
18003 /* The mems cannot be volatile. */
18004 if (MEM_VOLATILE_P (mem[i]))
18005 return false;
18006
18007 /* Check if the addresses are in the form of [base+offset]. */
18008 extract_base_offset_in_addr (mem[i], base + i, offset + i);
18009 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18010 return false;
18011 }
18012
18013   /* Check if the registers are of the same class. */
18014 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18015 ? FP_REGS : GENERAL_REGS;
18016
18017 for (int i = 1; i < num_insns; i++)
18018 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18019 {
18020 if (rclass != FP_REGS)
18021 return false;
18022 }
18023 else
18024 {
18025 if (rclass != GENERAL_REGS)
18026 return false;
18027 }
18028
18029 /* Only the last register in the order in which they occur
18030 may be clobbered by the load. */
18031 if (rclass == GENERAL_REGS && load)
18032 for (int i = 0; i < num_insns - 1; i++)
18033 if (reg_mentioned_p (reg[i], mem[i]))
18034 return false;
18035
18036   /* Check if the bases are the same. */
18037 for (int i = 0; i < num_insns - 1; i++)
18038 if (!rtx_equal_p (base[i], base[i + 1]))
18039 return false;
18040
18041 for (int i = 0; i < num_insns; i++)
18042 offvals[i] = INTVAL (offset[i]);
18043
18044 msize = GET_MODE_SIZE (mode);
18045
18046 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18047 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18048 aarch64_host_wide_int_compare);
18049
18050 if (!(offvals[1] == offvals[0] + msize
18051 && offvals[3] == offvals[2] + msize))
18052 return false;
18053
18054 /* Check that offsets are within range of each other. The ldp/stp
18055 instructions have 7 bit immediate offsets, so use 0x80. */
18056 if (offvals[2] - offvals[0] >= msize * 0x80)
18057 return false;
18058
18059 /* The offsets must be aligned with respect to each other. */
18060 if (offvals[0] % msize != offvals[2] % msize)
18061 return false;
18062
18063 /* If we have SImode and slow unaligned ldp,
18064      check that the alignment is at least 8 bytes. */
18065 if (mode == SImode
18066 && (aarch64_tune_params.extra_tuning_flags
18067 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18068 && !optimize_size
18069 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18070 return false;
18071
18072 return true;
18073 }
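
/* Continuing the example in the comment above (illustrative): for the four
   SImode stores at offsets 0x100, 0x104, 0x108 and 0x10c, the sorted
   offsets pair up as (0x100, 0x104) and (0x108, 0x10c), each pair differing
   by msize == 4; the overall span 0x10c - 0x100 == 12 is well below
   msize * 0x80 == 512; and 0x100 % 4 == 0x108 % 4, so the function returns
   true and the peephole can go on to call aarch64_gen_adjusted_ldpstp.  */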
18074
18075 /* Given OPERANDS of consecutive load/store, this function pairs them
18076 into LDP/STP after adjusting the offset. It depends on the fact
18077 that the operands can be sorted so the offsets are correct for STP.
18078 MODE is the mode of memory operands. CODE is the rtl operator
18079    which should be applied to all memory operands; it is SIGN_EXTEND,
18080 ZERO_EXTEND or UNKNOWN. */
18081
18082 bool
18083 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18084 scalar_mode mode, RTX_CODE code)
18085 {
18086 rtx base, offset_1, offset_3, t1, t2;
18087 rtx mem_1, mem_2, mem_3, mem_4;
18088 rtx temp_operands[8];
18089 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18090 stp_off_upper_limit, stp_off_lower_limit, msize;
18091
18092 /* We make changes on a copy as we may still bail out. */
18093 for (int i = 0; i < 8; i ++)
18094 temp_operands[i] = operands[i];
18095
18096 /* Sort the operands. */
18097 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18098
18099 if (load)
18100 {
18101 mem_1 = temp_operands[1];
18102 mem_2 = temp_operands[3];
18103 mem_3 = temp_operands[5];
18104 mem_4 = temp_operands[7];
18105 }
18106 else
18107 {
18108 mem_1 = temp_operands[0];
18109 mem_2 = temp_operands[2];
18110 mem_3 = temp_operands[4];
18111 mem_4 = temp_operands[6];
18112 gcc_assert (code == UNKNOWN);
18113 }
18114
18115 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18116 extract_base_offset_in_addr (mem_3, &base, &offset_3);
18117 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18118 && offset_3 != NULL_RTX);
18119
18120 /* Adjust offset so it can fit in LDP/STP instruction. */
18121 msize = GET_MODE_SIZE (mode);
18122 stp_off_upper_limit = msize * (0x40 - 1);
18123 stp_off_lower_limit = - msize * 0x40;
18124
18125 off_val_1 = INTVAL (offset_1);
18126 off_val_3 = INTVAL (offset_3);
18127
18128 /* The base offset is optimally half way between the two STP/LDP offsets. */
18129 if (msize <= 4)
18130 base_off = (off_val_1 + off_val_3) / 2;
18131 else
18132 /* However, due to issues with negative LDP/STP offset generation for
18133        larger modes (DF, DI and vector modes), we must not use negative
18134        addresses smaller than what 9 signed unadjusted bits can store.  This
18135 provides the most range in this case. */
18136 base_off = off_val_1;
18137
18138 /* Adjust the base so that it is aligned with the addresses but still
18139 optimal. */
18140 if (base_off % msize != off_val_1 % msize)
18141 /* Fix the offset, bearing in mind we want to make it bigger not
18142 smaller. */
18143 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18144 else if (msize <= 4)
18145 /* The negative range of LDP/STP is one larger than the positive range. */
18146 base_off += msize;
18147
18148 /* Check if base offset is too big or too small. We can attempt to resolve
18149 this issue by setting it to the maximum value and seeing if the offsets
18150 still fit. */
18151 if (base_off >= 0x1000)
18152 {
18153 base_off = 0x1000 - 1;
18154 /* We must still make sure that the base offset is aligned with respect
18155	 to the address.  But it may not be made any bigger. */
18156 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18157 }
18158
18159 /* Likewise for the case where the base is too small. */
18160 if (base_off <= -0x1000)
18161 {
18162 base_off = -0x1000 + 1;
18163 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18164 }
18165
18166 /* Offset of the first STP/LDP. */
18167 new_off_1 = off_val_1 - base_off;
18168
18169 /* Offset of the second STP/LDP. */
18170 new_off_3 = off_val_3 - base_off;
18171
18172 /* The offsets must be within the range of the LDP/STP instructions. */
18173 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18174 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18175 return false;
18176
18177 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18178 new_off_1), true);
18179 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18180 new_off_1 + msize), true);
18181 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18182 new_off_3), true);
18183 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18184 new_off_3 + msize), true);
18185
18186 if (!aarch64_mem_pair_operand (mem_1, mode)
18187 || !aarch64_mem_pair_operand (mem_3, mode))
18188 return false;
18189
18190 if (code == ZERO_EXTEND)
18191 {
18192 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18193 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18194 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18195 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18196 }
18197 else if (code == SIGN_EXTEND)
18198 {
18199 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18200 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18201 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18202 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18203 }
18204
18205 if (load)
18206 {
18207 operands[0] = temp_operands[0];
18208 operands[1] = mem_1;
18209 operands[2] = temp_operands[2];
18210 operands[3] = mem_2;
18211 operands[4] = temp_operands[4];
18212 operands[5] = mem_3;
18213 operands[6] = temp_operands[6];
18214 operands[7] = mem_4;
18215 }
18216 else
18217 {
18218 operands[0] = mem_1;
18219 operands[1] = temp_operands[1];
18220 operands[2] = mem_2;
18221 operands[3] = temp_operands[3];
18222 operands[4] = mem_3;
18223 operands[5] = temp_operands[5];
18224 operands[6] = mem_4;
18225 operands[7] = temp_operands[7];
18226 }
18227
18228 /* Emit adjusting instruction. */
18229 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18230 /* Emit ldp/stp instructions. */
18231 t1 = gen_rtx_SET (operands[0], operands[1]);
18232 t2 = gen_rtx_SET (operands[2], operands[3]);
18233 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18234 t1 = gen_rtx_SET (operands[4], operands[5]);
18235 t2 = gen_rtx_SET (operands[6], operands[7]);
18236 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18237 return true;
18238 }
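
/* A worked example of the adjustment above (illustrative): for the four
   SImode stores at offsets 0x100..0x10c, msize == 4, so the initial base
   offset is (0x100 + 0x108) / 2 == 0x104; it is already aligned with the
   addresses, so it is bumped by msize to 0x108 to exploit the larger
   negative range.  The new offsets become -8 and 0, both within
   [-0x100, 0xfc], and the emitted sequence is roughly:

     add scratch, xb, 0x108
     stp w1, w1, [scratch, -8]
     stp w1, w1, [scratch]  */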
18239
18240 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18241 it isn't worth branching around empty masked ops (including masked
18242 stores). */
18243
18244 static bool
18245 aarch64_empty_mask_is_expensive (unsigned)
18246 {
18247 return false;
18248 }
18249
18250 /* Return true if a pseudo register should be created and used to hold
18251    the GOT address for PIC code. */
18252
18253 bool
18254 aarch64_use_pseudo_pic_reg (void)
18255 {
18256 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18257 }
18258
18259 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18260
18261 static int
18262 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18263 {
18264 switch (XINT (x, 1))
18265 {
18266 case UNSPEC_GOTSMALLPIC:
18267 case UNSPEC_GOTSMALLPIC28K:
18268 case UNSPEC_GOTTINYPIC:
18269 return 0;
18270 default:
18271 break;
18272 }
18273
18274 return default_unspec_may_trap_p (x, flags);
18275 }
18276
18277
18278 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18279 return the log2 of that value. Otherwise return -1. */
18280
18281 int
18282 aarch64_fpconst_pow_of_2 (rtx x)
18283 {
18284 const REAL_VALUE_TYPE *r;
18285
18286 if (!CONST_DOUBLE_P (x))
18287 return -1;
18288
18289 r = CONST_DOUBLE_REAL_VALUE (x);
18290
18291 if (REAL_VALUE_NEGATIVE (*r)
18292 || REAL_VALUE_ISNAN (*r)
18293 || REAL_VALUE_ISINF (*r)
18294 || !real_isinteger (r, DFmode))
18295 return -1;
18296
18297 return exact_log2 (real_to_integer (r));
18298 }
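
/* For example (illustrative): 8.0 yields 3 and 1.0 yields 0, while 0.75
   (not an integer), -4.0 (negative) and 3.0 (an integer but not a power
   of 2) all yield -1.  */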
18299
18300 /* If X is a vector of equal CONST_DOUBLE values and that value is
18301 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18302
18303 int
18304 aarch64_vec_fpconst_pow_of_2 (rtx x)
18305 {
18306 int nelts;
18307 if (GET_CODE (x) != CONST_VECTOR
18308 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18309 return -1;
18310
18311 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18312 return -1;
18313
18314 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18315 if (firstval <= 0)
18316 return -1;
18317
18318 for (int i = 1; i < nelts; i++)
18319 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18320 return -1;
18321
18322 return firstval;
18323 }
18324
18325 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18326 to float.
18327
18328 __fp16 always promotes through this hook.
18329 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18330 through the generic excess precision logic rather than here. */
18331
18332 static tree
18333 aarch64_promoted_type (const_tree t)
18334 {
18335 if (SCALAR_FLOAT_TYPE_P (t)
18336 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18337 return float_type_node;
18338
18339 return NULL_TREE;
18340 }
18341
18342 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18343
18344 static bool
18345 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18346 optimization_type opt_type)
18347 {
18348 switch (op)
18349 {
18350 case rsqrt_optab:
18351 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18352
18353 default:
18354 return true;
18355 }
18356 }
18357
18358 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18359
18360 static unsigned int
18361 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18362 int *offset)
18363 {
18364 /* Polynomial invariant 1 == (VG / 2) - 1. */
18365 gcc_assert (i == 1);
18366 *factor = 2;
18367 *offset = 1;
18368 return AARCH64_DWARF_VG;
18369 }
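
/* For example (illustrative): with a 256-bit SVE vector length the DWARF
   pseudo register VG (the number of 64-bit granules) is 4, so indeterminate
   1 evaluates to 4 / 2 - 1 == 1 and a poly_int size such as 16 + 16x
   resolves to 32 bytes.  */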
18370
18371 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
18372 if MODE is HFmode, and punt to the generic implementation otherwise. */
18373
18374 static bool
18375 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18376 {
18377 return (mode == HFmode
18378 ? true
18379 : default_libgcc_floating_mode_supported_p (mode));
18380 }
18381
18382 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18383 if MODE is HFmode, and punt to the generic implementation otherwise. */
18384
18385 static bool
18386 aarch64_scalar_mode_supported_p (scalar_mode mode)
18387 {
18388 return (mode == HFmode
18389 ? true
18390 : default_scalar_mode_supported_p (mode));
18391 }
18392
18393 /* Set the value of FLT_EVAL_METHOD.
18394 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18395
18396 0: evaluate all operations and constants, whose semantic type has at
18397 most the range and precision of type float, to the range and
18398 precision of float; evaluate all other operations and constants to
18399 the range and precision of the semantic type;
18400
18401    N, where _FloatN is a supported interchange floating type:
18402 evaluate all operations and constants, whose semantic type has at
18403 most the range and precision of _FloatN type, to the range and
18404 precision of the _FloatN type; evaluate all other operations and
18405 constants to the range and precision of the semantic type;
18406
18407 If we have the ARMv8.2-A extensions then we support _Float16 in native
18408 precision, so we should set this to 16. Otherwise, we support the type,
18409 but want to evaluate expressions in float precision, so set this to
18410 0. */
18411
18412 static enum flt_eval_method
18413 aarch64_excess_precision (enum excess_precision_type type)
18414 {
18415 switch (type)
18416 {
18417 case EXCESS_PRECISION_TYPE_FAST:
18418 case EXCESS_PRECISION_TYPE_STANDARD:
18419 /* We can calculate either in 16-bit range and precision or
18420 32-bit range and precision. Make that decision based on whether
18421 we have native support for the ARMv8.2-A 16-bit floating-point
18422 instructions or not. */
18423 return (TARGET_FP_F16INST
18424 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18425 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18426 case EXCESS_PRECISION_TYPE_IMPLICIT:
18427 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18428 default:
18429 gcc_unreachable ();
18430 }
18431 return FLT_EVAL_METHOD_UNPREDICTABLE;
18432 }
18433
18434 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18435 scheduled for speculative execution. Reject the long-running division
18436 and square-root instructions. */
18437
18438 static bool
18439 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18440 {
18441 switch (get_attr_type (insn))
18442 {
18443 case TYPE_SDIV:
18444 case TYPE_UDIV:
18445 case TYPE_FDIVS:
18446 case TYPE_FDIVD:
18447 case TYPE_FSQRTS:
18448 case TYPE_FSQRTD:
18449 case TYPE_NEON_FP_SQRT_S:
18450 case TYPE_NEON_FP_SQRT_D:
18451 case TYPE_NEON_FP_SQRT_S_Q:
18452 case TYPE_NEON_FP_SQRT_D_Q:
18453 case TYPE_NEON_FP_DIV_S:
18454 case TYPE_NEON_FP_DIV_D:
18455 case TYPE_NEON_FP_DIV_S_Q:
18456 case TYPE_NEON_FP_DIV_D_Q:
18457 return false;
18458 default:
18459 return true;
18460 }
18461 }
18462
18463 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18464
18465 static int
18466 aarch64_compute_pressure_classes (reg_class *classes)
18467 {
18468 int i = 0;
18469 classes[i++] = GENERAL_REGS;
18470 classes[i++] = FP_REGS;
18471 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18472 registers need to go in PR_LO_REGS at some point during their
18473 lifetime. Splitting it into two halves has the effect of making
18474 all predicates count against PR_LO_REGS, so that we try whenever
18475 possible to restrict the number of live predicates to 8. This
18476 greatly reduces the amount of spilling in certain loops. */
18477 classes[i++] = PR_LO_REGS;
18478 classes[i++] = PR_HI_REGS;
18479 return i;
18480 }
18481
18482 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18483
18484 static bool
18485 aarch64_can_change_mode_class (machine_mode from,
18486 machine_mode to, reg_class_t)
18487 {
18488 if (BYTES_BIG_ENDIAN)
18489 {
18490 bool from_sve_p = aarch64_sve_data_mode_p (from);
18491 bool to_sve_p = aarch64_sve_data_mode_p (to);
18492
18493 /* Don't allow changes between SVE data modes and non-SVE modes.
18494 See the comment at the head of aarch64-sve.md for details. */
18495 if (from_sve_p != to_sve_p)
18496 return false;
18497
18498 /* Don't allow changes in element size: lane 0 of the new vector
18499 would not then be lane 0 of the old vector. See the comment
18500 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18501 description.
18502
18503 In the worst case, this forces a register to be spilled in
18504 one mode and reloaded in the other, which handles the
18505 endianness correctly. */
18506 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18507 return false;
18508 }
18509 return true;
18510 }
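
/* For instance (illustrative), on big-endian targets a mode change from
   VNx4SI to VNx8HI is rejected by the check above because the element size
   changes from 4 bytes to 2, whereas VNx4SI to VNx4SF is allowed since both
   have 4-byte elements.  */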
18511
18512 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
18513
18514 static void
18515 aarch64_select_early_remat_modes (sbitmap modes)
18516 {
18517 /* SVE values are not normally live across a call, so it should be
18518 worth doing early rematerialization even in VL-specific mode. */
18519 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18520 {
18521 machine_mode mode = (machine_mode) i;
18522 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18523 if (vec_flags & VEC_ANY_SVE)
18524 bitmap_set_bit (modes, i);
18525 }
18526 }
18527
18528 /* Override the default target speculation_safe_value. */
18529 static rtx
18530 aarch64_speculation_safe_value (machine_mode mode,
18531 rtx result, rtx val, rtx failval)
18532 {
18533 /* Maybe we should warn if falling back to hard barriers. They are
18534      likely to be noticeably more expensive than the alternative below. */
18535 if (!aarch64_track_speculation)
18536 return default_speculation_safe_value (mode, result, val, failval);
18537
18538 if (!REG_P (val))
18539 val = copy_to_mode_reg (mode, val);
18540
18541 if (!aarch64_reg_or_zero (failval, mode))
18542 failval = copy_to_mode_reg (mode, failval);
18543
18544 emit_insn (gen_despeculate_copy (mode, result, val, failval));
18545 return result;
18546 }
18547
18548 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18549 Look into the tuning structure for an estimate.
18550 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18551 Advanced SIMD 128 bits. */
18552
18553 static HOST_WIDE_INT
18554 aarch64_estimated_poly_value (poly_int64 val)
18555 {
18556 enum aarch64_sve_vector_bits_enum width_source
18557 = aarch64_tune_params.sve_width;
18558
18559 /* If we still don't have an estimate, use the default. */
18560 if (width_source == SVE_SCALABLE)
18561 return default_estimated_poly_value (val);
18562
18563 HOST_WIDE_INT over_128 = width_source - 128;
18564 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18565 }
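
/* For example (illustrative): if the tuning structure gives sve_width as
   256, then over_128 is 128 and a poly_int64 value of 16 + 16x is estimated
   as 16 + 16 * 128 / 128 == 32.  With SVE_SCALABLE the generic default
   estimate is used instead.  */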
18566
18567
18568 /* Return true for types that could be supported as SIMD return or
18569 argument types. */
18570
18571 static bool
18572 supported_simd_type (tree t)
18573 {
18574 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
18575 {
18576 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
18577 return s == 1 || s == 2 || s == 4 || s == 8;
18578 }
18579 return false;
18580 }
18581
18582 /* Return true for types that currently are supported as SIMD return
18583 or argument types. */
18584
18585 static bool
18586 currently_supported_simd_type (tree t, tree b)
18587 {
18588 if (COMPLEX_FLOAT_TYPE_P (t))
18589 return false;
18590
18591 if (TYPE_SIZE (t) != TYPE_SIZE (b))
18592 return false;
18593
18594 return supported_simd_type (t);
18595 }
18596
18597 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
18598
18599 static int
18600 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
18601 struct cgraph_simd_clone *clonei,
18602 tree base_type, int num)
18603 {
18604 tree t, ret_type, arg_type;
18605 unsigned int elt_bits, vec_bits, count;
18606
18607 if (!TARGET_SIMD)
18608 return 0;
18609
18610 if (clonei->simdlen
18611 && (clonei->simdlen < 2
18612 || clonei->simdlen > 1024
18613 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
18614 {
18615 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18616 "unsupported simdlen %d", clonei->simdlen);
18617 return 0;
18618 }
18619
18620 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
18621 if (TREE_CODE (ret_type) != VOID_TYPE
18622 && !currently_supported_simd_type (ret_type, base_type))
18623 {
18624 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
18625 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18626 "GCC does not currently support mixed size types "
18627 "for %<simd%> functions");
18628 else if (supported_simd_type (ret_type))
18629 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18630 "GCC does not currently support return type %qT "
18631 "for %<simd%> functions", ret_type);
18632 else
18633 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18634 "unsupported return type %qT for %<simd%> functions",
18635 ret_type);
18636 return 0;
18637 }
18638
18639 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
18640 {
18641 arg_type = TREE_TYPE (t);
18642
18643 if (!currently_supported_simd_type (arg_type, base_type))
18644 {
18645 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
18646 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18647 "GCC does not currently support mixed size types "
18648 "for %<simd%> functions");
18649 else
18650 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18651 "GCC does not currently support argument type %qT "
18652 "for %<simd%> functions", arg_type);
18653 return 0;
18654 }
18655 }
18656
18657 clonei->vecsize_mangle = 'n';
18658 clonei->mask_mode = VOIDmode;
18659 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
18660 if (clonei->simdlen == 0)
18661 {
18662 count = 2;
18663 vec_bits = (num == 0 ? 64 : 128);
18664 clonei->simdlen = vec_bits / elt_bits;
18665 }
18666 else
18667 {
18668 count = 1;
18669 vec_bits = clonei->simdlen * elt_bits;
18670 if (vec_bits != 64 && vec_bits != 128)
18671 {
18672 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18673 "GCC does not currently support simdlen %d for type %qT",
18674 clonei->simdlen, base_type);
18675 return 0;
18676 }
18677 }
18678 clonei->vecsize_int = vec_bits;
18679 clonei->vecsize_float = vec_bits;
18680 return count;
18681 }
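
/* An illustrative example of the computation above: for a SIMD clone whose
   base type is a 32-bit float and which has no explicit simdlen, elt_bits
   is 32 and two 'n'-mangled clones are produced, one using 64-bit vectors
   (simdlen 2) and one using 128-bit vectors (simdlen 4).  An explicit
   simdlen that does not give a 64-bit or 128-bit vector size is warned
   about and rejected.  */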
18682
18683 /* Implement TARGET_SIMD_CLONE_ADJUST. */
18684
18685 static void
18686 aarch64_simd_clone_adjust (struct cgraph_node *node)
18687 {
18688 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
18689 use the correct ABI. */
18690
18691 tree t = TREE_TYPE (node->decl);
18692 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
18693 TYPE_ATTRIBUTES (t));
18694 }
18695
18696 /* Implement TARGET_SIMD_CLONE_USABLE. */
18697
18698 static int
18699 aarch64_simd_clone_usable (struct cgraph_node *node)
18700 {
18701 switch (node->simdclone->vecsize_mangle)
18702 {
18703 case 'n':
18704 if (!TARGET_SIMD)
18705 return -1;
18706 return 0;
18707 default:
18708 gcc_unreachable ();
18709 }
18710 }
18711
18712 /* Implement TARGET_STACK_PROTECT_GUARD.  In the case of a
18713    global variable based guard, use the default; otherwise
18714    return a null tree. */
18715 static tree
18716 aarch64_stack_protect_guard (void)
18717 {
18718 if (aarch64_stack_protector_guard == SSP_GLOBAL)
18719 return default_stack_protect_guard ();
18720
18721 return NULL_TREE;
18722 }
18723
18724
18725 /* Target-specific selftests. */
18726
18727 #if CHECKING_P
18728
18729 namespace selftest {
18730
18731 /* Selftest for the RTL loader.
18732 Verify that the RTL loader copes with a dump from
18733 print_rtx_function. This is essentially just a test that class
18734 function_reader can handle a real dump, but it also verifies
18735 that lookup_reg_by_dump_name correctly handles hard regs.
18736 The presence of hard reg names in the dump means that the test is
18737 target-specific, hence it is in this file. */
18738
18739 static void
18740 aarch64_test_loading_full_dump ()
18741 {
18742 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
18743
18744 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
18745
18746 rtx_insn *insn_1 = get_insn_by_uid (1);
18747 ASSERT_EQ (NOTE, GET_CODE (insn_1));
18748
18749 rtx_insn *insn_15 = get_insn_by_uid (15);
18750 ASSERT_EQ (INSN, GET_CODE (insn_15));
18751 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
18752
18753 /* Verify crtl->return_rtx. */
18754 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
18755 ASSERT_EQ (0, REGNO (crtl->return_rtx));
18756 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
18757 }
18758
18759 /* Run all target-specific selftests. */
18760
18761 static void
18762 aarch64_run_selftests (void)
18763 {
18764 aarch64_test_loading_full_dump ();
18765 }
18766
18767 } // namespace selftest
18768
18769 #endif /* #if CHECKING_P */
18770
18771 #undef TARGET_STACK_PROTECT_GUARD
18772 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
18773
18774 #undef TARGET_ADDRESS_COST
18775 #define TARGET_ADDRESS_COST aarch64_address_cost
18776
18777 /* This hook determines whether unnamed bitfields affect the alignment
18778 of the containing structure. The hook returns true if the structure
18779 should inherit the alignment requirements of an unnamed bitfield's
18780 type. */
18781 #undef TARGET_ALIGN_ANON_BITFIELD
18782 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
18783
18784 #undef TARGET_ASM_ALIGNED_DI_OP
18785 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
18786
18787 #undef TARGET_ASM_ALIGNED_HI_OP
18788 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
18789
18790 #undef TARGET_ASM_ALIGNED_SI_OP
18791 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
18792
18793 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
18794 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
18795 hook_bool_const_tree_hwi_hwi_const_tree_true
18796
18797 #undef TARGET_ASM_FILE_START
18798 #define TARGET_ASM_FILE_START aarch64_start_file
18799
18800 #undef TARGET_ASM_OUTPUT_MI_THUNK
18801 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
18802
18803 #undef TARGET_ASM_SELECT_RTX_SECTION
18804 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
18805
18806 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
18807 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
18808
18809 #undef TARGET_BUILD_BUILTIN_VA_LIST
18810 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
18811
18812 #undef TARGET_CALLEE_COPIES
18813 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
18814
18815 #undef TARGET_CAN_ELIMINATE
18816 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
18817
18818 #undef TARGET_CAN_INLINE_P
18819 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
18820
18821 #undef TARGET_CANNOT_FORCE_CONST_MEM
18822 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
18823
18824 #undef TARGET_CASE_VALUES_THRESHOLD
18825 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
18826
18827 #undef TARGET_CONDITIONAL_REGISTER_USAGE
18828 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
18829
18830 /* Only the least significant bit is used for initialization guard
18831 variables. */
18832 #undef TARGET_CXX_GUARD_MASK_BIT
18833 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
18834
18835 #undef TARGET_C_MODE_FOR_SUFFIX
18836 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
18837
18838 #ifdef TARGET_BIG_ENDIAN_DEFAULT
18839 #undef TARGET_DEFAULT_TARGET_FLAGS
18840 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
18841 #endif
18842
18843 #undef TARGET_CLASS_MAX_NREGS
18844 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
18845
18846 #undef TARGET_BUILTIN_DECL
18847 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
18848
18849 #undef TARGET_BUILTIN_RECIPROCAL
18850 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
18851
18852 #undef TARGET_C_EXCESS_PRECISION
18853 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
18854
18855 #undef TARGET_EXPAND_BUILTIN
18856 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
18857
18858 #undef TARGET_EXPAND_BUILTIN_VA_START
18859 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
18860
18861 #undef TARGET_FOLD_BUILTIN
18862 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
18863
18864 #undef TARGET_FUNCTION_ARG
18865 #define TARGET_FUNCTION_ARG aarch64_function_arg
18866
18867 #undef TARGET_FUNCTION_ARG_ADVANCE
18868 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
18869
18870 #undef TARGET_FUNCTION_ARG_BOUNDARY
18871 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
18872
18873 #undef TARGET_FUNCTION_ARG_PADDING
18874 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
18875
18876 #undef TARGET_GET_RAW_RESULT_MODE
18877 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
18878 #undef TARGET_GET_RAW_ARG_MODE
18879 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
18880
18881 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
18882 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
18883
18884 #undef TARGET_FUNCTION_VALUE
18885 #define TARGET_FUNCTION_VALUE aarch64_function_value
18886
18887 #undef TARGET_FUNCTION_VALUE_REGNO_P
18888 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
18889
18890 #undef TARGET_GIMPLE_FOLD_BUILTIN
18891 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
18892
18893 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
18894 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
18895
18896 #undef TARGET_INIT_BUILTINS
18897 #define TARGET_INIT_BUILTINS aarch64_init_builtins
18898
18899 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
18900 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
18901 aarch64_ira_change_pseudo_allocno_class
18902
18903 #undef TARGET_LEGITIMATE_ADDRESS_P
18904 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
18905
18906 #undef TARGET_LEGITIMATE_CONSTANT_P
18907 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
18908
18909 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
18910 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
18911 aarch64_legitimize_address_displacement
18912
18913 #undef TARGET_LIBGCC_CMP_RETURN_MODE
18914 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
18915
18916 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
18917 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
18918 aarch64_libgcc_floating_mode_supported_p
18919
18920 #undef TARGET_MANGLE_TYPE
18921 #define TARGET_MANGLE_TYPE aarch64_mangle_type
18922
18923 #undef TARGET_MEMORY_MOVE_COST
18924 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
18925
18926 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
18927 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
18928
18929 #undef TARGET_MUST_PASS_IN_STACK
18930 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
18931
18932 /* This target hook should return true if accesses to volatile bitfields
18933 should use the narrowest mode possible. It should return false if these
18934 accesses should use the bitfield container type. */
18935 #undef TARGET_NARROW_VOLATILE_BITFIELD
18936 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
18937
18938 #undef TARGET_OPTION_OVERRIDE
18939 #define TARGET_OPTION_OVERRIDE aarch64_override_options
18940
18941 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
18942 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
18943 aarch64_override_options_after_change
18944
18945 #undef TARGET_OPTION_SAVE
18946 #define TARGET_OPTION_SAVE aarch64_option_save
18947
18948 #undef TARGET_OPTION_RESTORE
18949 #define TARGET_OPTION_RESTORE aarch64_option_restore
18950
18951 #undef TARGET_OPTION_PRINT
18952 #define TARGET_OPTION_PRINT aarch64_option_print
18953
18954 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
18955 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
18956
18957 #undef TARGET_SET_CURRENT_FUNCTION
18958 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
18959
18960 #undef TARGET_PASS_BY_REFERENCE
18961 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
18962
18963 #undef TARGET_PREFERRED_RELOAD_CLASS
18964 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
18965
18966 #undef TARGET_SCHED_REASSOCIATION_WIDTH
18967 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
18968
18969 #undef TARGET_PROMOTED_TYPE
18970 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
18971
18972 #undef TARGET_SECONDARY_RELOAD
18973 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
18974
18975 #undef TARGET_SHIFT_TRUNCATION_MASK
18976 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
18977
18978 #undef TARGET_SETUP_INCOMING_VARARGS
18979 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
18980
18981 #undef TARGET_STRUCT_VALUE_RTX
18982 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
18983
18984 #undef TARGET_REGISTER_MOVE_COST
18985 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
18986
18987 #undef TARGET_RETURN_IN_MEMORY
18988 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
18989
18990 #undef TARGET_RETURN_IN_MSB
18991 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
18992
18993 #undef TARGET_RTX_COSTS
18994 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
18995
18996 #undef TARGET_SCALAR_MODE_SUPPORTED_P
18997 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
18998
18999 #undef TARGET_SCHED_ISSUE_RATE
19000 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
19001
19002 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
19003 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
19004 aarch64_sched_first_cycle_multipass_dfa_lookahead
19005
19006 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
19007 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
19008 aarch64_first_cycle_multipass_dfa_lookahead_guard
19009
19010 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
19011 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
19012 aarch64_get_separate_components
19013
19014 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
19015 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
19016 aarch64_components_for_bb
19017
19018 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
19019 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
19020 aarch64_disqualify_components
19021
19022 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
19023 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
19024 aarch64_emit_prologue_components
19025
19026 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
19027 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
19028 aarch64_emit_epilogue_components
19029
19030 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
19031 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
19032 aarch64_set_handled_components
19033
19034 #undef TARGET_TRAMPOLINE_INIT
19035 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
19036
19037 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
19038 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
19039
19040 #undef TARGET_VECTOR_MODE_SUPPORTED_P
19041 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
19042
19043 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
19044 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
19045 aarch64_builtin_support_vector_misalignment
19046
19047 #undef TARGET_ARRAY_MODE
19048 #define TARGET_ARRAY_MODE aarch64_array_mode
19049
19050 #undef TARGET_ARRAY_MODE_SUPPORTED_P
19051 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
19052
19053 #undef TARGET_VECTORIZE_ADD_STMT_COST
19054 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
19055
19056 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
19057 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
19058 aarch64_builtin_vectorization_cost
19059
19060 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
19061 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
19062
19063 #undef TARGET_VECTORIZE_BUILTINS
19064 #define TARGET_VECTORIZE_BUILTINS
19065
19066 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
19067 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
19068 aarch64_builtin_vectorized_function
19069
19070 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
19071 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
19072 aarch64_autovectorize_vector_sizes
19073
19074 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
19075 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
19076 aarch64_atomic_assign_expand_fenv
19077
19078 /* Section anchor support. */
19079
19080 #undef TARGET_MIN_ANCHOR_OFFSET
19081 #define TARGET_MIN_ANCHOR_OFFSET -256
19082
19083 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
19084 byte offset; we can do much more for larger data types, but have no way
19085 to determine the size of the access. We assume accesses are aligned. */
19086 #undef TARGET_MAX_ANCHOR_OFFSET
19087 #define TARGET_MAX_ANCHOR_OFFSET 4095
19088
19089 #undef TARGET_VECTOR_ALIGNMENT
19090 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
19091
19092 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
19093 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
19094 aarch64_vectorize_preferred_vector_alignment
19095 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
19096 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
19097 aarch64_simd_vector_alignment_reachable
19098
19099 /* vec_perm support. */
19100
19101 #undef TARGET_VECTORIZE_VEC_PERM_CONST
19102 #define TARGET_VECTORIZE_VEC_PERM_CONST \
19103 aarch64_vectorize_vec_perm_const
19104
19105 #undef TARGET_VECTORIZE_GET_MASK_MODE
19106 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
19107 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
19108 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
19109 aarch64_empty_mask_is_expensive
19110 #undef TARGET_PREFERRED_ELSE_VALUE
19111 #define TARGET_PREFERRED_ELSE_VALUE \
19112 aarch64_preferred_else_value
19113
19114 #undef TARGET_INIT_LIBFUNCS
19115 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
19116
19117 #undef TARGET_FIXED_CONDITION_CODE_REGS
19118 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
19119
19120 #undef TARGET_FLAGS_REGNUM
19121 #define TARGET_FLAGS_REGNUM CC_REGNUM
19122
19123 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
19124 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
19125
19126 #undef TARGET_ASAN_SHADOW_OFFSET
19127 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
19128
19129 #undef TARGET_LEGITIMIZE_ADDRESS
19130 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
19131
19132 #undef TARGET_SCHED_CAN_SPECULATE_INSN
19133 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
19134
19135 #undef TARGET_CAN_USE_DOLOOP_P
19136 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
19137
19138 #undef TARGET_SCHED_ADJUST_PRIORITY
19139 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
19140
19141 #undef TARGET_SCHED_MACRO_FUSION_P
19142 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
19143
19144 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
19145 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
19146
19147 #undef TARGET_SCHED_FUSION_PRIORITY
19148 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
19149
19150 #undef TARGET_UNSPEC_MAY_TRAP_P
19151 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
19152
19153 #undef TARGET_USE_PSEUDO_PIC_REG
19154 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
19155
19156 #undef TARGET_PRINT_OPERAND
19157 #define TARGET_PRINT_OPERAND aarch64_print_operand
19158
19159 #undef TARGET_PRINT_OPERAND_ADDRESS
19160 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
19161
19162 #undef TARGET_OPTAB_SUPPORTED_P
19163 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
19164
19165 #undef TARGET_OMIT_STRUCT_RETURN_REG
19166 #define TARGET_OMIT_STRUCT_RETURN_REG true
19167
19168 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
19169 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
19170 aarch64_dwarf_poly_indeterminate_value
19171
19172 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
19173 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
19174 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
19175
19176 #undef TARGET_HARD_REGNO_NREGS
19177 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
19178 #undef TARGET_HARD_REGNO_MODE_OK
19179 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
19180
19181 #undef TARGET_MODES_TIEABLE_P
19182 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
19183
19184 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
19185 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
19186 aarch64_hard_regno_call_part_clobbered
19187
19188 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
19189 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
19190 aarch64_remove_extra_call_preserved_regs
19191
19192 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
19193 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
19194 aarch64_return_call_with_max_clobbers
19195
19196 #undef TARGET_CONSTANT_ALIGNMENT
19197 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
19198
19199 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
19200 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
19201 aarch64_stack_clash_protection_alloca_probe_range
19202
19203 #undef TARGET_COMPUTE_PRESSURE_CLASSES
19204 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
19205
19206 #undef TARGET_CAN_CHANGE_MODE_CLASS
19207 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
19208
19209 #undef TARGET_SELECT_EARLY_REMAT_MODES
19210 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
19211
19212 #undef TARGET_SPECULATION_SAFE_VALUE
19213 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
19214
19215 #undef TARGET_ESTIMATED_POLY_VALUE
19216 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
19217
19218 #undef TARGET_ATTRIBUTE_TABLE
19219 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
19220
19221 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
19222 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
19223 aarch64_simd_clone_compute_vecsize_and_simdlen
19224
19225 #undef TARGET_SIMD_CLONE_ADJUST
19226 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
19227
19228 #undef TARGET_SIMD_CLONE_USABLE
19229 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
19230
19231 #if CHECKING_P
19232 #undef TARGET_RUN_TARGET_SELFTESTS
19233 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
19234 #endif /* #if CHECKING_P */
19235
19236 struct gcc_target targetm = TARGET_INITIALIZER;
19237
19238 #include "gt-aarch64.h"