1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN, INDEX };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95
96 /* The mode of the elements. */
97 scalar_mode elt_mode;
98
99 /* The instruction to use to move the immediate into a vector. */
100 insn_type insn;
101
102 union
103 {
104 /* For MOV and MVN. */
105 struct
106 {
107 /* The value of each element. */
108 rtx value;
109
110 /* The kind of shift modifier to use, and the number of bits to shift.
111 This is (LSL, 0) if no shift is needed. */
112 modifier_type modifier;
113 unsigned int shift;
114 } mov;
115
116 /* For INDEX. */
117 struct
118 {
119 /* The value of the first element and the step to be added for each
120 subsequent element. */
121 rtx base, step;
122 } index;
123 } u;
124 };
125
126 /* Construct a floating-point immediate in which each element has mode
127 ELT_MODE_IN and value VALUE_IN. */
128 inline simd_immediate_info
129 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
130 : elt_mode (elt_mode_in), insn (MOV)
131 {
132 u.mov.value = value_in;
133 u.mov.modifier = LSL;
134 u.mov.shift = 0;
135 }
136
137 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
138 and value VALUE_IN. The other parameters are as for the structure
139 fields. */
140 inline simd_immediate_info
141 ::simd_immediate_info (scalar_int_mode elt_mode_in,
142 unsigned HOST_WIDE_INT value_in,
143 insn_type insn_in, modifier_type modifier_in,
144 unsigned int shift_in)
145 : elt_mode (elt_mode_in), insn (insn_in)
146 {
147 u.mov.value = gen_int_mode (value_in, elt_mode_in);
148 u.mov.modifier = modifier_in;
149 u.mov.shift = shift_in;
150 }
151
152 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
153 and where element I is equal to BASE_IN + I * STEP_IN. */
154 inline simd_immediate_info
155 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
156 : elt_mode (elt_mode_in), insn (INDEX)
157 {
158 u.index.base = base_in;
159 u.index.step = step_in;
160 }
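/* As a rough illustration (the values here are made up for exposition and
   do not appear in this file), a caller describing the linear series
   { 0, 1, 2, ... } with 32-bit elements could use the INDEX form:

     simd_immediate_info info (SImode, const0_rtx, const1_rtx);

   while a splat of 0x55 into 16-bit elements, shifted left by 8 bits,
   would use the integer form:

     simd_immediate_info info (HImode, 0x55, simd_immediate_info::MOV,
			       simd_immediate_info::LSL, 8);  */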
161
162 /* The current code model. */
163 enum aarch64_code_model aarch64_cmodel;
164
165 /* The number of 64-bit elements in an SVE vector. */
166 poly_uint16 aarch64_sve_vg;
167
168 #ifdef HAVE_AS_TLS
169 #undef TARGET_HAVE_TLS
170 #define TARGET_HAVE_TLS 1
171 #endif
172
173 static bool aarch64_composite_type_p (const_tree, machine_mode);
174 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
175 const_tree,
176 machine_mode *, int *,
177 bool *);
178 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
179 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
180 static void aarch64_override_options_after_change (void);
181 static bool aarch64_vector_mode_supported_p (machine_mode);
182 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
183 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
184 const_tree type,
185 int misalignment,
186 bool is_packed);
187 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
188 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
189 aarch64_addr_query_type);
190 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
191
192 /* Major revision number of the ARM Architecture implemented by the target. */
193 unsigned aarch64_architecture_version;
194
195 /* The processor for which instructions should be scheduled. */
196 enum aarch64_processor aarch64_tune = cortexa53;
197
198 /* Mask to specify which instruction scheduling options should be used. */
199 uint64_t aarch64_tune_flags = 0;
200
201 /* Global flag for PC relative loads. */
202 bool aarch64_pcrelative_literal_loads;
203
204 /* Global flag for whether frame pointer is enabled. */
205 bool aarch64_use_frame_pointer;
206
207 #define BRANCH_PROTECT_STR_MAX 255
208 char *accepted_branch_protection_string = NULL;
209
210 static enum aarch64_parse_opt_result
211 aarch64_parse_branch_protection (const char*, char**);
212
213 /* Support for command line parsing of boolean flags in the tuning
214 structures. */
215 struct aarch64_flag_desc
216 {
217 const char* name;
218 unsigned int flag;
219 };
220
221 #define AARCH64_FUSION_PAIR(name, internal_name) \
222 { name, AARCH64_FUSE_##internal_name },
223 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
224 {
225 { "none", AARCH64_FUSE_NOTHING },
226 #include "aarch64-fusion-pairs.def"
227 { "all", AARCH64_FUSE_ALL },
228 { NULL, AARCH64_FUSE_NOTHING }
229 };
230
231 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
232 { name, AARCH64_EXTRA_TUNE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
234 {
235 { "none", AARCH64_EXTRA_TUNE_NONE },
236 #include "aarch64-tuning-flags.def"
237 { "all", AARCH64_EXTRA_TUNE_ALL },
238 { NULL, AARCH64_EXTRA_TUNE_NONE }
239 };
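/* For reference, each entry in the two .def files included above is an
   X-macro invocation; an entry roughly of the form
   AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK) expands under the macro
   above to the initializer

     { "mov+movk", AARCH64_FUSE_MOV_MOVK },

   so both tables simply map the user-visible option names onto their
   bitmask values, bracketed by the "none" and "all" entries.  */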
240
241 /* Tuning parameters. */
242
243 static const struct cpu_addrcost_table generic_addrcost_table =
244 {
245 {
246 1, /* hi */
247 0, /* si */
248 0, /* di */
249 1, /* ti */
250 },
251 0, /* pre_modify */
252 0, /* post_modify */
253 0, /* register_offset */
254 0, /* register_sextend */
255 0, /* register_zextend */
256 0 /* imm_offset */
257 };
258
259 static const struct cpu_addrcost_table exynosm1_addrcost_table =
260 {
261 {
262 0, /* hi */
263 0, /* si */
264 0, /* di */
265 2, /* ti */
266 },
267 0, /* pre_modify */
268 0, /* post_modify */
269 1, /* register_offset */
270 1, /* register_sextend */
271 2, /* register_zextend */
272 0, /* imm_offset */
273 };
274
275 static const struct cpu_addrcost_table xgene1_addrcost_table =
276 {
277 {
278 1, /* hi */
279 0, /* si */
280 0, /* di */
281 1, /* ti */
282 },
283 1, /* pre_modify */
284 1, /* post_modify */
285 0, /* register_offset */
286 1, /* register_sextend */
287 1, /* register_zextend */
288 0, /* imm_offset */
289 };
290
291 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
292 {
293 {
294 1, /* hi */
295 1, /* si */
296 1, /* di */
297 2, /* ti */
298 },
299 0, /* pre_modify */
300 0, /* post_modify */
301 2, /* register_offset */
302 3, /* register_sextend */
303 3, /* register_zextend */
304 0, /* imm_offset */
305 };
306
307 static const struct cpu_addrcost_table tsv110_addrcost_table =
308 {
309 {
310 1, /* hi */
311 0, /* si */
312 0, /* di */
313 1, /* ti */
314 },
315 0, /* pre_modify */
316 0, /* post_modify */
317 0, /* register_offset */
318 1, /* register_sextend */
319 1, /* register_zextend */
320 0, /* imm_offset */
321 };
322
323 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
324 {
325 {
326 1, /* hi */
327 1, /* si */
328 1, /* di */
329 2, /* ti */
330 },
331 1, /* pre_modify */
332 1, /* post_modify */
333 3, /* register_offset */
334 3, /* register_sextend */
335 3, /* register_zextend */
336 2, /* imm_offset */
337 };
338
339 static const struct cpu_regmove_cost generic_regmove_cost =
340 {
341 1, /* GP2GP */
342 /* Avoid the use of slow int<->fp moves for spilling by setting
343 their cost higher than memmov_cost. */
344 5, /* GP2FP */
345 5, /* FP2GP */
346 2 /* FP2FP */
347 };
348
349 static const struct cpu_regmove_cost cortexa57_regmove_cost =
350 {
351 1, /* GP2GP */
352 /* Avoid the use of slow int<->fp moves for spilling by setting
353 their cost higher than memmov_cost. */
354 5, /* GP2FP */
355 5, /* FP2GP */
356 2 /* FP2FP */
357 };
358
359 static const struct cpu_regmove_cost cortexa53_regmove_cost =
360 {
361 1, /* GP2GP */
362 /* Avoid the use of slow int<->fp moves for spilling by setting
363 their cost higher than memmov_cost. */
364 5, /* GP2FP */
365 5, /* FP2GP */
366 2 /* FP2FP */
367 };
368
369 static const struct cpu_regmove_cost exynosm1_regmove_cost =
370 {
371 1, /* GP2GP */
372 /* Avoid the use of slow int<->fp moves for spilling by setting
373 their cost higher than memmov_cost (the actual costs are 4 and 9). */
374 9, /* GP2FP */
375 9, /* FP2GP */
376 1 /* FP2FP */
377 };
378
379 static const struct cpu_regmove_cost thunderx_regmove_cost =
380 {
381 2, /* GP2GP */
382 2, /* GP2FP */
383 6, /* FP2GP */
384 4 /* FP2FP */
385 };
386
387 static const struct cpu_regmove_cost xgene1_regmove_cost =
388 {
389 1, /* GP2GP */
390 /* Avoid the use of slow int<->fp moves for spilling by setting
391 their cost higher than memmov_cost. */
392 8, /* GP2FP */
393 8, /* FP2GP */
394 2 /* FP2FP */
395 };
396
397 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
398 {
399 2, /* GP2GP */
400 /* Avoid the use of int<->fp moves for spilling. */
401 6, /* GP2FP */
402 6, /* FP2GP */
403 4 /* FP2FP */
404 };
405
406 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
407 {
408 1, /* GP2GP */
409 /* Avoid the use of int<->fp moves for spilling. */
410 8, /* GP2FP */
411 8, /* FP2GP */
412 4 /* FP2FP */
413 };
414
415 static const struct cpu_regmove_cost tsv110_regmove_cost =
416 {
417 1, /* GP2GP */
418 /* Avoid the use of slow int<->fp moves for spilling by setting
419 their cost higher than memmov_cost. */
420 2, /* GP2FP */
421 3, /* FP2GP */
422 2 /* FP2FP */
423 };
424
425 /* Generic costs for vector insn classes. */
426 static const struct cpu_vector_cost generic_vector_cost =
427 {
428 1, /* scalar_int_stmt_cost */
429 1, /* scalar_fp_stmt_cost */
430 1, /* scalar_load_cost */
431 1, /* scalar_store_cost */
432 1, /* vec_int_stmt_cost */
433 1, /* vec_fp_stmt_cost */
434 2, /* vec_permute_cost */
435 1, /* vec_to_scalar_cost */
436 1, /* scalar_to_vec_cost */
437 1, /* vec_align_load_cost */
438 1, /* vec_unalign_load_cost */
439 1, /* vec_unalign_store_cost */
440 1, /* vec_store_cost */
441 3, /* cond_taken_branch_cost */
442 1 /* cond_not_taken_branch_cost */
443 };
444
445 /* QDF24XX costs for vector insn classes. */
446 static const struct cpu_vector_cost qdf24xx_vector_cost =
447 {
448 1, /* scalar_int_stmt_cost */
449 1, /* scalar_fp_stmt_cost */
450 1, /* scalar_load_cost */
451 1, /* scalar_store_cost */
452 1, /* vec_int_stmt_cost */
453 3, /* vec_fp_stmt_cost */
454 2, /* vec_permute_cost */
455 1, /* vec_to_scalar_cost */
456 1, /* scalar_to_vec_cost */
457 1, /* vec_align_load_cost */
458 1, /* vec_unalign_load_cost */
459 1, /* vec_unalign_store_cost */
460 1, /* vec_store_cost */
461 3, /* cond_taken_branch_cost */
462 1 /* cond_not_taken_branch_cost */
463 };
464
465 /* ThunderX costs for vector insn classes. */
466 static const struct cpu_vector_cost thunderx_vector_cost =
467 {
468 1, /* scalar_int_stmt_cost */
469 1, /* scalar_fp_stmt_cost */
470 3, /* scalar_load_cost */
471 1, /* scalar_store_cost */
472 4, /* vec_int_stmt_cost */
473 1, /* vec_fp_stmt_cost */
474 4, /* vec_permute_cost */
475 2, /* vec_to_scalar_cost */
476 2, /* scalar_to_vec_cost */
477 3, /* vec_align_load_cost */
478 5, /* vec_unalign_load_cost */
479 5, /* vec_unalign_store_cost */
480 1, /* vec_store_cost */
481 3, /* cond_taken_branch_cost */
482 3 /* cond_not_taken_branch_cost */
483 };
484
485 static const struct cpu_vector_cost tsv110_vector_cost =
486 {
487 1, /* scalar_int_stmt_cost */
488 1, /* scalar_fp_stmt_cost */
489 5, /* scalar_load_cost */
490 1, /* scalar_store_cost */
491 2, /* vec_int_stmt_cost */
492 2, /* vec_fp_stmt_cost */
493 2, /* vec_permute_cost */
494 3, /* vec_to_scalar_cost */
495 2, /* scalar_to_vec_cost */
496 5, /* vec_align_load_cost */
497 5, /* vec_unalign_load_cost */
498 1, /* vec_unalign_store_cost */
499 1, /* vec_store_cost */
500 1, /* cond_taken_branch_cost */
501 1 /* cond_not_taken_branch_cost */
502 };
503
504 /* Cortex-A57 costs for vector insn classes. */
505 static const struct cpu_vector_cost cortexa57_vector_cost =
506 {
507 1, /* scalar_int_stmt_cost */
508 1, /* scalar_fp_stmt_cost */
509 4, /* scalar_load_cost */
510 1, /* scalar_store_cost */
511 2, /* vec_int_stmt_cost */
512 2, /* vec_fp_stmt_cost */
513 3, /* vec_permute_cost */
514 8, /* vec_to_scalar_cost */
515 8, /* scalar_to_vec_cost */
516 4, /* vec_align_load_cost */
517 4, /* vec_unalign_load_cost */
518 1, /* vec_unalign_store_cost */
519 1, /* vec_store_cost */
520 1, /* cond_taken_branch_cost */
521 1 /* cond_not_taken_branch_cost */
522 };
523
524 static const struct cpu_vector_cost exynosm1_vector_cost =
525 {
526 1, /* scalar_int_stmt_cost */
527 1, /* scalar_fp_stmt_cost */
528 5, /* scalar_load_cost */
529 1, /* scalar_store_cost */
530 3, /* vec_int_stmt_cost */
531 3, /* vec_fp_stmt_cost */
532 3, /* vec_permute_cost */
533 3, /* vec_to_scalar_cost */
534 3, /* scalar_to_vec_cost */
535 5, /* vec_align_load_cost */
536 5, /* vec_unalign_load_cost */
537 1, /* vec_unalign_store_cost */
538 1, /* vec_store_cost */
539 1, /* cond_taken_branch_cost */
540 1 /* cond_not_taken_branch_cost */
541 };
542
543 /* X-Gene 1 costs for vector insn classes. */
544 static const struct cpu_vector_cost xgene1_vector_cost =
545 {
546 1, /* scalar_int_stmt_cost */
547 1, /* scalar_fp_stmt_cost */
548 5, /* scalar_load_cost */
549 1, /* scalar_store_cost */
550 2, /* vec_int_stmt_cost */
551 2, /* vec_fp_stmt_cost */
552 2, /* vec_permute_cost */
553 4, /* vec_to_scalar_cost */
554 4, /* scalar_to_vec_cost */
555 10, /* vec_align_load_cost */
556 10, /* vec_unalign_load_cost */
557 2, /* vec_unalign_store_cost */
558 2, /* vec_store_cost */
559 2, /* cond_taken_branch_cost */
560 1 /* cond_not_taken_branch_cost */
561 };
562
563 /* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan). */
564 static const struct cpu_vector_cost thunderx2t99_vector_cost =
565 {
566 1, /* scalar_int_stmt_cost */
567 6, /* scalar_fp_stmt_cost */
568 4, /* scalar_load_cost */
569 1, /* scalar_store_cost */
570 5, /* vec_int_stmt_cost */
571 6, /* vec_fp_stmt_cost */
572 3, /* vec_permute_cost */
573 6, /* vec_to_scalar_cost */
574 5, /* scalar_to_vec_cost */
575 8, /* vec_align_load_cost */
576 8, /* vec_unalign_load_cost */
577 4, /* vec_unalign_store_cost */
578 4, /* vec_store_cost */
579 2, /* cond_taken_branch_cost */
580 1 /* cond_not_taken_branch_cost */
581 };
582
583 /* Generic costs for branch instructions. */
584 static const struct cpu_branch_cost generic_branch_cost =
585 {
586 1, /* Predictable. */
587 3 /* Unpredictable. */
588 };
589
590 /* Generic approximation modes. */
591 static const cpu_approx_modes generic_approx_modes =
592 {
593 AARCH64_APPROX_NONE, /* division */
594 AARCH64_APPROX_NONE, /* sqrt */
595 AARCH64_APPROX_NONE /* recip_sqrt */
596 };
597
598 /* Approximation modes for Exynos M1. */
599 static const cpu_approx_modes exynosm1_approx_modes =
600 {
601 AARCH64_APPROX_NONE, /* division */
602 AARCH64_APPROX_ALL, /* sqrt */
603 AARCH64_APPROX_ALL /* recip_sqrt */
604 };
605
606 /* Approximation modes for X-Gene 1. */
607 static const cpu_approx_modes xgene1_approx_modes =
608 {
609 AARCH64_APPROX_NONE, /* division */
610 AARCH64_APPROX_NONE, /* sqrt */
611 AARCH64_APPROX_ALL /* recip_sqrt */
612 };
613
614 /* Generic prefetch settings (which disable prefetch). */
615 static const cpu_prefetch_tune generic_prefetch_tune =
616 {
617 0, /* num_slots */
618 -1, /* l1_cache_size */
619 -1, /* l1_cache_line_size */
620 -1, /* l2_cache_size */
621 true, /* prefetch_dynamic_strides */
622 -1, /* minimum_stride */
623 -1 /* default_opt_level */
624 };
625
626 static const cpu_prefetch_tune exynosm1_prefetch_tune =
627 {
628 0, /* num_slots */
629 -1, /* l1_cache_size */
630 64, /* l1_cache_line_size */
631 -1, /* l2_cache_size */
632 true, /* prefetch_dynamic_strides */
633 -1, /* minimum_stride */
634 -1 /* default_opt_level */
635 };
636
637 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
638 {
639 4, /* num_slots */
640 32, /* l1_cache_size */
641 64, /* l1_cache_line_size */
642 512, /* l2_cache_size */
643 false, /* prefetch_dynamic_strides */
644 2048, /* minimum_stride */
645 3 /* default_opt_level */
646 };
647
648 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
649 {
650 8, /* num_slots */
651 32, /* l1_cache_size */
652 128, /* l1_cache_line_size */
653 16*1024, /* l2_cache_size */
654 true, /* prefetch_dynamic_strides */
655 -1, /* minimum_stride */
656 3 /* default_opt_level */
657 };
658
659 static const cpu_prefetch_tune thunderx_prefetch_tune =
660 {
661 8, /* num_slots */
662 32, /* l1_cache_size */
663 128, /* l1_cache_line_size */
664 -1, /* l2_cache_size */
665 true, /* prefetch_dynamic_strides */
666 -1, /* minimum_stride */
667 -1 /* default_opt_level */
668 };
669
670 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
671 {
672 8, /* num_slots */
673 32, /* l1_cache_size */
674 64, /* l1_cache_line_size */
675 256, /* l2_cache_size */
676 true, /* prefetch_dynamic_strides */
677 -1, /* minimum_stride */
678 -1 /* default_opt_level */
679 };
680
681 static const cpu_prefetch_tune tsv110_prefetch_tune =
682 {
683 0, /* num_slots */
684 64, /* l1_cache_size */
685 64, /* l1_cache_line_size */
686 512, /* l2_cache_size */
687 true, /* prefetch_dynamic_strides */
688 -1, /* minimum_stride */
689 -1 /* default_opt_level */
690 };
691
692 static const cpu_prefetch_tune xgene1_prefetch_tune =
693 {
694 8, /* num_slots */
695 32, /* l1_cache_size */
696 64, /* l1_cache_line_size */
697 256, /* l2_cache_size */
698 true, /* prefetch_dynamic_strides */
699 -1, /* minimum_stride */
700 -1 /* default_opt_level */
701 };
702
703 static const struct tune_params generic_tunings =
704 {
705 &cortexa57_extra_costs,
706 &generic_addrcost_table,
707 &generic_regmove_cost,
708 &generic_vector_cost,
709 &generic_branch_cost,
710 &generic_approx_modes,
711 SVE_NOT_IMPLEMENTED, /* sve_width */
712 4, /* memmov_cost */
713 2, /* issue_rate */
714 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
715 "8", /* function_align. */
716 "4", /* jump_align. */
717 "8", /* loop_align. */
718 2, /* int_reassoc_width. */
719 4, /* fp_reassoc_width. */
720 1, /* vec_reassoc_width. */
721 2, /* min_div_recip_mul_sf. */
722 2, /* min_div_recip_mul_df. */
723 0, /* max_case_values. */
724 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
725 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
726 &generic_prefetch_tune
727 };
728
729 static const struct tune_params cortexa35_tunings =
730 {
731 &cortexa53_extra_costs,
732 &generic_addrcost_table,
733 &cortexa53_regmove_cost,
734 &generic_vector_cost,
735 &generic_branch_cost,
736 &generic_approx_modes,
737 SVE_NOT_IMPLEMENTED, /* sve_width */
738 4, /* memmov_cost */
739 1, /* issue_rate */
740 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
741 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
742 "16", /* function_align. */
743 "4", /* jump_align. */
744 "8", /* loop_align. */
745 2, /* int_reassoc_width. */
746 4, /* fp_reassoc_width. */
747 1, /* vec_reassoc_width. */
748 2, /* min_div_recip_mul_sf. */
749 2, /* min_div_recip_mul_df. */
750 0, /* max_case_values. */
751 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
752 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
753 &generic_prefetch_tune
754 };
755
756 static const struct tune_params cortexa53_tunings =
757 {
758 &cortexa53_extra_costs,
759 &generic_addrcost_table,
760 &cortexa53_regmove_cost,
761 &generic_vector_cost,
762 &generic_branch_cost,
763 &generic_approx_modes,
764 SVE_NOT_IMPLEMENTED, /* sve_width */
765 4, /* memmov_cost */
766 2, /* issue_rate */
767 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
768 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
769 "16", /* function_align. */
770 "4", /* jump_align. */
771 "8", /* loop_align. */
772 2, /* int_reassoc_width. */
773 4, /* fp_reassoc_width. */
774 1, /* vec_reassoc_width. */
775 2, /* min_div_recip_mul_sf. */
776 2, /* min_div_recip_mul_df. */
777 0, /* max_case_values. */
778 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
779 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
780 &generic_prefetch_tune
781 };
782
783 static const struct tune_params cortexa57_tunings =
784 {
785 &cortexa57_extra_costs,
786 &generic_addrcost_table,
787 &cortexa57_regmove_cost,
788 &cortexa57_vector_cost,
789 &generic_branch_cost,
790 &generic_approx_modes,
791 SVE_NOT_IMPLEMENTED, /* sve_width */
792 4, /* memmov_cost */
793 3, /* issue_rate */
794 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
795 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
796 "16", /* function_align. */
797 "4", /* jump_align. */
798 "8", /* loop_align. */
799 2, /* int_reassoc_width. */
800 4, /* fp_reassoc_width. */
801 1, /* vec_reassoc_width. */
802 2, /* min_div_recip_mul_sf. */
803 2, /* min_div_recip_mul_df. */
804 0, /* max_case_values. */
805 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
806 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
807 &generic_prefetch_tune
808 };
809
810 static const struct tune_params cortexa72_tunings =
811 {
812 &cortexa57_extra_costs,
813 &generic_addrcost_table,
814 &cortexa57_regmove_cost,
815 &cortexa57_vector_cost,
816 &generic_branch_cost,
817 &generic_approx_modes,
818 SVE_NOT_IMPLEMENTED, /* sve_width */
819 4, /* memmov_cost */
820 3, /* issue_rate */
821 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
822 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
823 "16", /* function_align. */
824 "4", /* jump_align. */
825 "8", /* loop_align. */
826 2, /* int_reassoc_width. */
827 4, /* fp_reassoc_width. */
828 1, /* vec_reassoc_width. */
829 2, /* min_div_recip_mul_sf. */
830 2, /* min_div_recip_mul_df. */
831 0, /* max_case_values. */
832 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
833 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
834 &generic_prefetch_tune
835 };
836
837 static const struct tune_params cortexa73_tunings =
838 {
839 &cortexa57_extra_costs,
840 &generic_addrcost_table,
841 &cortexa57_regmove_cost,
842 &cortexa57_vector_cost,
843 &generic_branch_cost,
844 &generic_approx_modes,
845 SVE_NOT_IMPLEMENTED, /* sve_width */
846 4, /* memmov_cost. */
847 2, /* issue_rate. */
848 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
849 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
850 "16", /* function_align. */
851 "4", /* jump_align. */
852 "8", /* loop_align. */
853 2, /* int_reassoc_width. */
854 4, /* fp_reassoc_width. */
855 1, /* vec_reassoc_width. */
856 2, /* min_div_recip_mul_sf. */
857 2, /* min_div_recip_mul_df. */
858 0, /* max_case_values. */
859 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
860 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
861 &generic_prefetch_tune
862 };
863
864
865
866 static const struct tune_params exynosm1_tunings =
867 {
868 &exynosm1_extra_costs,
869 &exynosm1_addrcost_table,
870 &exynosm1_regmove_cost,
871 &exynosm1_vector_cost,
872 &generic_branch_cost,
873 &exynosm1_approx_modes,
874 SVE_NOT_IMPLEMENTED, /* sve_width */
875 4, /* memmov_cost */
876 3, /* issue_rate */
877 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
878 "4", /* function_align. */
879 "4", /* jump_align. */
880 "4", /* loop_align. */
881 2, /* int_reassoc_width. */
882 4, /* fp_reassoc_width. */
883 1, /* vec_reassoc_width. */
884 2, /* min_div_recip_mul_sf. */
885 2, /* min_div_recip_mul_df. */
886 48, /* max_case_values. */
887 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
888 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
889 &exynosm1_prefetch_tune
890 };
891
892 static const struct tune_params thunderxt88_tunings =
893 {
894 &thunderx_extra_costs,
895 &generic_addrcost_table,
896 &thunderx_regmove_cost,
897 &thunderx_vector_cost,
898 &generic_branch_cost,
899 &generic_approx_modes,
900 SVE_NOT_IMPLEMENTED, /* sve_width */
901 6, /* memmov_cost */
902 2, /* issue_rate */
903 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
904 "8", /* function_align. */
905 "8", /* jump_align. */
906 "8", /* loop_align. */
907 2, /* int_reassoc_width. */
908 4, /* fp_reassoc_width. */
909 1, /* vec_reassoc_width. */
910 2, /* min_div_recip_mul_sf. */
911 2, /* min_div_recip_mul_df. */
912 0, /* max_case_values. */
913 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
914 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
915 &thunderxt88_prefetch_tune
916 };
917
918 static const struct tune_params thunderx_tunings =
919 {
920 &thunderx_extra_costs,
921 &generic_addrcost_table,
922 &thunderx_regmove_cost,
923 &thunderx_vector_cost,
924 &generic_branch_cost,
925 &generic_approx_modes,
926 SVE_NOT_IMPLEMENTED, /* sve_width */
927 6, /* memmov_cost */
928 2, /* issue_rate */
929 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
930 "8", /* function_align. */
931 "8", /* jump_align. */
932 "8", /* loop_align. */
933 2, /* int_reassoc_width. */
934 4, /* fp_reassoc_width. */
935 1, /* vec_reassoc_width. */
936 2, /* min_div_recip_mul_sf. */
937 2, /* min_div_recip_mul_df. */
938 0, /* max_case_values. */
939 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
940 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
941 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
942 &thunderx_prefetch_tune
943 };
944
945 static const struct tune_params tsv110_tunings =
946 {
947 &tsv110_extra_costs,
948 &tsv110_addrcost_table,
949 &tsv110_regmove_cost,
950 &tsv110_vector_cost,
951 &generic_branch_cost,
952 &generic_approx_modes,
953 SVE_NOT_IMPLEMENTED, /* sve_width */
954 4, /* memmov_cost */
955 4, /* issue_rate */
956 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
957 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
958 "16", /* function_align. */
959 "4", /* jump_align. */
960 "8", /* loop_align. */
961 2, /* int_reassoc_width. */
962 4, /* fp_reassoc_width. */
963 1, /* vec_reassoc_width. */
964 2, /* min_div_recip_mul_sf. */
965 2, /* min_div_recip_mul_df. */
966 0, /* max_case_values. */
967 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
968 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
969 &tsv110_prefetch_tune
970 };
971
972 static const struct tune_params xgene1_tunings =
973 {
974 &xgene1_extra_costs,
975 &xgene1_addrcost_table,
976 &xgene1_regmove_cost,
977 &xgene1_vector_cost,
978 &generic_branch_cost,
979 &xgene1_approx_modes,
980 SVE_NOT_IMPLEMENTED, /* sve_width */
981 6, /* memmov_cost */
982 4, /* issue_rate */
983 AARCH64_FUSE_NOTHING, /* fusible_ops */
984 "16", /* function_align. */
985 "16", /* jump_align. */
986 "16", /* loop_align. */
987 2, /* int_reassoc_width. */
988 4, /* fp_reassoc_width. */
989 1, /* vec_reassoc_width. */
990 2, /* min_div_recip_mul_sf. */
991 2, /* min_div_recip_mul_df. */
992 17, /* max_case_values. */
993 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
994 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
995 &xgene1_prefetch_tune
996 };
997
998 static const struct tune_params emag_tunings =
999 {
1000 &xgene1_extra_costs,
1001 &xgene1_addrcost_table,
1002 &xgene1_regmove_cost,
1003 &xgene1_vector_cost,
1004 &generic_branch_cost,
1005 &xgene1_approx_modes,
1006 SVE_NOT_IMPLEMENTED,
1007 6, /* memmov_cost */
1008 4, /* issue_rate */
1009 AARCH64_FUSE_NOTHING, /* fusible_ops */
1010 "16", /* function_align. */
1011 "16", /* jump_align. */
1012 "16", /* loop_align. */
1013 2, /* int_reassoc_width. */
1014 4, /* fp_reassoc_width. */
1015 1, /* vec_reassoc_width. */
1016 2, /* min_div_recip_mul_sf. */
1017 2, /* min_div_recip_mul_df. */
1018 17, /* max_case_values. */
1019 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1020 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1021 &xgene1_prefetch_tune
1022 };
1023
1024 static const struct tune_params qdf24xx_tunings =
1025 {
1026 &qdf24xx_extra_costs,
1027 &qdf24xx_addrcost_table,
1028 &qdf24xx_regmove_cost,
1029 &qdf24xx_vector_cost,
1030 &generic_branch_cost,
1031 &generic_approx_modes,
1032 SVE_NOT_IMPLEMENTED, /* sve_width */
1033 4, /* memmov_cost */
1034 4, /* issue_rate */
1035 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1036 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1037 "16", /* function_align. */
1038 "8", /* jump_align. */
1039 "16", /* loop_align. */
1040 2, /* int_reassoc_width. */
1041 4, /* fp_reassoc_width. */
1042 1, /* vec_reassoc_width. */
1043 2, /* min_div_recip_mul_sf. */
1044 2, /* min_div_recip_mul_df. */
1045 0, /* max_case_values. */
1046 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1047 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1048 &qdf24xx_prefetch_tune
1049 };
1050
1051 /* Tuning structure for the Qualcomm Saphira core.  Currently uses the
1052 generic cost tables and tuning parameters. */
1053 static const struct tune_params saphira_tunings =
1054 {
1055 &generic_extra_costs,
1056 &generic_addrcost_table,
1057 &generic_regmove_cost,
1058 &generic_vector_cost,
1059 &generic_branch_cost,
1060 &generic_approx_modes,
1061 SVE_NOT_IMPLEMENTED, /* sve_width */
1062 4, /* memmov_cost */
1063 4, /* issue_rate */
1064 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1065 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1066 "16", /* function_align. */
1067 "8", /* jump_align. */
1068 "16", /* loop_align. */
1069 2, /* int_reassoc_width. */
1070 4, /* fp_reassoc_width. */
1071 1, /* vec_reassoc_width. */
1072 2, /* min_div_recip_mul_sf. */
1073 2, /* min_div_recip_mul_df. */
1074 0, /* max_case_values. */
1075 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1076 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1077 &generic_prefetch_tune
1078 };
1079
1080 static const struct tune_params thunderx2t99_tunings =
1081 {
1082 &thunderx2t99_extra_costs,
1083 &thunderx2t99_addrcost_table,
1084 &thunderx2t99_regmove_cost,
1085 &thunderx2t99_vector_cost,
1086 &generic_branch_cost,
1087 &generic_approx_modes,
1088 SVE_NOT_IMPLEMENTED, /* sve_width */
1089 4, /* memmov_cost. */
1090 4, /* issue_rate. */
1091 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1092 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1093 "16", /* function_align. */
1094 "8", /* jump_align. */
1095 "16", /* loop_align. */
1096 3, /* int_reassoc_width. */
1097 2, /* fp_reassoc_width. */
1098 2, /* vec_reassoc_width. */
1099 2, /* min_div_recip_mul_sf. */
1100 2, /* min_div_recip_mul_df. */
1101 0, /* max_case_values. */
1102 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1103 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1104 &thunderx2t99_prefetch_tune
1105 };
1106
1107 static const struct tune_params neoversen1_tunings =
1108 {
1109 &cortexa57_extra_costs,
1110 &generic_addrcost_table,
1111 &generic_regmove_cost,
1112 &cortexa57_vector_cost,
1113 &generic_branch_cost,
1114 &generic_approx_modes,
1115 SVE_NOT_IMPLEMENTED, /* sve_width */
1116 4, /* memmov_cost */
1117 3, /* issue_rate */
1118 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1119 "32:16", /* function_align. */
1120 "32:16", /* jump_align. */
1121 "32:16", /* loop_align. */
1122 2, /* int_reassoc_width. */
1123 4, /* fp_reassoc_width. */
1124 2, /* vec_reassoc_width. */
1125 2, /* min_div_recip_mul_sf. */
1126 2, /* min_div_recip_mul_df. */
1127 0, /* max_case_values. */
1128 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1129 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1130 &generic_prefetch_tune
1131 };
1132
1133 /* Support for fine-grained override of the tuning structures. */
1134 struct aarch64_tuning_override_function
1135 {
1136 const char* name;
1137 void (*parse_override)(const char*, struct tune_params*);
1138 };
1139
1140 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1141 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1142 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1143
1144 static const struct aarch64_tuning_override_function
1145 aarch64_tuning_override_functions[] =
1146 {
1147 { "fuse", aarch64_parse_fuse_string },
1148 { "tune", aarch64_parse_tune_string },
1149 { "sve_width", aarch64_parse_sve_width_string },
1150 { NULL, NULL }
1151 };
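/* These callbacks implement -moverride.  As a rough example (the exact
   flag spellings come from the .def files included earlier), an option
   such as

     -moverride=tune=no_ldp_stp_qregs,sve_width=256

   is split on ',' and each name=value pair is dispatched to the matching
   parse_override hook in the table above.  */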
1152
1153 /* A processor implementing AArch64. */
1154 struct processor
1155 {
1156 const char *const name;
1157 enum aarch64_processor ident;
1158 enum aarch64_processor sched_core;
1159 enum aarch64_arch arch;
1160 unsigned architecture_version;
1161 const uint64_t flags;
1162 const struct tune_params *const tune;
1163 };
1164
1165 /* Architectures implementing AArch64. */
1166 static const struct processor all_architectures[] =
1167 {
1168 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1169 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1170 #include "aarch64-arches.def"
1171 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1172 };
1173
1174 /* Processor cores implementing AArch64. */
1175 static const struct processor all_cores[] =
1176 {
1177 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1178 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1179 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1180 FLAGS, &COSTS##_tunings},
1181 #include "aarch64-cores.def"
1182 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1183 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1184 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1185 };
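/* As an illustration (the real flag sets live in aarch64-cores.def), an
   entry along the lines of

     AARCH64_CORE ("cortex-a53", cortexa53, cortexa53, 8A,
		   AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53,
		   0x41, 0xd03, -1)

   expands under the macro above to

     { "cortex-a53", cortexa53, cortexa53, AARCH64_ARCH_8A,
       all_architectures[AARCH64_ARCH_8A].architecture_version,
       AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, &cortexa53_tunings },

   i.e. the scheduling core, architecture revision and tuning table all
   come from the .def entry.  */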
1186
1187
1188 /* Target specification. These are populated by the -march, -mtune, -mcpu
1189 handling code or by target attributes. */
1190 static const struct processor *selected_arch;
1191 static const struct processor *selected_cpu;
1192 static const struct processor *selected_tune;
1193
1194 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1195
1196 /* The current tuning set. */
1197 struct tune_params aarch64_tune_params = generic_tunings;
1198
1199 /* Table of machine attributes. */
1200 static const struct attribute_spec aarch64_attribute_table[] =
1201 {
1202 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1203 affects_type_identity, handler, exclude } */
1204 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1205 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1206 };
1207
1208 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1209
1210 /* An ISA extension in the co-processor and main instruction set space. */
1211 struct aarch64_option_extension
1212 {
1213 const char *const name;
1214 const unsigned long flags_on;
1215 const unsigned long flags_off;
1216 };
1217
1218 typedef enum aarch64_cond_code
1219 {
1220 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1221 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1222 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1223 }
1224 aarch64_cc;
1225
1226 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
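/* The enum above deliberately pairs each condition with its inverse in
   adjacent values, so inverting a condition is just a matter of flipping
   bit 0: for example AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is
   AARCH64_NE and AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is
   AARCH64_LT.  */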
1227
1228 struct aarch64_branch_protect_type
1229 {
1230 /* The type's name that the user passes to the branch-protection option
1231 string. */
1232 const char* name;
1233 /* Function to handle the protection type and set global variables.
1234 The first argument is the string token corresponding to this type and the
1235 second argument is the next token in the option string.
1236 Return values:
1237 * AARCH64_PARSE_OK: Handling was successful.
1238 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1239 caller should print an error.
1240 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1241 prints its own error.  */
1242 enum aarch64_parse_opt_result (*handler)(char*, char*);
1243 /* A list of types that can follow this type in the option string. */
1244 const aarch64_branch_protect_type* subtypes;
1245 unsigned int num_subtypes;
1246 };
1247
1248 static enum aarch64_parse_opt_result
1249 aarch64_handle_no_branch_protection (char* str, char* rest)
1250 {
1251 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1252 aarch64_enable_bti = 0;
1253 if (rest)
1254 {
1255 error ("unexpected %<%s%> after %<%s%>", rest, str);
1256 return AARCH64_PARSE_INVALID_FEATURE;
1257 }
1258 return AARCH64_PARSE_OK;
1259 }
1260
1261 static enum aarch64_parse_opt_result
1262 aarch64_handle_standard_branch_protection (char* str, char* rest)
1263 {
1264 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1265 aarch64_ra_sign_key = AARCH64_KEY_A;
1266 aarch64_enable_bti = 1;
1267 if (rest)
1268 {
1269 error ("unexpected %<%s%> after %<%s%>", rest, str);
1270 return AARCH64_PARSE_INVALID_FEATURE;
1271 }
1272 return AARCH64_PARSE_OK;
1273 }
1274
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1277 char* rest ATTRIBUTE_UNUSED)
1278 {
1279 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1280 aarch64_ra_sign_key = AARCH64_KEY_A;
1281 return AARCH64_PARSE_OK;
1282 }
1283
1284 static enum aarch64_parse_opt_result
1285 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1286 char* rest ATTRIBUTE_UNUSED)
1287 {
1288 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1289 return AARCH64_PARSE_OK;
1290 }
1291
1292 static enum aarch64_parse_opt_result
1293 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1294 char* rest ATTRIBUTE_UNUSED)
1295 {
1296 aarch64_ra_sign_key = AARCH64_KEY_B;
1297 return AARCH64_PARSE_OK;
1298 }
1299
1300 static enum aarch64_parse_opt_result
1301 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1302 char* rest ATTRIBUTE_UNUSED)
1303 {
1304 aarch64_enable_bti = 1;
1305 return AARCH64_PARSE_OK;
1306 }
1307
1308 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1309 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1310 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1311 { NULL, NULL, NULL, 0 }
1312 };
1313
1314 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1315 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1316 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1317 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1318 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1319 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1320 { NULL, NULL, NULL, 0 }
1321 };
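/* Taken together these tables describe strings such as
   "-mbranch-protection=standard" or
   "-mbranch-protection=pac-ret+leaf+b-key": the leading token selects an
   entry in aarch64_branch_protect_types and any following '+'-separated
   tokens are matched against that entry's subtypes ("leaf" and "b-key"
   in the case of "pac-ret").  */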
1322
1323 /* The condition codes of the processor, and the inverse function. */
1324 static const char * const aarch64_condition_codes[] =
1325 {
1326 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1327 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1328 };
1329
1330 /* The preferred condition codes for SVE conditions. */
1331 static const char *const aarch64_sve_condition_codes[] =
1332 {
1333 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1334 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1335 };
1336
1337 /* Generate code to enable conditional branches in functions over 1 MiB. */
1338 const char *
1339 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1340 const char * branch_format)
1341 {
1342 rtx_code_label * tmp_label = gen_label_rtx ();
1343 char label_buf[256];
1344 char buffer[128];
1345 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1346 CODE_LABEL_NUMBER (tmp_label));
1347 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1348 rtx dest_label = operands[pos_label];
1349 operands[pos_label] = tmp_label;
1350
1351 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1352 output_asm_insn (buffer, operands);
1353
1354 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1355 operands[pos_label] = dest_label;
1356 output_asm_insn (buffer, operands);
1357 return "";
1358 }
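/* As a sketch of the output (the register, label prefix and counter below
   are illustrative only): for an out-of-range conditional branch the
   caller passes the already-inverted branch text as BRANCH_FORMAT, say
   "cbnz\tx0, ", and this function then emits

	cbnz	x0, .Lcb4
	b	.Ltarget
     .Lcb4:

   i.e. a short inverted branch that skips over an unconditional branch
   capable of reaching the real destination.  */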
1359
1360 void
1361 aarch64_err_no_fpadvsimd (machine_mode mode)
1362 {
1363 if (TARGET_GENERAL_REGS_ONLY)
1364 if (FLOAT_MODE_P (mode))
1365 error ("%qs is incompatible with the use of floating-point types",
1366 "-mgeneral-regs-only");
1367 else
1368 error ("%qs is incompatible with the use of vector types",
1369 "-mgeneral-regs-only");
1370 else
1371 if (FLOAT_MODE_P (mode))
1372 error ("%qs feature modifier is incompatible with the use of"
1373 " floating-point types", "+nofp");
1374 else
1375 error ("%qs feature modifier is incompatible with the use of"
1376 " vector types", "+nofp");
1377 }
1378
1379 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1380 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1381 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1382 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1383 and GENERAL_REGS is lower than the memory cost (in this case the best class
1384 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1385 cost results in bad allocations with many redundant int<->FP moves which
1386 are expensive on various cores.
1387 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1388 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1389 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1390 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1391 The result of this is that it is no longer inefficient to have a higher
1392 memory move cost than the register move cost.
1393 */
1394
1395 static reg_class_t
1396 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1397 reg_class_t best_class)
1398 {
1399 machine_mode mode;
1400
1401 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1402 || !reg_class_subset_p (FP_REGS, allocno_class))
1403 return allocno_class;
1404
1405 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1406 || !reg_class_subset_p (FP_REGS, best_class))
1407 return best_class;
1408
1409 mode = PSEUDO_REGNO_MODE (regno);
1410 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1411 }
1412
1413 static unsigned int
1414 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1415 {
1416 if (GET_MODE_UNIT_SIZE (mode) == 4)
1417 return aarch64_tune_params.min_div_recip_mul_sf;
1418 return aarch64_tune_params.min_div_recip_mul_df;
1419 }
1420
1421 /* Return the reassociation width of treeop OPC with mode MODE. */
1422 static int
1423 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1424 {
1425 if (VECTOR_MODE_P (mode))
1426 return aarch64_tune_params.vec_reassoc_width;
1427 if (INTEGRAL_MODE_P (mode))
1428 return aarch64_tune_params.int_reassoc_width;
1429 /* Avoid reassociating floating point addition so we emit more FMAs. */
1430 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1431 return aarch64_tune_params.fp_reassoc_width;
1432 return 1;
1433 }
1434
1435 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1436 unsigned
1437 aarch64_dbx_register_number (unsigned regno)
1438 {
1439 if (GP_REGNUM_P (regno))
1440 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1441 else if (regno == SP_REGNUM)
1442 return AARCH64_DWARF_SP;
1443 else if (FP_REGNUM_P (regno))
1444 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1445 else if (PR_REGNUM_P (regno))
1446 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1447 else if (regno == VG_REGNUM)
1448 return AARCH64_DWARF_VG;
1449
1450 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1451 equivalent DWARF register. */
1452 return DWARF_FRAME_REGISTERS;
1453 }
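/* For example, under this mapping x0 maps to AARCH64_DWARF_R0, sp to
   AARCH64_DWARF_SP and v0 to AARCH64_DWARF_V0 (0, 31 and 64 respectively
   in the AArch64 DWARF register numbering), while registers with no
   DWARF equivalent deliberately map to an out-of-range value.  */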
1454
1455 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1456 static bool
1457 aarch64_advsimd_struct_mode_p (machine_mode mode)
1458 {
1459 return (TARGET_SIMD
1460 && (mode == OImode || mode == CImode || mode == XImode));
1461 }
1462
1463 /* Return true if MODE is an SVE predicate mode. */
1464 static bool
1465 aarch64_sve_pred_mode_p (machine_mode mode)
1466 {
1467 return (TARGET_SVE
1468 && (mode == VNx16BImode
1469 || mode == VNx8BImode
1470 || mode == VNx4BImode
1471 || mode == VNx2BImode));
1472 }
1473
1474 /* Three mutually-exclusive flags describing a vector or predicate type. */
1475 const unsigned int VEC_ADVSIMD = 1;
1476 const unsigned int VEC_SVE_DATA = 2;
1477 const unsigned int VEC_SVE_PRED = 4;
1478 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1479 a structure of 2, 3 or 4 vectors. */
1480 const unsigned int VEC_STRUCT = 8;
1481 /* Useful combinations of the above. */
1482 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1483 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1484
1485 /* Return a set of flags describing the vector properties of mode MODE.
1486 Ignore modes that are not supported by the current target. */
1487 static unsigned int
1488 aarch64_classify_vector_mode (machine_mode mode)
1489 {
1490 if (aarch64_advsimd_struct_mode_p (mode))
1491 return VEC_ADVSIMD | VEC_STRUCT;
1492
1493 if (aarch64_sve_pred_mode_p (mode))
1494 return VEC_SVE_PRED;
1495
1496 /* Make the decision based on the mode's enum value rather than its
1497 properties, so that we keep the correct classification regardless
1498 of -msve-vector-bits. */
1499 switch (mode)
1500 {
1501 /* Single SVE vectors. */
1502 case E_VNx16QImode:
1503 case E_VNx8HImode:
1504 case E_VNx4SImode:
1505 case E_VNx2DImode:
1506 case E_VNx8HFmode:
1507 case E_VNx4SFmode:
1508 case E_VNx2DFmode:
1509 return TARGET_SVE ? VEC_SVE_DATA : 0;
1510
1511 /* x2 SVE vectors. */
1512 case E_VNx32QImode:
1513 case E_VNx16HImode:
1514 case E_VNx8SImode:
1515 case E_VNx4DImode:
1516 case E_VNx16HFmode:
1517 case E_VNx8SFmode:
1518 case E_VNx4DFmode:
1519 /* x3 SVE vectors. */
1520 case E_VNx48QImode:
1521 case E_VNx24HImode:
1522 case E_VNx12SImode:
1523 case E_VNx6DImode:
1524 case E_VNx24HFmode:
1525 case E_VNx12SFmode:
1526 case E_VNx6DFmode:
1527 /* x4 SVE vectors. */
1528 case E_VNx64QImode:
1529 case E_VNx32HImode:
1530 case E_VNx16SImode:
1531 case E_VNx8DImode:
1532 case E_VNx32HFmode:
1533 case E_VNx16SFmode:
1534 case E_VNx8DFmode:
1535 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1536
1537 /* 64-bit Advanced SIMD vectors. */
1538 case E_V8QImode:
1539 case E_V4HImode:
1540 case E_V2SImode:
1541 /* ...E_V1DImode doesn't exist. */
1542 case E_V4HFmode:
1543 case E_V2SFmode:
1544 case E_V1DFmode:
1545 /* 128-bit Advanced SIMD vectors. */
1546 case E_V16QImode:
1547 case E_V8HImode:
1548 case E_V4SImode:
1549 case E_V2DImode:
1550 case E_V8HFmode:
1551 case E_V4SFmode:
1552 case E_V2DFmode:
1553 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1554
1555 default:
1556 return 0;
1557 }
1558 }
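/* For instance, with the classification above V4SImode yields VEC_ADVSIMD,
   VNx4SImode yields VEC_SVE_DATA, VNx8SImode (an x2 SVE tuple) yields
   VEC_SVE_DATA | VEC_STRUCT and VNx4BImode yields VEC_SVE_PRED, in each
   case provided the corresponding target feature is enabled; anything
   else classifies as 0.  */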
1559
1560 /* Return true if MODE is any of the data vector modes, including
1561 structure modes. */
1562 static bool
1563 aarch64_vector_data_mode_p (machine_mode mode)
1564 {
1565 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1566 }
1567
1568 /* Return true if MODE is an SVE data vector mode; either a single vector
1569 or a structure of vectors. */
1570 static bool
1571 aarch64_sve_data_mode_p (machine_mode mode)
1572 {
1573 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1574 }
1575
1576 /* Implement target hook TARGET_ARRAY_MODE. */
1577 static opt_machine_mode
1578 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1579 {
1580 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1581 && IN_RANGE (nelems, 2, 4))
1582 return mode_for_vector (GET_MODE_INNER (mode),
1583 GET_MODE_NUNITS (mode) * nelems);
1584
1585 return opt_machine_mode ();
1586 }
1587
1588 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1589 static bool
1590 aarch64_array_mode_supported_p (machine_mode mode,
1591 unsigned HOST_WIDE_INT nelems)
1592 {
1593 if (TARGET_SIMD
1594 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1595 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1596 && (nelems >= 2 && nelems <= 4))
1597 return true;
1598
1599 return false;
1600 }
1601
1602 /* Return the SVE predicate mode to use for elements that have
1603 ELEM_NBYTES bytes, if such a mode exists. */
1604
1605 opt_machine_mode
1606 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1607 {
1608 if (TARGET_SVE)
1609 {
1610 if (elem_nbytes == 1)
1611 return VNx16BImode;
1612 if (elem_nbytes == 2)
1613 return VNx8BImode;
1614 if (elem_nbytes == 4)
1615 return VNx4BImode;
1616 if (elem_nbytes == 8)
1617 return VNx2BImode;
1618 }
1619 return opt_machine_mode ();
1620 }
1621
1622 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1623
1624 static opt_machine_mode
1625 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1626 {
1627 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1628 {
1629 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1630 machine_mode pred_mode;
1631 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1632 return pred_mode;
1633 }
1634
1635 return default_get_mask_mode (nunits, nbytes);
1636 }
1637
1638 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1639 prefer to use the first arithmetic operand as the else value if
1640 the else value doesn't matter, since that exactly matches the SVE
1641 destructive merging form. For ternary operations we could either
1642 pick the first operand and use FMAD-like instructions or the last
1643 operand and use FMLA-like instructions; the latter seems more
1644 natural. */
1645
1646 static tree
1647 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1648 {
1649 return nops == 3 ? ops[2] : ops[0];
1650 }
1651
1652 /* Implement TARGET_HARD_REGNO_NREGS. */
1653
1654 static unsigned int
1655 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1656 {
1657 /* ??? Logically we should only need to provide a value when
1658 HARD_REGNO_MODE_OK says that the combination is valid,
1659 but at the moment we need to handle all modes. Just ignore
1660 any runtime parts for registers that can't store them. */
1661 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1662 switch (aarch64_regno_regclass (regno))
1663 {
1664 case FP_REGS:
1665 case FP_LO_REGS:
1666 case FP_LO8_REGS:
1667 if (aarch64_sve_data_mode_p (mode))
1668 return exact_div (GET_MODE_SIZE (mode),
1669 BYTES_PER_SVE_VECTOR).to_constant ();
1670 return CEIL (lowest_size, UNITS_PER_VREG);
1671 case PR_REGS:
1672 case PR_LO_REGS:
1673 case PR_HI_REGS:
1674 return 1;
1675 default:
1676 return CEIL (lowest_size, UNITS_PER_WORD);
1677 }
1678 gcc_unreachable ();
1679 }
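/* As a worked example: a 128-bit V4SImode value needs one FP/SIMD
   register (CEIL (16, UNITS_PER_VREG) == 1, UNITS_PER_VREG being 16
   bytes) but two general registers (CEIL (16, UNITS_PER_WORD) == 2),
   while a single SVE data vector such as VNx16QImode always occupies
   exactly one FP register, whatever the runtime vector length.  */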
1680
1681 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1682
1683 static bool
1684 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1685 {
1686 if (GET_MODE_CLASS (mode) == MODE_CC)
1687 return regno == CC_REGNUM;
1688
1689 if (regno == VG_REGNUM)
1690 /* This must have the same size as _Unwind_Word. */
1691 return mode == DImode;
1692
1693 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1694 if (vec_flags & VEC_SVE_PRED)
1695 return PR_REGNUM_P (regno);
1696
1697 if (PR_REGNUM_P (regno))
1698 return 0;
1699
1700 if (regno == SP_REGNUM)
1701 /* The purpose of comparing with ptr_mode is to support the
1702 global register variable associated with the stack pointer
1703 register via the syntax of asm ("wsp") in ILP32. */
1704 return mode == Pmode || mode == ptr_mode;
1705
1706 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1707 return mode == Pmode;
1708
1709 if (GP_REGNUM_P (regno))
1710 {
1711 if (known_le (GET_MODE_SIZE (mode), 8))
1712 return true;
1713 else if (known_le (GET_MODE_SIZE (mode), 16))
1714 return (regno & 1) == 0;
1715 }
1716 else if (FP_REGNUM_P (regno))
1717 {
1718 if (vec_flags & VEC_STRUCT)
1719 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1720 else
1721 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1722 }
1723
1724 return false;
1725 }
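/* For instance, DImode is accepted in any general register, a 16-byte
   value such as TImode only in an even-numbered general register (it
   occupies a pair), the SVE predicate modes only in the predicate
   registers, and CCmode only in CC_REGNUM.  */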
1726
1727 /* Return true if this is a definition of a vectorized simd function. */
1728
1729 static bool
1730 aarch64_simd_decl_p (tree fndecl)
1731 {
1732 tree fntype;
1733
1734 if (fndecl == NULL)
1735 return false;
1736 fntype = TREE_TYPE (fndecl);
1737 if (fntype == NULL)
1738 return false;
1739
1740 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1741 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1742 return true;
1743
1744 return false;
1745 }
1746
1747 /* Return the mode a register save/restore should use. DImode for integer
1748 registers, DFmode for FP registers in non-SIMD functions (they only save
1749 the bottom half of a 128 bit register), or TFmode for FP registers in
1750 SIMD functions. */
1751
1752 static machine_mode
1753 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1754 {
1755 return GP_REGNUM_P (regno)
1756 ? E_DImode
1757 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1758 }
1759
1760 /* Return true if the instruction is a call to a SIMD function, false
1761 if it is not a SIMD function or if we do not know anything about
1762 the function. */
1763
1764 static bool
1765 aarch64_simd_call_p (rtx_insn *insn)
1766 {
1767 rtx symbol;
1768 rtx call;
1769 tree fndecl;
1770
1771 gcc_assert (CALL_P (insn));
1772 call = get_call_rtx_from (insn);
1773 symbol = XEXP (XEXP (call, 0), 0);
1774 if (GET_CODE (symbol) != SYMBOL_REF)
1775 return false;
1776 fndecl = SYMBOL_REF_DECL (symbol);
1777 if (!fndecl)
1778 return false;
1779
1780 return aarch64_simd_decl_p (fndecl);
1781 }
1782
1783 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1784 a function that uses the SIMD ABI, take advantage of the extra
1785 call-preserved registers that the ABI provides. */
1786
1787 void
1788 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1789 HARD_REG_SET *return_set)
1790 {
1791 if (aarch64_simd_call_p (insn))
1792 {
1793 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1794 if (FP_SIMD_SAVED_REGNUM_P (regno))
1795 CLEAR_HARD_REG_BIT (*return_set, regno);
1796 }
1797 }
1798
1799 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1800 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1801 clobbers the top 64 bits when restoring the bottom 64 bits. */
1802
1803 static bool
1804 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1805 machine_mode mode)
1806 {
1807 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1808 return FP_REGNUM_P (regno)
1809 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1810 }
1811
1812 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1813
1814 rtx_insn *
1815 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1816 {
1817 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1818
1819 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1820 return call_1;
1821 else
1822 return call_2;
1823 }
1824
1825 /* Implement REGMODE_NATURAL_SIZE. */
1826 poly_uint64
1827 aarch64_regmode_natural_size (machine_mode mode)
1828 {
1829 /* The natural size for SVE data modes is one SVE data vector,
1830 and similarly for predicates. We can't independently modify
1831 anything smaller than that. */
1832 /* ??? For now, only do this for variable-width SVE registers.
1833 Doing it for constant-sized registers breaks lower-subreg.c. */
1834 /* ??? And once that's fixed, we should probably have similar
1835 code for Advanced SIMD. */
1836 if (!aarch64_sve_vg.is_constant ())
1837 {
1838 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1839 if (vec_flags & VEC_SVE_PRED)
1840 return BYTES_PER_SVE_PRED;
1841 if (vec_flags & VEC_SVE_DATA)
1842 return BYTES_PER_SVE_VECTOR;
1843 }
1844 return UNITS_PER_WORD;
1845 }
1846
1847 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1848 machine_mode
1849 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1850 machine_mode mode)
1851 {
1852 /* The predicate mode determines which bits are significant and
1853 which are "don't care". Decreasing the number of lanes would
1854 lose data while increasing the number of lanes would make bits
1855 unnecessarily significant. */
1856 if (PR_REGNUM_P (regno))
1857 return mode;
1858 if (known_ge (GET_MODE_SIZE (mode), 4))
1859 return mode;
1860 else
1861 return SImode;
1862 }
1863
1864 /* Return true if I's bits are consecutive ones from the MSB. */
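/* Worked example (illustrative values only): 0xffffffffffff0000 has 48
consecutive ones from the MSB; its negation is 0x10000, an exact power of
two, so the test below succeeds.  0xffff00000000ffff negates to
0x0000ffffffff0001, which is not a power of two, so it is rejected.  */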
1865 bool
1866 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1867 {
1868 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1869 }
1870
1871 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1872 that strcpy from constants will be faster. */
1873
1874 static HOST_WIDE_INT
1875 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1876 {
1877 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1878 return MAX (align, BITS_PER_WORD);
1879 return align;
1880 }
1881
1882 /* Return true if calls to DECL should be treated as
1883 long-calls (i.e. called via a register). */
1884 static bool
1885 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1886 {
1887 return false;
1888 }
1889
1890 /* Return true if calls to symbol-ref SYM should be treated as
1891 long-calls (i.e. called via a register). */
1892 bool
1893 aarch64_is_long_call_p (rtx sym)
1894 {
1895 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1896 }
1897
1898 /* Return true if calls to symbol-ref SYM should not go through
1899 plt stubs. */
1900
1901 bool
1902 aarch64_is_noplt_call_p (rtx sym)
1903 {
1904 const_tree decl = SYMBOL_REF_DECL (sym);
1905
1906 if (flag_pic
1907 && decl
1908 && (!flag_plt
1909 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1910 && !targetm.binds_local_p (decl))
1911 return true;
1912
1913 return false;
1914 }
1915
1916 /* Return true if the offsets to a zero/sign-extract operation
1917 represent an expression that matches an extend operation. The
1918 operands represent the parameters from
1919
1920 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
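/* Illustrative example (values chosen for exposition): with MULT_IMM == 4
and EXTRACT_IMM == 34 in DImode, the checks below pass: 34 & ~7 == 32 is a
power of two (the width of the extend), 34 & 7 == 2 is the shift, and
4 == 1 << 2; extracting the low 34 bits of (reg * 4) is therefore a 32-bit
extend followed by a left shift of 2.  */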
1921 bool
1922 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1923 rtx extract_imm)
1924 {
1925 HOST_WIDE_INT mult_val, extract_val;
1926
1927 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1928 return false;
1929
1930 mult_val = INTVAL (mult_imm);
1931 extract_val = INTVAL (extract_imm);
1932
1933 if (extract_val > 8
1934 && extract_val < GET_MODE_BITSIZE (mode)
1935 && exact_log2 (extract_val & ~7) > 0
1936 && (extract_val & 7) <= 4
1937 && mult_val == (1 << (extract_val & 7)))
1938 return true;
1939
1940 return false;
1941 }
1942
1943 /* Emit an insn that's a simple single-set. Both the operands must be
1944 known to be valid. */
1945 inline static rtx_insn *
1946 emit_set_insn (rtx x, rtx y)
1947 {
1948 return emit_insn (gen_rtx_SET (x, y));
1949 }
1950
1951 /* X and Y are two things to compare using CODE. Emit the compare insn and
1952 return the rtx for the CC register in the appropriate mode. */
1953 rtx
1954 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1955 {
1956 machine_mode mode = SELECT_CC_MODE (code, x, y);
1957 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1958
1959 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1960 return cc_reg;
1961 }
1962
1963 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1964
1965 static rtx
1966 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1967 machine_mode y_mode)
1968 {
1969 if (y_mode == E_QImode || y_mode == E_HImode)
1970 {
1971 if (CONST_INT_P (y))
1972 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1973 else
1974 {
1975 rtx t, cc_reg;
1976 machine_mode cc_mode;
1977
1978 t = gen_rtx_ZERO_EXTEND (SImode, y);
1979 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1980 cc_mode = CC_SWPmode;
1981 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1982 emit_set_insn (cc_reg, t);
1983 return cc_reg;
1984 }
1985 }
1986
1987 return aarch64_gen_compare_reg (code, x, y);
1988 }
1989
1990 /* Build the SYMBOL_REF for __tls_get_addr. */
1991
1992 static GTY(()) rtx tls_get_addr_libfunc;
1993
1994 rtx
1995 aarch64_tls_get_addr (void)
1996 {
1997 if (!tls_get_addr_libfunc)
1998 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1999 return tls_get_addr_libfunc;
2000 }
2001
2002 /* Return the TLS model to use for ADDR. */
2003
2004 static enum tls_model
2005 tls_symbolic_operand_type (rtx addr)
2006 {
2007 enum tls_model tls_kind = TLS_MODEL_NONE;
2008 if (GET_CODE (addr) == CONST)
2009 {
2010 poly_int64 addend;
2011 rtx sym = strip_offset (addr, &addend);
2012 if (GET_CODE (sym) == SYMBOL_REF)
2013 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2014 }
2015 else if (GET_CODE (addr) == SYMBOL_REF)
2016 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2017
2018 return tls_kind;
2019 }
2020
2021 /* We'll allow lo_sums in addresses in our legitimate addresses,
2022 so that combine can take care of combining addresses where
2023 necessary, but for generation purposes, we'll generate the address
2024 as:
2025 RTL Absolute
2026 tmp = hi (symbol_ref); adrp x1, foo
2027 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2028 nop
2029
2030 PIC TLS
2031 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2032 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2033 bl __tls_get_addr
2034 nop
2035
2036 Load TLS symbol, depending on TLS mechanism and TLS access model.
2037
2038 Global Dynamic - Traditional TLS:
2039 adrp tmp, :tlsgd:imm
2040 add dest, tmp, #:tlsgd_lo12:imm
2041 bl __tls_get_addr
2042
2043 Global Dynamic - TLS Descriptors:
2044 adrp dest, :tlsdesc:imm
2045 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2046 add dest, dest, #:tlsdesc_lo12:imm
2047 blr tmp
2048 mrs tp, tpidr_el0
2049 add dest, dest, tp
2050
2051 Initial Exec:
2052 mrs tp, tpidr_el0
2053 adrp tmp, :gottprel:imm
2054 ldr dest, [tmp, #:gottprel_lo12:imm]
2055 add dest, dest, tp
2056
2057 Local Exec:
2058 mrs tp, tpidr_el0
2059 add t0, tp, #:tprel_hi12:imm, lsl #12
2060 add t0, t0, #:tprel_lo12_nc:imm
2061 */
2062
2063 static void
2064 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2065 enum aarch64_symbol_type type)
2066 {
2067 switch (type)
2068 {
2069 case SYMBOL_SMALL_ABSOLUTE:
2070 {
2071 /* In ILP32, the mode of dest can be either SImode or DImode. */
2072 rtx tmp_reg = dest;
2073 machine_mode mode = GET_MODE (dest);
2074
2075 gcc_assert (mode == Pmode || mode == ptr_mode);
2076
2077 if (can_create_pseudo_p ())
2078 tmp_reg = gen_reg_rtx (mode);
2079
2080 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2081 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2082 return;
2083 }
2084
2085 case SYMBOL_TINY_ABSOLUTE:
2086 emit_insn (gen_rtx_SET (dest, imm));
2087 return;
2088
2089 case SYMBOL_SMALL_GOT_28K:
2090 {
2091 machine_mode mode = GET_MODE (dest);
2092 rtx gp_rtx = pic_offset_table_rtx;
2093 rtx insn;
2094 rtx mem;
2095
2096 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2097 here before rtl expand. Tree IVOPTS will generate rtl patterns to
2098 decide rtx costs, in which case pic_offset_table_rtx is not
2099 initialized. In that case there is no need to generate the first
2100 adrp instruction, as the final cost for global variable access is
2101 one instruction. */
2102 if (gp_rtx != NULL)
2103 {
2104 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2105 use the page base as the GOT base, the first page may be wasted;
2106 in the worst case only 28K of space is left for the GOT).
2107
2108 The generated instruction sequence for accessing a global variable
2109 is:
2110
2111 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2112
2113 Only one instruction is needed. But we must initialize
2114 pic_offset_table_rtx properly. We generate an initialization insn
2115 for every global access, and rely on CSE to remove all redundant ones.
2116
2117 The final instruction sequence will look like the following
2118 for multiple global variable accesses.
2119
2120 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2121
2122 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2123 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2124 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2125 ... */
2126
2127 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2128 crtl->uses_pic_offset_table = 1;
2129 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2130
2131 if (mode != GET_MODE (gp_rtx))
2132 gp_rtx = gen_lowpart (mode, gp_rtx);
2133
2134 }
2135
2136 if (mode == ptr_mode)
2137 {
2138 if (mode == DImode)
2139 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2140 else
2141 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2142
2143 mem = XVECEXP (SET_SRC (insn), 0, 0);
2144 }
2145 else
2146 {
2147 gcc_assert (mode == Pmode);
2148
2149 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2150 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2151 }
2152
2153 /* The operand is expected to be a MEM. Whenever the related insn
2154 pattern changes, the code above which calculates MEM should be
2155 updated. */
2156 gcc_assert (GET_CODE (mem) == MEM);
2157 MEM_READONLY_P (mem) = 1;
2158 MEM_NOTRAP_P (mem) = 1;
2159 emit_insn (insn);
2160 return;
2161 }
2162
2163 case SYMBOL_SMALL_GOT_4G:
2164 {
2165 /* In ILP32, the mode of dest can be either SImode or DImode,
2166 while the got entry is always of SImode size. The mode of
2167 dest depends on how dest is used: if dest is assigned to a
2168 pointer (e.g. stored in memory), it has SImode; it may have
2169 DImode if dest is dereferenced to access the memory.
2170 This is why we have to handle three different ldr_got_small
2171 patterns here (two patterns for ILP32). */
2172
2173 rtx insn;
2174 rtx mem;
2175 rtx tmp_reg = dest;
2176 machine_mode mode = GET_MODE (dest);
2177
2178 if (can_create_pseudo_p ())
2179 tmp_reg = gen_reg_rtx (mode);
2180
2181 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2182 if (mode == ptr_mode)
2183 {
2184 if (mode == DImode)
2185 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2186 else
2187 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2188
2189 mem = XVECEXP (SET_SRC (insn), 0, 0);
2190 }
2191 else
2192 {
2193 gcc_assert (mode == Pmode);
2194
2195 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2196 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2197 }
2198
2199 gcc_assert (GET_CODE (mem) == MEM);
2200 MEM_READONLY_P (mem) = 1;
2201 MEM_NOTRAP_P (mem) = 1;
2202 emit_insn (insn);
2203 return;
2204 }
2205
2206 case SYMBOL_SMALL_TLSGD:
2207 {
2208 rtx_insn *insns;
2209 machine_mode mode = GET_MODE (dest);
2210 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2211
2212 start_sequence ();
2213 if (TARGET_ILP32)
2214 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2215 else
2216 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2217 insns = get_insns ();
2218 end_sequence ();
2219
2220 RTL_CONST_CALL_P (insns) = 1;
2221 emit_libcall_block (insns, dest, result, imm);
2222 return;
2223 }
2224
2225 case SYMBOL_SMALL_TLSDESC:
2226 {
2227 machine_mode mode = GET_MODE (dest);
2228 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2229 rtx tp;
2230
2231 gcc_assert (mode == Pmode || mode == ptr_mode);
2232
2233 /* In ILP32, the got entry is always of SImode size. Unlike
2234 small GOT, the dest is fixed at reg 0. */
2235 if (TARGET_ILP32)
2236 emit_insn (gen_tlsdesc_small_si (imm));
2237 else
2238 emit_insn (gen_tlsdesc_small_di (imm));
2239 tp = aarch64_load_tp (NULL);
2240
2241 if (mode != Pmode)
2242 tp = gen_lowpart (mode, tp);
2243
2244 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2245 if (REG_P (dest))
2246 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2247 return;
2248 }
2249
2250 case SYMBOL_SMALL_TLSIE:
2251 {
2252 /* In ILP32, the mode of dest can be either SImode or DImode,
2253 while the got entry is always of SImode size. The mode of
2254 dest depends on how dest is used: if dest is assigned to a
2255 pointer (e.g. stored in memory), it has SImode; it may have
2256 DImode if dest is dereferenced to access the memory.
2257 This is why we have to handle three different tlsie_small
2258 patterns here (two patterns for ILP32). */
2259 machine_mode mode = GET_MODE (dest);
2260 rtx tmp_reg = gen_reg_rtx (mode);
2261 rtx tp = aarch64_load_tp (NULL);
2262
2263 if (mode == ptr_mode)
2264 {
2265 if (mode == DImode)
2266 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2267 else
2268 {
2269 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2270 tp = gen_lowpart (mode, tp);
2271 }
2272 }
2273 else
2274 {
2275 gcc_assert (mode == Pmode);
2276 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2277 }
2278
2279 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2280 if (REG_P (dest))
2281 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2282 return;
2283 }
2284
2285 case SYMBOL_TLSLE12:
2286 case SYMBOL_TLSLE24:
2287 case SYMBOL_TLSLE32:
2288 case SYMBOL_TLSLE48:
2289 {
2290 machine_mode mode = GET_MODE (dest);
2291 rtx tp = aarch64_load_tp (NULL);
2292
2293 if (mode != Pmode)
2294 tp = gen_lowpart (mode, tp);
2295
2296 switch (type)
2297 {
2298 case SYMBOL_TLSLE12:
2299 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2300 (dest, tp, imm));
2301 break;
2302 case SYMBOL_TLSLE24:
2303 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2304 (dest, tp, imm));
2305 break;
2306 case SYMBOL_TLSLE32:
2307 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2308 (dest, imm));
2309 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2310 (dest, dest, tp));
2311 break;
2312 case SYMBOL_TLSLE48:
2313 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2314 (dest, imm));
2315 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2316 (dest, dest, tp));
2317 break;
2318 default:
2319 gcc_unreachable ();
2320 }
2321
2322 if (REG_P (dest))
2323 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2324 return;
2325 }
2326
2327 case SYMBOL_TINY_GOT:
2328 emit_insn (gen_ldr_got_tiny (dest, imm));
2329 return;
2330
2331 case SYMBOL_TINY_TLSIE:
2332 {
2333 machine_mode mode = GET_MODE (dest);
2334 rtx tp = aarch64_load_tp (NULL);
2335
2336 if (mode == ptr_mode)
2337 {
2338 if (mode == DImode)
2339 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2340 else
2341 {
2342 tp = gen_lowpart (mode, tp);
2343 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2344 }
2345 }
2346 else
2347 {
2348 gcc_assert (mode == Pmode);
2349 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2350 }
2351
2352 if (REG_P (dest))
2353 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2354 return;
2355 }
2356
2357 default:
2358 gcc_unreachable ();
2359 }
2360 }
2361
2362 /* Emit a move from SRC to DEST. Assume that the move expanders can
2363 handle all moves if !can_create_pseudo_p (). The distinction is
2364 important because, unlike emit_move_insn, the move expanders know
2365 how to force Pmode objects into the constant pool even when the
2366 constant pool address is not itself legitimate. */
2367 static rtx
2368 aarch64_emit_move (rtx dest, rtx src)
2369 {
2370 return (can_create_pseudo_p ()
2371 ? emit_move_insn (dest, src)
2372 : emit_move_insn_1 (dest, src));
2373 }
2374
2375 /* Apply UNOPTAB to OP and store the result in DEST. */
2376
2377 static void
2378 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2379 {
2380 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2381 if (dest != tmp)
2382 emit_move_insn (dest, tmp);
2383 }
2384
2385 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2386
2387 static void
2388 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2389 {
2390 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2391 OPTAB_DIRECT);
2392 if (dest != tmp)
2393 emit_move_insn (dest, tmp);
2394 }
2395
2396 /* Split a 128-bit move operation into two 64-bit move operations,
2397 taking care to handle partial overlap of register to register
2398 copies. Special cases are needed when moving between GP regs and
2399 FP regs. SRC can be a register, constant or memory; DST a register
2400 or memory. If either operand is memory it must not have any side
2401 effects. */
2402 void
2403 aarch64_split_128bit_move (rtx dst, rtx src)
2404 {
2405 rtx dst_lo, dst_hi;
2406 rtx src_lo, src_hi;
2407
2408 machine_mode mode = GET_MODE (dst);
2409
2410 gcc_assert (mode == TImode || mode == TFmode);
2411 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2412 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2413
2414 if (REG_P (dst) && REG_P (src))
2415 {
2416 int src_regno = REGNO (src);
2417 int dst_regno = REGNO (dst);
2418
2419 /* Handle FP <-> GP regs. */
2420 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2421 {
2422 src_lo = gen_lowpart (word_mode, src);
2423 src_hi = gen_highpart (word_mode, src);
2424
2425 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2426 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2427 return;
2428 }
2429 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2430 {
2431 dst_lo = gen_lowpart (word_mode, dst);
2432 dst_hi = gen_highpart (word_mode, dst);
2433
2434 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2435 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2436 return;
2437 }
2438 }
2439
2440 dst_lo = gen_lowpart (word_mode, dst);
2441 dst_hi = gen_highpart (word_mode, dst);
2442 src_lo = gen_lowpart (word_mode, src);
2443 src_hi = gen_highpart_mode (word_mode, mode, src);
2444
2445 /* At most one pairing may overlap. */
2446 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2447 {
2448 aarch64_emit_move (dst_hi, src_hi);
2449 aarch64_emit_move (dst_lo, src_lo);
2450 }
2451 else
2452 {
2453 aarch64_emit_move (dst_lo, src_lo);
2454 aarch64_emit_move (dst_hi, src_hi);
2455 }
2456 }
2457
2458 bool
2459 aarch64_split_128bit_move_p (rtx dst, rtx src)
2460 {
2461 return (! REG_P (src)
2462 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2463 }
2464
2465 /* Split a complex SIMD combine. */
2466
2467 void
2468 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2469 {
2470 machine_mode src_mode = GET_MODE (src1);
2471 machine_mode dst_mode = GET_MODE (dst);
2472
2473 gcc_assert (VECTOR_MODE_P (dst_mode));
2474 gcc_assert (register_operand (dst, dst_mode)
2475 && register_operand (src1, src_mode)
2476 && register_operand (src2, src_mode));
2477
2478 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2479 return;
2480 }
2481
2482 /* Split a complex SIMD move. */
2483
2484 void
2485 aarch64_split_simd_move (rtx dst, rtx src)
2486 {
2487 machine_mode src_mode = GET_MODE (src);
2488 machine_mode dst_mode = GET_MODE (dst);
2489
2490 gcc_assert (VECTOR_MODE_P (dst_mode));
2491
2492 if (REG_P (dst) && REG_P (src))
2493 {
2494 gcc_assert (VECTOR_MODE_P (src_mode));
2495 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2496 }
2497 }
2498
2499 bool
2500 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2501 machine_mode ymode, rtx y)
2502 {
2503 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2504 gcc_assert (r != NULL);
2505 return rtx_equal_p (x, r);
2506 }
2507
2508
2509 static rtx
2510 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2511 {
2512 if (can_create_pseudo_p ())
2513 return force_reg (mode, value);
2514 else
2515 {
2516 gcc_assert (x);
2517 aarch64_emit_move (x, value);
2518 return x;
2519 }
2520 }
2521
2522 /* Return an all-true predicate register of mode MODE. */
2523
2524 rtx
2525 aarch64_ptrue_reg (machine_mode mode)
2526 {
2527 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2528 return force_reg (mode, CONSTM1_RTX (mode));
2529 }
2530
2531 /* Return an all-false predicate register of mode MODE. */
2532
2533 rtx
2534 aarch64_pfalse_reg (machine_mode mode)
2535 {
2536 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2537 return force_reg (mode, CONST0_RTX (mode));
2538 }
2539
2540 /* Return true if we can move VALUE into a register using a single
2541 CNT[BHWD] instruction. */
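/* Illustrative examples (not from the original sources): the poly_int64
(16, 16) -- the number of bytes in one full vector -- satisfies the check
below and corresponds to a plain CNTB; (32, 32) is also accepted (CNTB
with MUL #2); (3, 3) is rejected because the factor is odd.  */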
2542
2543 static bool
2544 aarch64_sve_cnt_immediate_p (poly_int64 value)
2545 {
2546 HOST_WIDE_INT factor = value.coeffs[0];
2547 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2548 return (value.coeffs[1] == factor
2549 && IN_RANGE (factor, 2, 16 * 16)
2550 && (factor & 1) == 0
2551 && factor <= 16 * (factor & -factor));
2552 }
2553
2554 /* Likewise for rtx X. */
2555
2556 bool
2557 aarch64_sve_cnt_immediate_p (rtx x)
2558 {
2559 poly_int64 value;
2560 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2561 }
2562
2563 /* Return the asm string for an instruction with a CNT-like vector size
2564 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2565 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2566 first part of the operands template (the part that comes before the
2567 vector size itself). FACTOR is the number of quadwords.
2568 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2569 If it is zero, we can use any element size. */
2570
2571 static char *
2572 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2573 unsigned int factor,
2574 unsigned int nelts_per_vq)
2575 {
2576 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2577
2578 if (nelts_per_vq == 0)
2579 /* There is some overlap in the ranges of the four CNT instructions.
2580 Here we always use the smallest possible element size, so that the
2581 multiplier is 1 wherever possible. */
2582 nelts_per_vq = factor & -factor;
2583 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2584 gcc_assert (IN_RANGE (shift, 1, 4));
2585 char suffix = "dwhb"[shift - 1];
2586
2587 factor >>= shift;
2588 unsigned int written;
2589 if (factor == 1)
2590 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2591 prefix, suffix, operands);
2592 else
2593 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2594 prefix, suffix, operands, factor);
2595 gcc_assert (written < sizeof (buffer));
2596 return buffer;
2597 }
2598
2599 /* Return the asm string for an instruction with a CNT-like vector size
2600 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2601 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2602 first part of the operands template (the part that comes before the
2603 vector size itself). X is the value of the vector size operand,
2604 as a polynomial integer rtx. */
2605
2606 char *
2607 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2608 rtx x)
2609 {
2610 poly_int64 value = rtx_to_poly_int64 (x);
2611 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2612 return aarch64_output_sve_cnt_immediate (prefix, operands,
2613 value.coeffs[1], 0);
2614 }
2615
2616 /* Return true if we can add VALUE to a register using a single ADDVL
2617 or ADDPL instruction. */
2618
2619 static bool
2620 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2621 {
2622 HOST_WIDE_INT factor = value.coeffs[0];
2623 if (factor == 0 || value.coeffs[1] != factor)
2624 return false;
2625 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2626 and a value of 16 is one vector width. */
2627 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2628 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2629 }
2630
2631 /* Likewise for rtx X. */
2632
2633 bool
2634 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2635 {
2636 poly_int64 value;
2637 return (poly_int_rtx_p (x, &value)
2638 && aarch64_sve_addvl_addpl_immediate_p (value));
2639 }
2640
2641 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2642 and storing the result in operand 0. */
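/* Illustrative examples (not from the original sources): an offset of
(16, 16) -- one full vector -- is printed as "addvl %x0, %x1, #1", or as
a plain INCB when the destination and base are the same GP register;
when they differ, an offset of (6, 6) -- three predicate widths -- is
printed as "addpl %x0, %x1, #3".  */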
2643
2644 char *
2645 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2646 {
2647 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2648 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2649 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2650
2651 /* Use INC or DEC if possible. */
2652 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2653 {
2654 if (aarch64_sve_cnt_immediate_p (offset_value))
2655 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2656 offset_value.coeffs[1], 0);
2657 if (aarch64_sve_cnt_immediate_p (-offset_value))
2658 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2659 -offset_value.coeffs[1], 0);
2660 }
2661
2662 int factor = offset_value.coeffs[1];
2663 if ((factor & 15) == 0)
2664 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2665 else
2666 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2667 return buffer;
2668 }
2669
2670 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2671 instruction. If it is, store the number of elements in each vector
2672 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2673 factor in *FACTOR_OUT (if nonnull). */
2674
2675 bool
2676 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2677 unsigned int *nelts_per_vq_out)
2678 {
2679 rtx elt;
2680 poly_int64 value;
2681
2682 if (!const_vec_duplicate_p (x, &elt)
2683 || !poly_int_rtx_p (elt, &value))
2684 return false;
2685
2686 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2687 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2688 /* There's no vector INCB. */
2689 return false;
2690
2691 HOST_WIDE_INT factor = value.coeffs[0];
2692 if (value.coeffs[1] != factor)
2693 return false;
2694
2695 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2696 if ((factor % nelts_per_vq) != 0
2697 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2698 return false;
2699
2700 if (factor_out)
2701 *factor_out = factor;
2702 if (nelts_per_vq_out)
2703 *nelts_per_vq_out = nelts_per_vq;
2704 return true;
2705 }
2706
2707 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2708 instruction. */
2709
2710 bool
2711 aarch64_sve_inc_dec_immediate_p (rtx x)
2712 {
2713 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2714 }
2715
2716 /* Return the asm template for an SVE vector INC or DEC instruction.
2717 OPERANDS gives the operands before the vector count and X is the
2718 value of the vector count operand itself. */
2719
2720 char *
2721 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2722 {
2723 int factor;
2724 unsigned int nelts_per_vq;
2725 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2726 gcc_unreachable ();
2727 if (factor < 0)
2728 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2729 nelts_per_vq);
2730 else
2731 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2732 nelts_per_vq);
2733 }
2734
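/* Set DEST to the integer constant IMM of mode MODE and return the number
of instructions in the resulting sequence.  If GENERATE is false, do not
emit anything and only return the instruction count (used for costing,
e.g. by aarch64_mov128_immediate below).

Illustrative examples (values chosen for exposition): 0x12345678 needs two
instructions (a MOV of the low 16 bits plus one MOVK), while the 64-bit
repeating constant 0x1234567812345678 is neither a MOV/MOVN nor a bitmask
immediate, and no single 16-bit patch of it is, so it falls through to the
final loop and needs four instructions (a MOV plus three MOVKs).  */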
2735 static int
2736 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2737 scalar_int_mode mode)
2738 {
2739 int i;
2740 unsigned HOST_WIDE_INT val, val2, mask;
2741 int one_match, zero_match;
2742 int num_insns;
2743
2744 val = INTVAL (imm);
2745
2746 if (aarch64_move_imm (val, mode))
2747 {
2748 if (generate)
2749 emit_insn (gen_rtx_SET (dest, imm));
2750 return 1;
2751 }
2752
2753 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2754 (with XXXX non-zero). In that case check to see if the move can be done in
2755 a smaller mode. */
2756 val2 = val & 0xffffffff;
2757 if (mode == DImode
2758 && aarch64_move_imm (val2, SImode)
2759 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2760 {
2761 if (generate)
2762 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2763
2764 /* Check if we have to emit a second instruction by checking to see
2765 if any of the upper 32 bits of the original DI mode value is set. */
2766 if (val == val2)
2767 return 1;
2768
2769 i = (val >> 48) ? 48 : 32;
2770
2771 if (generate)
2772 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2773 GEN_INT ((val >> i) & 0xffff)));
2774
2775 return 2;
2776 }
2777
2778 if ((val >> 32) == 0 || mode == SImode)
2779 {
2780 if (generate)
2781 {
2782 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2783 if (mode == SImode)
2784 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2785 GEN_INT ((val >> 16) & 0xffff)));
2786 else
2787 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2788 GEN_INT ((val >> 16) & 0xffff)));
2789 }
2790 return 2;
2791 }
2792
2793 /* Remaining cases are all for DImode. */
2794
2795 mask = 0xffff;
2796 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2797 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2798 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2799 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2800
2801 if (zero_match != 2 && one_match != 2)
2802 {
2803 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2804 For a 64-bit bitmask try whether changing 16 bits to all ones or
2805 zeroes creates a valid bitmask. To check any repeated bitmask,
2806 try using 16 bits from the other 32-bit half of val. */
2807
2808 for (i = 0; i < 64; i += 16, mask <<= 16)
2809 {
2810 val2 = val & ~mask;
2811 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2812 break;
2813 val2 = val | mask;
2814 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2815 break;
2816 val2 = val2 & ~mask;
2817 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2818 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2819 break;
2820 }
2821 if (i != 64)
2822 {
2823 if (generate)
2824 {
2825 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2826 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2827 GEN_INT ((val >> i) & 0xffff)));
2828 }
2829 return 2;
2830 }
2831 }
2832
2833 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2834 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2835 otherwise skip zero bits. */
2836
2837 num_insns = 1;
2838 mask = 0xffff;
2839 val2 = one_match > zero_match ? ~val : val;
2840 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2841
2842 if (generate)
2843 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2844 ? (val | ~(mask << i))
2845 : (val & (mask << i)))));
2846 for (i += 16; i < 64; i += 16)
2847 {
2848 if ((val2 & (mask << i)) == 0)
2849 continue;
2850 if (generate)
2851 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2852 GEN_INT ((val >> i) & 0xffff)));
2853 num_insns ++;
2854 }
2855
2856 return num_insns;
2857 }
2858
2859 /* Return whether imm is a 128-bit immediate which is simple enough to
2860 expand inline. */
2861 bool
2862 aarch64_mov128_immediate (rtx imm)
2863 {
2864 if (GET_CODE (imm) == CONST_INT)
2865 return true;
2866
2867 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2868
2869 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2870 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2871
2872 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2873 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2874 }
2875
2876
2877 /* Return the number of temporary registers that aarch64_add_offset_1
2878 would need to add OFFSET to a register. */
2879
2880 static unsigned int
2881 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2882 {
2883 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2884 }
2885
2886 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2887 a non-polynomial OFFSET. MODE is the mode of the addition.
2888 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2889 be set and CFA adjustments added to the generated instructions.
2890
2891 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2892 temporary if register allocation is already complete. This temporary
2893 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2894 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2895 the immediate again.
2896
2897 Since this function may be used to adjust the stack pointer, we must
2898 ensure that it cannot cause transient stack deallocation (for example
2899 by first incrementing SP and then decrementing when adjusting by a
2900 large immediate). */
2901
2902 static void
2903 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2904 rtx src, HOST_WIDE_INT offset, rtx temp1,
2905 bool frame_related_p, bool emit_move_imm)
2906 {
2907 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2908 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2909
2910 HOST_WIDE_INT moffset = abs_hwi (offset);
2911 rtx_insn *insn;
2912
2913 if (!moffset)
2914 {
2915 if (!rtx_equal_p (dest, src))
2916 {
2917 insn = emit_insn (gen_rtx_SET (dest, src));
2918 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2919 }
2920 return;
2921 }
2922
2923 /* Single instruction adjustment. */
2924 if (aarch64_uimm12_shift (moffset))
2925 {
2926 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2927 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2928 return;
2929 }
2930
2931 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2932 and either:
2933
2934 a) the offset cannot be loaded by a 16-bit move or
2935 b) there is no spare register into which we can move it. */
2936 if (moffset < 0x1000000
2937 && ((!temp1 && !can_create_pseudo_p ())
2938 || !aarch64_move_imm (moffset, mode)))
2939 {
2940 HOST_WIDE_INT low_off = moffset & 0xfff;
2941
2942 low_off = offset < 0 ? -low_off : low_off;
2943 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2944 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2945 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2946 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2947 return;
2948 }
2949
2950 /* Emit a move immediate if required and an addition/subtraction. */
2951 if (emit_move_imm)
2952 {
2953 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2954 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2955 }
2956 insn = emit_insn (offset < 0
2957 ? gen_sub3_insn (dest, src, temp1)
2958 : gen_add3_insn (dest, src, temp1));
2959 if (frame_related_p)
2960 {
2961 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2962 rtx adj = plus_constant (mode, src, offset);
2963 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2964 }
2965 }
2966
2967 /* Return the number of temporary registers that aarch64_add_offset
2968 would need to move OFFSET into a register or add OFFSET to a register;
2969 ADD_P is true if we want the latter rather than the former. */
2970
2971 static unsigned int
2972 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2973 {
2974 /* This follows the same structure as aarch64_add_offset. */
2975 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2976 return 0;
2977
2978 unsigned int count = 0;
2979 HOST_WIDE_INT factor = offset.coeffs[1];
2980 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2981 poly_int64 poly_offset (factor, factor);
2982 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2983 /* Need one register for the ADDVL/ADDPL result. */
2984 count += 1;
2985 else if (factor != 0)
2986 {
2987 factor = abs (factor);
2988 if (factor > 16 * (factor & -factor))
2989 /* Need one register for the CNT result and one for the multiplication
2990 factor. If necessary, the second temporary can be reused for the
2991 constant part of the offset. */
2992 return 2;
2993 /* Need one register for the CNT result (which might then
2994 be shifted). */
2995 count += 1;
2996 }
2997 return count + aarch64_add_offset_1_temporaries (constant);
2998 }
2999
3000 /* If X can be represented as a poly_int64, return the number
3001 of temporaries that are required to add it to a register.
3002 Return -1 otherwise. */
3003
3004 int
3005 aarch64_add_offset_temporaries (rtx x)
3006 {
3007 poly_int64 offset;
3008 if (!poly_int_rtx_p (x, &offset))
3009 return -1;
3010 return aarch64_offset_temporaries (true, offset);
3011 }
3012
3013 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3014 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3015 be set and CFA adjustments added to the generated instructions.
3016
3017 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3018 temporary if register allocation is already complete. This temporary
3019 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3020 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3021 false to avoid emitting the immediate again.
3022
3023 TEMP2, if nonnull, is a second temporary register that doesn't
3024 overlap either DEST or SRC.
3025
3026 Since this function may be used to adjust the stack pointer, we must
3027 ensure that it cannot cause transient stack deallocation (for example
3028 by first incrementing SP and then decrementing when adjusting by a
3029 large immediate). */
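/* Illustrative example (not from the original sources): adding the
poly_int64 (48, 16) -- one full vector plus 32 bytes -- is split below
into a VG-based part (16, 16), handled by a single ADDVL #1, and a
constant part of 32, handled by aarch64_add_offset_1, giving a typical
two-instruction sequence.  */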
3030
3031 static void
3032 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3033 poly_int64 offset, rtx temp1, rtx temp2,
3034 bool frame_related_p, bool emit_move_imm = true)
3035 {
3036 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3037 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3038 gcc_assert (temp1 == NULL_RTX
3039 || !frame_related_p
3040 || !reg_overlap_mentioned_p (temp1, dest));
3041 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3042
3043 /* Try using ADDVL or ADDPL to add the whole value. */
3044 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3045 {
3046 rtx offset_rtx = gen_int_mode (offset, mode);
3047 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3048 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3049 return;
3050 }
3051
3052 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3053 SVE vector register, over and above the minimum size of 128 bits.
3054 This is equivalent to half the value returned by CNTD with a
3055 vector shape of ALL. */
3056 HOST_WIDE_INT factor = offset.coeffs[1];
3057 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3058
3059 /* Try using ADDVL or ADDPL to add the VG-based part. */
3060 poly_int64 poly_offset (factor, factor);
3061 if (src != const0_rtx
3062 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3063 {
3064 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3065 if (frame_related_p)
3066 {
3067 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3068 RTX_FRAME_RELATED_P (insn) = true;
3069 src = dest;
3070 }
3071 else
3072 {
3073 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3074 src = aarch64_force_temporary (mode, temp1, addr);
3075 temp1 = temp2;
3076 temp2 = NULL_RTX;
3077 }
3078 }
3079 /* Otherwise use a CNT-based sequence. */
3080 else if (factor != 0)
3081 {
3082 /* Use a subtraction if we have a negative factor. */
3083 rtx_code code = PLUS;
3084 if (factor < 0)
3085 {
3086 factor = -factor;
3087 code = MINUS;
3088 }
3089
3090 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3091 into the multiplication. */
3092 rtx val;
3093 int shift = 0;
3094 if (factor & 1)
3095 /* Use a right shift by 1. */
3096 shift = -1;
3097 else
3098 factor /= 2;
3099 HOST_WIDE_INT low_bit = factor & -factor;
3100 if (factor <= 16 * low_bit)
3101 {
3102 if (factor > 16 * 8)
3103 {
3104 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3105 the value with the minimum multiplier and shift it into
3106 position. */
3107 int extra_shift = exact_log2 (low_bit);
3108 shift += extra_shift;
3109 factor >>= extra_shift;
3110 }
3111 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3112 }
3113 else
3114 {
3115 /* Use CNTD, then multiply it by FACTOR. */
3116 val = gen_int_mode (poly_int64 (2, 2), mode);
3117 val = aarch64_force_temporary (mode, temp1, val);
3118
3119 /* Go back to using a negative multiplication factor if we have
3120 no register from which to subtract. */
3121 if (code == MINUS && src == const0_rtx)
3122 {
3123 factor = -factor;
3124 code = PLUS;
3125 }
3126 rtx coeff1 = gen_int_mode (factor, mode);
3127 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3128 val = gen_rtx_MULT (mode, val, coeff1);
3129 }
3130
3131 if (shift > 0)
3132 {
3133 /* Multiply by 1 << SHIFT. */
3134 val = aarch64_force_temporary (mode, temp1, val);
3135 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3136 }
3137 else if (shift == -1)
3138 {
3139 /* Divide by 2. */
3140 val = aarch64_force_temporary (mode, temp1, val);
3141 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3142 }
3143
3144 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3145 if (src != const0_rtx)
3146 {
3147 val = aarch64_force_temporary (mode, temp1, val);
3148 val = gen_rtx_fmt_ee (code, mode, src, val);
3149 }
3150 else if (code == MINUS)
3151 {
3152 val = aarch64_force_temporary (mode, temp1, val);
3153 val = gen_rtx_NEG (mode, val);
3154 }
3155
3156 if (constant == 0 || frame_related_p)
3157 {
3158 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3159 if (frame_related_p)
3160 {
3161 RTX_FRAME_RELATED_P (insn) = true;
3162 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3163 gen_rtx_SET (dest, plus_constant (Pmode, src,
3164 poly_offset)));
3165 }
3166 src = dest;
3167 if (constant == 0)
3168 return;
3169 }
3170 else
3171 {
3172 src = aarch64_force_temporary (mode, temp1, val);
3173 temp1 = temp2;
3174 temp2 = NULL_RTX;
3175 }
3176
3177 emit_move_imm = true;
3178 }
3179
3180 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3181 frame_related_p, emit_move_imm);
3182 }
3183
3184 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3185 than a poly_int64. */
3186
3187 void
3188 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3189 rtx offset_rtx, rtx temp1, rtx temp2)
3190 {
3191 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3192 temp1, temp2, false);
3193 }
3194
3195 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3196 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3197 if TEMP1 already contains abs (DELTA). */
3198
3199 static inline void
3200 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3201 {
3202 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3203 temp1, temp2, true, emit_move_imm);
3204 }
3205
3206 /* Subtract DELTA from the stack pointer, marking the instructions
3207 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3208 if nonnull. */
3209
3210 static inline void
3211 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3212 bool emit_move_imm = true)
3213 {
3214 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3215 temp1, temp2, frame_related_p, emit_move_imm);
3216 }
3217
3218 /* Set DEST to (vec_series BASE STEP). */
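/* For example (illustrative): a VNx4SI series with BASE 0 and STEP 1
expands to the SVE instruction "index z<n>.s, #0, #1", since both values
fit in the immediate range [-16, 15] checked below.  */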
3219
3220 static void
3221 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3222 {
3223 machine_mode mode = GET_MODE (dest);
3224 scalar_mode inner = GET_MODE_INNER (mode);
3225
3226 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3227 if (!aarch64_sve_index_immediate_p (base))
3228 base = force_reg (inner, base);
3229 if (!aarch64_sve_index_immediate_p (step))
3230 step = force_reg (inner, step);
3231
3232 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3233 }
3234
3235 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3236 integer of mode SRC_MODE. Return true on success. */
3237
3238 static bool
3239 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3240 rtx src)
3241 {
3242 /* If the constant is smaller than 128 bits, we can do the move
3243 using a vector of SRC_MODEs. */
3244 if (src_mode != TImode)
3245 {
3246 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3247 GET_MODE_SIZE (src_mode));
3248 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3249 emit_move_insn (gen_lowpart (dup_mode, dest),
3250 gen_const_vec_duplicate (dup_mode, src));
3251 return true;
3252 }
3253
3254 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3255 src = force_const_mem (src_mode, src);
3256 if (!src)
3257 return false;
3258
3259 /* Make sure that the address is legitimate. */
3260 if (!aarch64_sve_ld1r_operand_p (src))
3261 {
3262 rtx addr = force_reg (Pmode, XEXP (src, 0));
3263 src = replace_equiv_address (src, addr);
3264 }
3265
3266 machine_mode mode = GET_MODE (dest);
3267 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3268 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3269 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3270 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3271 emit_insn (gen_rtx_SET (dest, src));
3272 return true;
3273 }
3274
3275 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3276 isn't a simple duplicate or series. */
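/* Illustrative examples (not from the original sources): on little-endian
targets the repeating VNx4SI constant { 1, 2, 1, 2, ... } has npatterns == 2
and nelts_per_pattern == 1, so it is handled by duplicating the 64-bit
value 0x0000000200000001; a constant such as { 0, 1, 0, 2, 0, 3, ... }
instead expands each pattern separately (a duplicate of zero and an INDEX)
and interleaves the results with ZIP1.  */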
3277
3278 static void
3279 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3280 {
3281 machine_mode mode = GET_MODE (src);
3282 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3283 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3284 gcc_assert (npatterns > 1);
3285
3286 if (nelts_per_pattern == 1)
3287 {
3288 /* The constant is a repeating sequence of at least two elements,
3289 where the repeating elements occupy no more than 128 bits.
3290 Get an integer representation of the replicated value. */
3291 scalar_int_mode int_mode;
3292 if (BYTES_BIG_ENDIAN)
3293 /* For now, always use LD1RQ to load the value on big-endian
3294 targets, since the handling of smaller integers includes a
3295 subreg that is semantically an element reverse. */
3296 int_mode = TImode;
3297 else
3298 {
3299 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3300 gcc_assert (int_bits <= 128);
3301 int_mode = int_mode_for_size (int_bits, 0).require ();
3302 }
3303 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3304 if (int_value
3305 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3306 return;
3307 }
3308
3309 /* Expand each pattern individually. */
3310 rtx_vector_builder builder;
3311 auto_vec<rtx, 16> vectors (npatterns);
3312 for (unsigned int i = 0; i < npatterns; ++i)
3313 {
3314 builder.new_vector (mode, 1, nelts_per_pattern);
3315 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3316 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3317 vectors.quick_push (force_reg (mode, builder.build ()));
3318 }
3319
3320 /* Use permutes to interleave the separate vectors. */
3321 while (npatterns > 1)
3322 {
3323 npatterns /= 2;
3324 for (unsigned int i = 0; i < npatterns; ++i)
3325 {
3326 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3327 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3328 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3329 vectors[i] = tmp;
3330 }
3331 }
3332 gcc_assert (vectors[0] == dest);
3333 }
3334
3335 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3336 is a pattern that can be used to set DEST to a replicated scalar
3337 element. */
3338
3339 void
3340 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3341 rtx (*gen_vec_duplicate) (rtx, rtx))
3342 {
3343 machine_mode mode = GET_MODE (dest);
3344
3345 /* Check on what type of symbol it is. */
3346 scalar_int_mode int_mode;
3347 if ((GET_CODE (imm) == SYMBOL_REF
3348 || GET_CODE (imm) == LABEL_REF
3349 || GET_CODE (imm) == CONST
3350 || GET_CODE (imm) == CONST_POLY_INT)
3351 && is_a <scalar_int_mode> (mode, &int_mode))
3352 {
3353 rtx mem;
3354 poly_int64 offset;
3355 HOST_WIDE_INT const_offset;
3356 enum aarch64_symbol_type sty;
3357
3358 /* If we have (const (plus symbol offset)), separate out the offset
3359 before we start classifying the symbol. */
3360 rtx base = strip_offset (imm, &offset);
3361
3362 /* We must always add an offset involving VL separately, rather than
3363 folding it into the relocation. */
3364 if (!offset.is_constant (&const_offset))
3365 {
3366 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3367 emit_insn (gen_rtx_SET (dest, imm));
3368 else
3369 {
3370 /* Do arithmetic on 32-bit values if the result is smaller
3371 than that. */
3372 if (partial_subreg_p (int_mode, SImode))
3373 {
3374 /* It is invalid to do symbol calculations in modes
3375 narrower than SImode. */
3376 gcc_assert (base == const0_rtx);
3377 dest = gen_lowpart (SImode, dest);
3378 int_mode = SImode;
3379 }
3380 if (base != const0_rtx)
3381 {
3382 base = aarch64_force_temporary (int_mode, dest, base);
3383 aarch64_add_offset (int_mode, dest, base, offset,
3384 NULL_RTX, NULL_RTX, false);
3385 }
3386 else
3387 aarch64_add_offset (int_mode, dest, base, offset,
3388 dest, NULL_RTX, false);
3389 }
3390 return;
3391 }
3392
3393 sty = aarch64_classify_symbol (base, const_offset);
3394 switch (sty)
3395 {
3396 case SYMBOL_FORCE_TO_MEM:
3397 if (const_offset != 0
3398 && targetm.cannot_force_const_mem (int_mode, imm))
3399 {
3400 gcc_assert (can_create_pseudo_p ());
3401 base = aarch64_force_temporary (int_mode, dest, base);
3402 aarch64_add_offset (int_mode, dest, base, const_offset,
3403 NULL_RTX, NULL_RTX, false);
3404 return;
3405 }
3406
3407 mem = force_const_mem (ptr_mode, imm);
3408 gcc_assert (mem);
3409
3410 /* If we aren't generating PC relative literals, then
3411 we need to expand the literal pool access carefully.
3412 This is something that needs to be done in a number
3413 of places, so could well live as a separate function. */
3414 if (!aarch64_pcrelative_literal_loads)
3415 {
3416 gcc_assert (can_create_pseudo_p ());
3417 base = gen_reg_rtx (ptr_mode);
3418 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3419 if (ptr_mode != Pmode)
3420 base = convert_memory_address (Pmode, base);
3421 mem = gen_rtx_MEM (ptr_mode, base);
3422 }
3423
3424 if (int_mode != ptr_mode)
3425 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3426
3427 emit_insn (gen_rtx_SET (dest, mem));
3428
3429 return;
3430
3431 case SYMBOL_SMALL_TLSGD:
3432 case SYMBOL_SMALL_TLSDESC:
3433 case SYMBOL_SMALL_TLSIE:
3434 case SYMBOL_SMALL_GOT_28K:
3435 case SYMBOL_SMALL_GOT_4G:
3436 case SYMBOL_TINY_GOT:
3437 case SYMBOL_TINY_TLSIE:
3438 if (const_offset != 0)
3439 {
3440 gcc_assert (can_create_pseudo_p ());
3441 base = aarch64_force_temporary (int_mode, dest, base);
3442 aarch64_add_offset (int_mode, dest, base, const_offset,
3443 NULL_RTX, NULL_RTX, false);
3444 return;
3445 }
3446 /* FALLTHRU */
3447
3448 case SYMBOL_SMALL_ABSOLUTE:
3449 case SYMBOL_TINY_ABSOLUTE:
3450 case SYMBOL_TLSLE12:
3451 case SYMBOL_TLSLE24:
3452 case SYMBOL_TLSLE32:
3453 case SYMBOL_TLSLE48:
3454 aarch64_load_symref_appropriately (dest, imm, sty);
3455 return;
3456
3457 default:
3458 gcc_unreachable ();
3459 }
3460 }
3461
3462 if (!CONST_INT_P (imm))
3463 {
3464 rtx base, step, value;
3465 if (GET_CODE (imm) == HIGH
3466 || aarch64_simd_valid_immediate (imm, NULL))
3467 emit_insn (gen_rtx_SET (dest, imm));
3468 else if (const_vec_series_p (imm, &base, &step))
3469 aarch64_expand_vec_series (dest, base, step);
3470 else if (const_vec_duplicate_p (imm, &value))
3471 {
3472 /* If the constant is out of range of an SVE vector move,
3473 load it from memory if we can, otherwise move it into
3474 a register and use a DUP. */
3475 scalar_mode inner_mode = GET_MODE_INNER (mode);
3476 rtx op = force_const_mem (inner_mode, value);
3477 if (!op)
3478 op = force_reg (inner_mode, value);
3479 else if (!aarch64_sve_ld1r_operand_p (op))
3480 {
3481 rtx addr = force_reg (Pmode, XEXP (op, 0));
3482 op = replace_equiv_address (op, addr);
3483 }
3484 emit_insn (gen_vec_duplicate (dest, op));
3485 }
3486 else if (GET_CODE (imm) == CONST_VECTOR
3487 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3488 aarch64_expand_sve_const_vector (dest, imm);
3489 else
3490 {
3491 rtx mem = force_const_mem (mode, imm);
3492 gcc_assert (mem);
3493 emit_move_insn (dest, mem);
3494 }
3495
3496 return;
3497 }
3498
3499 aarch64_internal_mov_immediate (dest, imm, true,
3500 as_a <scalar_int_mode> (mode));
3501 }
3502
3503 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3504 that is known to contain PTRUE. */
3505
3506 void
3507 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3508 {
3509 expand_operand ops[3];
3510 machine_mode mode = GET_MODE (dest);
3511 create_output_operand (&ops[0], dest, mode);
3512 create_input_operand (&ops[1], pred, GET_MODE (pred));
3513 create_input_operand (&ops[2], src, mode);
3514 temporary_volatile_ok v (true);
3515 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3516 }
3517
3518 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3519 operand is in memory. In this case we need to use the predicated LD1
3520 and ST1 instead of LDR and STR, both for correctness on big-endian
3521 targets and because LD1 and ST1 support a wider range of addressing modes.
3522 PRED_MODE is the mode of the predicate.
3523
3524 See the comment at the head of aarch64-sve.md for details about the
3525 big-endian handling. */
3526
3527 void
3528 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3529 {
3530 machine_mode mode = GET_MODE (dest);
3531 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3532 if (!register_operand (src, mode)
3533 && !register_operand (dest, mode))
3534 {
3535 rtx tmp = gen_reg_rtx (mode);
3536 if (MEM_P (src))
3537 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3538 else
3539 emit_move_insn (tmp, src);
3540 src = tmp;
3541 }
3542 aarch64_emit_sve_pred_move (dest, ptrue, src);
3543 }
3544
3545 /* Called only on big-endian targets. See whether an SVE vector move
3546 from SRC to DEST is effectively a REV[BHW] instruction, because at
3547 least one operand is a subreg of an SVE vector that has wider or
3548 narrower elements. Return true and emit the instruction if so.
3549
3550 For example:
3551
3552 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3553
3554 represents a VIEW_CONVERT between the following vectors, viewed
3555 in memory order:
3556
3557 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3558 R1: { [0], [1], [2], [3], ... }
3559
3560 The high part of lane X in R2 should therefore correspond to lane X*2
3561 of R1, but the register representations are:
3562
3563 msb lsb
3564 R2: ...... [1].high [1].low [0].high [0].low
3565 R1: ...... [3] [2] [1] [0]
3566
3567 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3568 We therefore need a reverse operation to swap the high and low values
3569 around.
3570
3571 This is purely an optimization. Without it we would spill the
3572 subreg operand to the stack in one mode and reload it in the
3573 other mode, which has the same effect as the REV. */
3574
3575 bool
3576 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3577 {
3578 gcc_assert (BYTES_BIG_ENDIAN);
3579 if (GET_CODE (dest) == SUBREG)
3580 dest = SUBREG_REG (dest);
3581 if (GET_CODE (src) == SUBREG)
3582 src = SUBREG_REG (src);
3583
3584 /* The optimization handles two single SVE REGs with different element
3585 sizes. */
3586 if (!REG_P (dest)
3587 || !REG_P (src)
3588 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3589 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3590 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3591 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3592 return false;
3593
3594 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3595 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
3596 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3597 UNSPEC_REV_SUBREG);
3598 emit_insn (gen_rtx_SET (dest, unspec));
3599 return true;
3600 }
3601
3602 /* Return a copy of X with mode MODE, without changing its other
3603 attributes. Unlike gen_lowpart, this doesn't care whether the
3604 mode change is valid. */
3605
3606 static rtx
3607 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3608 {
3609 if (GET_MODE (x) == mode)
3610 return x;
3611
3612 x = shallow_copy_rtx (x);
3613 set_mode_and_regno (x, mode, REGNO (x));
3614 return x;
3615 }
3616
3617 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3618 operands. */
3619
3620 void
3621 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3622 {
3623 /* Decide which REV operation we need. The mode with narrower elements
3624 determines the mode of the operands and the mode with the wider
3625 elements determines the reverse width. */
3626 machine_mode mode_with_wider_elts = GET_MODE (dest);
3627 machine_mode mode_with_narrower_elts = GET_MODE (src);
3628 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3629 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3630 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3631
3632 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3633 unsigned int unspec;
3634 if (wider_bytes == 8)
3635 unspec = UNSPEC_REV64;
3636 else if (wider_bytes == 4)
3637 unspec = UNSPEC_REV32;
3638 else if (wider_bytes == 2)
3639 unspec = UNSPEC_REV16;
3640 else
3641 gcc_unreachable ();
3642 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3643
3644 /* Emit:
3645
3646 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3647 UNSPEC_MERGE_PTRUE))
3648
3649 with the appropriate modes. */
3650 ptrue = gen_lowpart (pred_mode, ptrue);
3651 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3652 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3653 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3654 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3655 UNSPEC_MERGE_PTRUE);
3656 emit_insn (gen_rtx_SET (dest, src));
3657 }
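
/* For example (illustrative only): splitting a big-endian bitcast between
   VNx16QI and VNx8HI reduces to a byte reverse within each 16-bit element,
   something like

	revb	z0.h, p0/m, z1.h

   with the analogous REVH/REVW forms used for the UNSPEC_REV32 and
   UNSPEC_REV64 cases selected above.  */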
3658
3659 static bool
3660 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3661 tree exp ATTRIBUTE_UNUSED)
3662 {
3663 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3664 return false;
3665
3666 return true;
3667 }
3668
3669 /* Implement TARGET_PASS_BY_REFERENCE. */
3670
3671 static bool
3672 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3673 machine_mode mode,
3674 const_tree type,
3675 bool named ATTRIBUTE_UNUSED)
3676 {
3677 HOST_WIDE_INT size;
3678 machine_mode dummymode;
3679 int nregs;
3680
3681 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3682 if (mode == BLKmode && type)
3683 size = int_size_in_bytes (type);
3684 else
3685 /* No frontends can create types with variable-sized modes, so we
3686 shouldn't be asked to pass or return them. */
3687 size = GET_MODE_SIZE (mode).to_constant ();
3688
3689 /* Aggregates are passed by reference based on their size. */
3690 if (type && AGGREGATE_TYPE_P (type))
3691 {
3692 size = int_size_in_bytes (type);
3693 }
3694
3695 /* Variable-sized arguments are always passed by reference. */
3696 if (size < 0)
3697 return true;
3698
3699 /* Can this be a candidate to be passed in fp/simd register(s)? */
3700 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3701 &dummymode, &nregs,
3702 NULL))
3703 return false;
3704
3705 /* Arguments which are variable sized or larger than 2 registers are
3706 passed by reference unless they are a homogeneous floating-point
3707 aggregate. */
3708 return size > 2 * UNITS_PER_WORD;
3709 }
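
/* To make the size rule concrete (illustrative, not exhaustive): a plain
   32-byte structure such as

	struct big { long long a, b, c, d; };

   exceeds 2 * UNITS_PER_WORD and is passed by reference, whereas a 32-byte
   homogeneous floating-point aggregate such as

	struct hfa { double a, b, c, d; };

   is a candidate for the FP/SIMD registers and is passed by value.  */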
3710
3711 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3712 static bool
3713 aarch64_return_in_msb (const_tree valtype)
3714 {
3715 machine_mode dummy_mode;
3716 int dummy_int;
3717
3718 /* Never happens in little-endian mode. */
3719 if (!BYTES_BIG_ENDIAN)
3720 return false;
3721
3722 /* Only composite types smaller than or equal to 16 bytes can
3723 be potentially returned in registers. */
3724 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3725 || int_size_in_bytes (valtype) <= 0
3726 || int_size_in_bytes (valtype) > 16)
3727 return false;
3728
3729 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3730 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3731 is always passed/returned in the least significant bits of fp/simd
3732 register(s). */
3733 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3734 &dummy_mode, &dummy_int, NULL))
3735 return false;
3736
3737 return true;
3738 }
3739
3740 /* Implement TARGET_FUNCTION_VALUE.
3741 Define how to find the value returned by a function. */
3742
3743 static rtx
3744 aarch64_function_value (const_tree type, const_tree func,
3745 bool outgoing ATTRIBUTE_UNUSED)
3746 {
3747 machine_mode mode;
3748 int unsignedp;
3749 int count;
3750 machine_mode ag_mode;
3751
3752 mode = TYPE_MODE (type);
3753 if (INTEGRAL_TYPE_P (type))
3754 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3755
3756 if (aarch64_return_in_msb (type))
3757 {
3758 HOST_WIDE_INT size = int_size_in_bytes (type);
3759
3760 if (size % UNITS_PER_WORD != 0)
3761 {
3762 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3763 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3764 }
3765 }
3766
3767 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3768 &ag_mode, &count, NULL))
3769 {
3770 if (!aarch64_composite_type_p (type, mode))
3771 {
3772 gcc_assert (count == 1 && mode == ag_mode);
3773 return gen_rtx_REG (mode, V0_REGNUM);
3774 }
3775 else
3776 {
3777 int i;
3778 rtx par;
3779
3780 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3781 for (i = 0; i < count; i++)
3782 {
3783 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3784 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3785 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3786 XVECEXP (par, 0, i) = tmp;
3787 }
3788 return par;
3789 }
3790 }
3791 else
3792 return gen_rtx_REG (mode, R0_REGNUM);
3793 }
3794
3795 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3796 Return true if REGNO is the number of a hard register in which the values
3797 of called function may come back. */
3798
3799 static bool
3800 aarch64_function_value_regno_p (const unsigned int regno)
3801 {
3802 /* A maximum of 16 bytes can be returned in the general registers. Examples
3803 of 16-byte return values are: 128-bit integers and 16-byte small
3804 structures (excluding homogeneous floating-point aggregates). */
3805 if (regno == R0_REGNUM || regno == R1_REGNUM)
3806 return true;
3807
3808 /* Up to four fp/simd registers can return a function value, e.g. a
3809 homogeneous floating-point aggregate having four members. */
3810 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3811 return TARGET_FLOAT;
3812
3813 return false;
3814 }
3815
3816 /* Implement TARGET_RETURN_IN_MEMORY.
3817
3818 If the type T of the result of a function is such that
3819 void func (T arg)
3820 would require that arg be passed as a value in a register (or set of
3821 registers) according to the parameter passing rules, then the result
3822 is returned in the same registers as would be used for such an
3823 argument. */
3824
3825 static bool
3826 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3827 {
3828 HOST_WIDE_INT size;
3829 machine_mode ag_mode;
3830 int count;
3831
3832 if (!AGGREGATE_TYPE_P (type)
3833 && TREE_CODE (type) != COMPLEX_TYPE
3834 && TREE_CODE (type) != VECTOR_TYPE)
3835 /* Simple scalar types are always returned in registers. */
3836 return false;
3837
3838 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3839 type,
3840 &ag_mode,
3841 &count,
3842 NULL))
3843 return false;
3844
3845 /* Types larger than 2 registers are returned in memory. */
3846 size = int_size_in_bytes (type);
3847 return (size < 0 || size > 2 * UNITS_PER_WORD);
3848 }
3849
3850 static bool
3851 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3852 const_tree type, int *nregs)
3853 {
3854 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3855 return aarch64_vfp_is_call_or_return_candidate (mode,
3856 type,
3857 &pcum->aapcs_vfp_rmode,
3858 nregs,
3859 NULL);
3860 }
3861
3862 /* Given MODE and TYPE of a function argument, return the alignment in
3863 bits. The idea is to suppress any stronger alignment requested by
3864 the user and opt for the natural alignment (specified in AAPCS64 \S
3865 4.1). ABI_BREAK is set to true if the alignment was incorrectly
3866 calculated in versions of GCC prior to GCC-9. This is a helper
3867 function for local use only. */
3868
3869 static unsigned int
3870 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
3871 bool *abi_break)
3872 {
3873 *abi_break = false;
3874 if (!type)
3875 return GET_MODE_ALIGNMENT (mode);
3876
3877 if (integer_zerop (TYPE_SIZE (type)))
3878 return 0;
3879
3880 gcc_assert (TYPE_MODE (type) == mode);
3881
3882 if (!AGGREGATE_TYPE_P (type))
3883 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3884
3885 if (TREE_CODE (type) == ARRAY_TYPE)
3886 return TYPE_ALIGN (TREE_TYPE (type));
3887
3888 unsigned int alignment = 0;
3889 unsigned int bitfield_alignment = 0;
3890 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3891 if (TREE_CODE (field) == FIELD_DECL)
3892 {
3893 alignment = std::max (alignment, DECL_ALIGN (field));
3894 if (DECL_BIT_FIELD_TYPE (field))
3895 bitfield_alignment
3896 = std::max (bitfield_alignment,
3897 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
3898 }
3899
3900 if (bitfield_alignment > alignment)
3901 {
3902 *abi_break = true;
3903 return bitfield_alignment;
3904 }
3905
3906 return alignment;
3907 }
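
/* A hypothetical example of the ABI_BREAK case (not from the original
   sources): in a structure such as

	struct s { long long x : 8; } __attribute__ ((packed));

   the alignment of the bit-field's declared type (8 bytes here) exceeds
   the alignment of the field itself, so BITFIELD_ALIGNMENT wins and
   *ABI_BREAK records that releases before GCC 9 would have used the
   smaller value.  */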
3908
3909 /* Layout a function argument according to the AAPCS64 rules. The rule
3910 numbers refer to the rule numbers in the AAPCS64. */
3911
3912 static void
3913 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3914 const_tree type,
3915 bool named ATTRIBUTE_UNUSED)
3916 {
3917 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3918 int ncrn, nvrn, nregs;
3919 bool allocate_ncrn, allocate_nvrn;
3920 HOST_WIDE_INT size;
3921 bool abi_break;
3922
3923 /* We need to do this once per argument. */
3924 if (pcum->aapcs_arg_processed)
3925 return;
3926
3927 pcum->aapcs_arg_processed = true;
3928
3929 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3930 if (type)
3931 size = int_size_in_bytes (type);
3932 else
3933 /* No frontends can create types with variable-sized modes, so we
3934 shouldn't be asked to pass or return them. */
3935 size = GET_MODE_SIZE (mode).to_constant ();
3936 size = ROUND_UP (size, UNITS_PER_WORD);
3937
3938 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3939 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3940 mode,
3941 type,
3942 &nregs);
3943
3944 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3945 The following code thus handles passing by SIMD/FP registers first. */
3946
3947 nvrn = pcum->aapcs_nvrn;
3948
3949 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3950 and homogeneous short-vector aggregates (HVA). */
3951 if (allocate_nvrn)
3952 {
3953 if (!TARGET_FLOAT)
3954 aarch64_err_no_fpadvsimd (mode);
3955
3956 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3957 {
3958 pcum->aapcs_nextnvrn = nvrn + nregs;
3959 if (!aarch64_composite_type_p (type, mode))
3960 {
3961 gcc_assert (nregs == 1);
3962 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3963 }
3964 else
3965 {
3966 rtx par;
3967 int i;
3968 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3969 for (i = 0; i < nregs; i++)
3970 {
3971 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3972 V0_REGNUM + nvrn + i);
3973 rtx offset = gen_int_mode
3974 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3975 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3976 XVECEXP (par, 0, i) = tmp;
3977 }
3978 pcum->aapcs_reg = par;
3979 }
3980 return;
3981 }
3982 else
3983 {
3984 /* C.3 NSRN is set to 8. */
3985 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3986 goto on_stack;
3987 }
3988 }
3989
3990 ncrn = pcum->aapcs_ncrn;
3991 nregs = size / UNITS_PER_WORD;
3992
3993 /* C6 - C9, though the sign and zero extension semantics are
3994 handled elsewhere. This is the case where the argument fits
3995 entirely in general registers. */
3996 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3997 {
3998 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3999
4000 /* C.8 if the argument has an alignment of 16 then the NGRN is
4001 rounded up to the next even number. */
4002 if (nregs == 2
4003 && ncrn % 2
4004 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4005 comparison is there because for > 16 * BITS_PER_UNIT
4006 alignment nregs should be > 2 and therefore it should be
4007 passed by reference rather than value. */
4008 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4009 == 16 * BITS_PER_UNIT))
4010 {
4011 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4012 inform (input_location, "parameter passing for argument of type "
4013 "%qT changed in GCC 9.1", type);
4014 ++ncrn;
4015 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4016 }
4017
4018 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4019 A reg is still generated for it, but the caller should be smart
4020 enough not to use it. */
4021 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4022 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4023 else
4024 {
4025 rtx par;
4026 int i;
4027
4028 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4029 for (i = 0; i < nregs; i++)
4030 {
4031 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4032 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4033 GEN_INT (i * UNITS_PER_WORD));
4034 XVECEXP (par, 0, i) = tmp;
4035 }
4036 pcum->aapcs_reg = par;
4037 }
4038
4039 pcum->aapcs_nextncrn = ncrn + nregs;
4040 return;
4041 }
4042
4043 /* C.11 */
4044 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4045
4046 /* The argument is passed on stack; record the needed number of words for
4047 this argument and align the total size if necessary. */
4048 on_stack:
4049 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4050
4051 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4052 == 16 * BITS_PER_UNIT)
4053 {
4054 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4055 if (pcum->aapcs_stack_size != new_size)
4056 {
4057 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4058 inform (input_location, "parameter passing for argument of type "
4059 "%qT changed in GCC 9.1", type);
4060 pcum->aapcs_stack_size = new_size;
4061 }
4062 }
4063 return;
4064 }
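
/* Worked example of rule C.8 above (illustrative): for

	void f (int a, __int128 b);

   A is allocated to W0, leaving NCRN = 1. B requires 16-byte alignment,
   so NCRN is rounded up to 2 and B is passed in the pair X2:X3, leaving
   X1 unused.  */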
4065
4066 /* Implement TARGET_FUNCTION_ARG. */
4067
4068 static rtx
4069 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4070 const_tree type, bool named)
4071 {
4072 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4073 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4074
4075 if (mode == VOIDmode)
4076 return NULL_RTX;
4077
4078 aarch64_layout_arg (pcum_v, mode, type, named);
4079 return pcum->aapcs_reg;
4080 }
4081
4082 void
4083 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4084 const_tree fntype ATTRIBUTE_UNUSED,
4085 rtx libname ATTRIBUTE_UNUSED,
4086 const_tree fndecl ATTRIBUTE_UNUSED,
4087 unsigned n_named ATTRIBUTE_UNUSED)
4088 {
4089 pcum->aapcs_ncrn = 0;
4090 pcum->aapcs_nvrn = 0;
4091 pcum->aapcs_nextncrn = 0;
4092 pcum->aapcs_nextnvrn = 0;
4093 pcum->pcs_variant = ARM_PCS_AAPCS64;
4094 pcum->aapcs_reg = NULL_RTX;
4095 pcum->aapcs_arg_processed = false;
4096 pcum->aapcs_stack_words = 0;
4097 pcum->aapcs_stack_size = 0;
4098
4099 if (!TARGET_FLOAT
4100 && fndecl && TREE_PUBLIC (fndecl)
4101 && fntype && fntype != error_mark_node)
4102 {
4103 const_tree type = TREE_TYPE (fntype);
4104 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4105 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4106 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4107 &mode, &nregs, NULL))
4108 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4109 }
4110 return;
4111 }
4112
4113 static void
4114 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4115 machine_mode mode,
4116 const_tree type,
4117 bool named)
4118 {
4119 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4120 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4121 {
4122 aarch64_layout_arg (pcum_v, mode, type, named);
4123 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4124 != (pcum->aapcs_stack_words != 0));
4125 pcum->aapcs_arg_processed = false;
4126 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4127 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4128 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4129 pcum->aapcs_stack_words = 0;
4130 pcum->aapcs_reg = NULL_RTX;
4131 }
4132 }
4133
4134 bool
4135 aarch64_function_arg_regno_p (unsigned regno)
4136 {
4137 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4138 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4139 }
4140
4141 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4142 PARM_BOUNDARY bits of alignment, but will be given anything up
4143 to STACK_BOUNDARY bits if the type requires it. This makes sure
4144 that both before and after the layout of each argument, the Next
4145 Stacked Argument Address (NSAA) will have a minimum alignment of
4146 8 bytes. */
4147
4148 static unsigned int
4149 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4150 {
4151 bool abi_break;
4152 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4153 &abi_break);
4154 if (abi_break && warn_psabi)
4155 inform (input_location, "parameter passing for argument of type "
4156 "%qT changed in GCC 9.1", type);
4157
4158 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4159 }
4160
4161 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4162
4163 static fixed_size_mode
4164 aarch64_get_reg_raw_mode (int regno)
4165 {
4166 if (TARGET_SVE && FP_REGNUM_P (regno))
4167 /* Don't use the SVE part of the register for __builtin_apply and
4168 __builtin_return. The SVE registers aren't used by the normal PCS,
4169 so using them there would be a waste of time. The PCS extensions
4170 for SVE types are fundamentally incompatible with the
4171 __builtin_return/__builtin_apply interface. */
4172 return as_a <fixed_size_mode> (V16QImode);
4173 return default_get_reg_raw_mode (regno);
4174 }
4175
4176 /* Implement TARGET_FUNCTION_ARG_PADDING.
4177
4178 Small aggregate types are placed in the lowest memory address.
4179
4180 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4181
4182 static pad_direction
4183 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4184 {
4185 /* On little-endian targets, the least significant byte of every stack
4186 argument is passed at the lowest byte address of the stack slot. */
4187 if (!BYTES_BIG_ENDIAN)
4188 return PAD_UPWARD;
4189
4190 /* Otherwise, integral, floating-point and pointer types are padded downward:
4191 the least significant byte of a stack argument is passed at the highest
4192 byte address of the stack slot. */
4193 if (type
4194 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4195 || POINTER_TYPE_P (type))
4196 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4197 return PAD_DOWNWARD;
4198
4199 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4200 return PAD_UPWARD;
4201 }
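
/* For example (illustrative): on a big-endian target a 'char' passed on
   the stack is placed in the highest byte of its 8-byte slot
   (PAD_DOWNWARD), whereas a small structure starts at the lowest byte of
   its slot (PAD_UPWARD).  */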
4202
4203 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4204
4205 It specifies padding for the last (and possibly the only)
4206 element of a block move between registers and memory. Viewing
4207 the block as it sits in memory, padding upward means that the
4208 last element is padded after its most significant byte, while
4209 with downward padding the last element is padded on its least
4210 significant byte side.
4211
4212 Small aggregates and small complex types are always padded
4213 upwards.
4214
4215 We don't need to worry about homogeneous floating-point or
4216 short-vector aggregates; their move is not affected by the
4217 padding direction determined here. Regardless of endianness,
4218 each element of such an aggregate is put in the least
4219 significant bits of a fp/simd register.
4220
4221 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4222 register has useful data, and return the opposite if the most
4223 significant byte does. */
4224
4225 bool
4226 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4227 bool first ATTRIBUTE_UNUSED)
4228 {
4229
4230 /* Small composite types are always padded upward. */
4231 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4232 {
4233 HOST_WIDE_INT size;
4234 if (type)
4235 size = int_size_in_bytes (type);
4236 else
4237 /* No frontends can create types with variable-sized modes, so we
4238 shouldn't be asked to pass or return them. */
4239 size = GET_MODE_SIZE (mode).to_constant ();
4240 if (size < 2 * UNITS_PER_WORD)
4241 return true;
4242 }
4243
4244 /* Otherwise, use the default padding. */
4245 return !BYTES_BIG_ENDIAN;
4246 }
4247
4248 static scalar_int_mode
4249 aarch64_libgcc_cmp_return_mode (void)
4250 {
4251 return SImode;
4252 }
4253
4254 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4255
4256 /* We use the 12-bit shifted immediate arithmetic instructions so values
4257 must be multiple of (1 << 12), i.e. 4096. */
4258 #define ARITH_FACTOR 4096
4259
4260 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4261 #error Cannot use simple address calculation for stack probing
4262 #endif
4263
4264 /* The pair of scratch registers used for stack probing. */
4265 #define PROBE_STACK_FIRST_REG R9_REGNUM
4266 #define PROBE_STACK_SECOND_REG R10_REGNUM
4267
4268 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4269 inclusive. These are offsets from the current stack pointer. */
4270
4271 static void
4272 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4273 {
4274 HOST_WIDE_INT size;
4275 if (!poly_size.is_constant (&size))
4276 {
4277 sorry ("stack probes for SVE frames");
4278 return;
4279 }
4280
4281 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4282
4283 /* See the same assertion on PROBE_INTERVAL above. */
4284 gcc_assert ((first % ARITH_FACTOR) == 0);
4285
4286 /* See if we have a constant small number of probes to generate. If so,
4287 that's the easy case. */
4288 if (size <= PROBE_INTERVAL)
4289 {
4290 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4291
4292 emit_set_insn (reg1,
4293 plus_constant (Pmode,
4294 stack_pointer_rtx, -(first + base)));
4295 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4296 }
4297
4298 /* The run-time loop is made up of 8 insns in the generic case while the
4299 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4300 else if (size <= 4 * PROBE_INTERVAL)
4301 {
4302 HOST_WIDE_INT i, rem;
4303
4304 emit_set_insn (reg1,
4305 plus_constant (Pmode,
4306 stack_pointer_rtx,
4307 -(first + PROBE_INTERVAL)));
4308 emit_stack_probe (reg1);
4309
4310 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4311 it exceeds SIZE. If only two probes are needed, this will not
4312 generate any code. Then probe at FIRST + SIZE. */
4313 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4314 {
4315 emit_set_insn (reg1,
4316 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4317 emit_stack_probe (reg1);
4318 }
4319
4320 rem = size - (i - PROBE_INTERVAL);
4321 if (rem > 256)
4322 {
4323 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4324
4325 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4326 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4327 }
4328 else
4329 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4330 }
4331
4332 /* Otherwise, do the same as above, but in a loop. Note that we must be
4333 extra careful with variables wrapping around because we might be at
4334 the very top (or the very bottom) of the address space and we have
4335 to be able to handle this case properly; in particular, we use an
4336 equality test for the loop condition. */
4337 else
4338 {
4339 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4340
4341 /* Step 1: round SIZE to the previous multiple of the interval. */
4342
4343 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4344
4345
4346 /* Step 2: compute initial and final value of the loop counter. */
4347
4348 /* TEST_ADDR = SP + FIRST. */
4349 emit_set_insn (reg1,
4350 plus_constant (Pmode, stack_pointer_rtx, -first));
4351
4352 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4353 HOST_WIDE_INT adjustment = - (first + rounded_size);
4354 if (! aarch64_uimm12_shift (adjustment))
4355 {
4356 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4357 true, Pmode);
4358 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4359 }
4360 else
4361 emit_set_insn (reg2,
4362 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4363
4364 /* Step 3: the loop
4365
4366 do
4367 {
4368 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4369 probe at TEST_ADDR
4370 }
4371 while (TEST_ADDR != LAST_ADDR)
4372
4373 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4374 until it is equal to ROUNDED_SIZE. */
4375
4376 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4377
4378
4379 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4380 that SIZE is equal to ROUNDED_SIZE. */
4381
4382 if (size != rounded_size)
4383 {
4384 HOST_WIDE_INT rem = size - rounded_size;
4385
4386 if (rem > 256)
4387 {
4388 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4389
4390 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4391 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4392 }
4393 else
4394 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4395 }
4396 }
4397
4398 /* Make sure nothing is scheduled before we are done. */
4399 emit_insn (gen_blockage ());
4400 }
4401
4402 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4403 absolute addresses. */
4404
4405 const char *
4406 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4407 {
4408 static int labelno = 0;
4409 char loop_lab[32];
4410 rtx xops[2];
4411
4412 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4413
4414 /* Loop. */
4415 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4416
4417 HOST_WIDE_INT stack_clash_probe_interval
4418 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4419
4420 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4421 xops[0] = reg1;
4422 HOST_WIDE_INT interval;
4423 if (flag_stack_clash_protection)
4424 interval = stack_clash_probe_interval;
4425 else
4426 interval = PROBE_INTERVAL;
4427
4428 gcc_assert (aarch64_uimm12_shift (interval));
4429 xops[1] = GEN_INT (interval);
4430
4431 output_asm_insn ("sub\t%0, %0, %1", xops);
4432
4433 /* If doing stack clash protection then we probe up by the ABI-specified
4434 amount. We do this because we're dropping full pages at a time in the
4435 loop. But for ordinary (non-stack-clash) probing, probe at offset 0. */
4436 if (flag_stack_clash_protection)
4437 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4438 else
4439 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4440
4441 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4442 by this amount for each iteration. */
4443 output_asm_insn ("str\txzr, [%0, %1]", xops);
4444
4445 /* Test if TEST_ADDR == LAST_ADDR. */
4446 xops[1] = reg2;
4447 output_asm_insn ("cmp\t%0, %1", xops);
4448
4449 /* Branch. */
4450 fputs ("\tb.ne\t", asm_out_file);
4451 assemble_name_raw (asm_out_file, loop_lab);
4452 fputc ('\n', asm_out_file);
4453
4454 return "";
4455 }
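
/* Putting the pieces above together, the non-stack-clash form of the loop
   comes out roughly as (registers and interval illustrative):

   .LPSRL0:
	sub	x9, x9, #4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0

   With stack clash protection the subtraction uses the guard size from
   --param stack-clash-protection-guard-size and the store probes
   STACK_CLASH_CALLER_GUARD bytes above the new address.  */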
4456
4457 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4458 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4459 of GUARD_SIZE. Each probe is emitted at most MIN_PROBE_THRESHOLD bytes
4460 from the current BASE, and successive probes are at most
4461 MIN_PROBE_THRESHOLD bytes apart. By the end of this function
4462 BASE = BASE - ADJUSTMENT. */
4463
4464 const char *
4465 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4466 rtx min_probe_threshold, rtx guard_size)
4467 {
4468 /* This function is not allowed to use any instruction generation function
4469 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4470 so instead emit the code you want using output_asm_insn. */
4471 gcc_assert (flag_stack_clash_protection);
4472 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4473 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4474
4475 /* The minimum required allocation before the residual requires probing. */
4476 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4477
4478 /* Clamp the value down to the nearest value that can be used with a cmp. */
4479 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4480 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4481
4482 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4483 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4484
4485 static int labelno = 0;
4486 char loop_start_lab[32];
4487 char loop_end_lab[32];
4488 rtx xops[2];
4489
4490 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4491 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4492
4493 /* Emit loop start label. */
4494 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4495
4496 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4497 xops[0] = adjustment;
4498 xops[1] = probe_offset_value_rtx;
4499 output_asm_insn ("cmp\t%0, %1", xops);
4500
4501 /* Branch to end if not enough adjustment to probe. */
4502 fputs ("\tb.lt\t", asm_out_file);
4503 assemble_name_raw (asm_out_file, loop_end_lab);
4504 fputc ('\n', asm_out_file);
4505
4506 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4507 xops[0] = base;
4508 xops[1] = probe_offset_value_rtx;
4509 output_asm_insn ("sub\t%0, %0, %1", xops);
4510
4511 /* Probe at BASE. */
4512 xops[1] = const0_rtx;
4513 output_asm_insn ("str\txzr, [%0, %1]", xops);
4514
4515 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4516 xops[0] = adjustment;
4517 xops[1] = probe_offset_value_rtx;
4518 output_asm_insn ("sub\t%0, %0, %1", xops);
4519
4520 /* Branch to start if still more bytes to allocate. */
4521 fputs ("\tb\t", asm_out_file);
4522 assemble_name_raw (asm_out_file, loop_start_lab);
4523 fputc ('\n', asm_out_file);
4524
4525 /* Not enough left to need another probe; exit the loop here. */
4526 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4527
4528 /* BASE = BASE - ADJUSTMENT. */
4529 xops[0] = base;
4530 xops[1] = adjustment;
4531 output_asm_insn ("sub\t%0, %0, %1", xops);
4532 return "";
4533 }
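
/* The emitted loop therefore has the shape (registers, labels and the
   clamped guard value are illustrative):

   .SVLPSPL0:
	cmp	x15, #4096
	b.lt	.SVLPEND0
	sub	x9, x9, #4096
	str	xzr, [x9, 0]
	sub	x15, x15, #4096
	b	.SVLPSPL0
   .SVLPEND0:
	sub	x9, x9, x15
   */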
4534
4535 /* Determine whether a frame chain needs to be generated. */
4536 static bool
4537 aarch64_needs_frame_chain (void)
4538 {
4539 /* Force a frame chain for EH returns so the return address is at FP+8. */
4540 if (frame_pointer_needed || crtl->calls_eh_return)
4541 return true;
4542
4543 /* A leaf function cannot have calls or write LR. */
4544 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4545
4546 /* Don't use a frame chain in leaf functions if leaf frame pointers
4547 are disabled. */
4548 if (flag_omit_leaf_frame_pointer && is_leaf)
4549 return false;
4550
4551 return aarch64_use_frame_pointer;
4552 }
4553
4554 /* Mark the registers that need to be saved by the callee and calculate
4555 the size of the callee-saved registers area and frame record (both FP
4556 and LR may be omitted). */
4557 static void
4558 aarch64_layout_frame (void)
4559 {
4560 HOST_WIDE_INT offset = 0;
4561 int regno, last_fp_reg = INVALID_REGNUM;
4562 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4563
4564 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4565
4566 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4567 the mid-end is doing. */
4568 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4569
4570 #define SLOT_NOT_REQUIRED (-2)
4571 #define SLOT_REQUIRED (-1)
4572
4573 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4574 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4575
4576 /* If this is a non-leaf simd function with calls we assume that
4577 at least one of those calls is to a non-simd function and thus
4578 we must save V8 to V23 in the prologue. */
4579
4580 if (simd_function && !crtl->is_leaf)
4581 {
4582 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4583 if (FP_SIMD_SAVED_REGNUM_P (regno))
4584 df_set_regs_ever_live (regno, true);
4585 }
4586
4587 /* First mark all the registers that really need to be saved... */
4588 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4589 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4590
4591 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4592 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4593
4594 /* ... that includes the eh data registers (if needed)... */
4595 if (crtl->calls_eh_return)
4596 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4597 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4598 = SLOT_REQUIRED;
4599
4600 /* ... and any callee saved register that dataflow says is live. */
4601 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4602 if (df_regs_ever_live_p (regno)
4603 && (regno == R30_REGNUM
4604 || !call_used_regs[regno]))
4605 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4606
4607 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4608 if (df_regs_ever_live_p (regno)
4609 && (!call_used_regs[regno]
4610 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4611 {
4612 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4613 last_fp_reg = regno;
4614 }
4615
4616 if (cfun->machine->frame.emit_frame_chain)
4617 {
4618 /* FP and LR are placed in the linkage record. */
4619 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4620 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4621 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4622 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4623 offset = 2 * UNITS_PER_WORD;
4624 }
4625
4626 /* With stack-clash, LR must be saved in non-leaf functions. */
4627 gcc_assert (crtl->is_leaf
4628 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4629 != SLOT_NOT_REQUIRED));
4630
4631 /* Now assign stack slots for them. */
4632 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4633 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4634 {
4635 cfun->machine->frame.reg_offset[regno] = offset;
4636 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4637 cfun->machine->frame.wb_candidate1 = regno;
4638 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4639 cfun->machine->frame.wb_candidate2 = regno;
4640 offset += UNITS_PER_WORD;
4641 }
4642
4643 HOST_WIDE_INT max_int_offset = offset;
4644 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4645 bool has_align_gap = offset != max_int_offset;
4646
4647 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4648 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4649 {
4650 /* If there is an alignment gap between integer and fp callee-saves,
4651 allocate the last fp register to it if possible. */
4652 if (regno == last_fp_reg
4653 && has_align_gap
4654 && !simd_function
4655 && (offset & 8) == 0)
4656 {
4657 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4658 break;
4659 }
4660
4661 cfun->machine->frame.reg_offset[regno] = offset;
4662 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4663 cfun->machine->frame.wb_candidate1 = regno;
4664 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4665 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4666 cfun->machine->frame.wb_candidate2 = regno;
4667 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4668 }
4669
4670 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4671
4672 cfun->machine->frame.saved_regs_size = offset;
4673
4674 HOST_WIDE_INT varargs_and_saved_regs_size
4675 = offset + cfun->machine->frame.saved_varargs_size;
4676
4677 cfun->machine->frame.hard_fp_offset
4678 = aligned_upper_bound (varargs_and_saved_regs_size
4679 + get_frame_size (),
4680 STACK_BOUNDARY / BITS_PER_UNIT);
4681
4682 /* Both these values are already aligned. */
4683 gcc_assert (multiple_p (crtl->outgoing_args_size,
4684 STACK_BOUNDARY / BITS_PER_UNIT));
4685 cfun->machine->frame.frame_size
4686 = (cfun->machine->frame.hard_fp_offset
4687 + crtl->outgoing_args_size);
4688
4689 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4690
4691 cfun->machine->frame.initial_adjust = 0;
4692 cfun->machine->frame.final_adjust = 0;
4693 cfun->machine->frame.callee_adjust = 0;
4694 cfun->machine->frame.callee_offset = 0;
4695
4696 HOST_WIDE_INT max_push_offset = 0;
4697 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4698 max_push_offset = 512;
4699 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4700 max_push_offset = 256;
4701
4702 HOST_WIDE_INT const_size, const_fp_offset;
4703 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4704 && const_size < max_push_offset
4705 && known_eq (crtl->outgoing_args_size, 0))
4706 {
4707 /* Simple, small frame with no outgoing arguments:
4708 stp reg1, reg2, [sp, -frame_size]!
4709 stp reg3, reg4, [sp, 16] */
4710 cfun->machine->frame.callee_adjust = const_size;
4711 }
4712 else if (known_lt (crtl->outgoing_args_size
4713 + cfun->machine->frame.saved_regs_size, 512)
4714 && !(cfun->calls_alloca
4715 && known_lt (cfun->machine->frame.hard_fp_offset,
4716 max_push_offset)))
4717 {
4718 /* Frame with small outgoing arguments:
4719 sub sp, sp, frame_size
4720 stp reg1, reg2, [sp, outgoing_args_size]
4721 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4722 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4723 cfun->machine->frame.callee_offset
4724 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4725 }
4726 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4727 && const_fp_offset < max_push_offset)
4728 {
4729 /* Frame with large outgoing arguments but a small local area:
4730 stp reg1, reg2, [sp, -hard_fp_offset]!
4731 stp reg3, reg4, [sp, 16]
4732 sub sp, sp, outgoing_args_size */
4733 cfun->machine->frame.callee_adjust = const_fp_offset;
4734 cfun->machine->frame.final_adjust
4735 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4736 }
4737 else
4738 {
4739 /* Frame with large local area and outgoing arguments using frame pointer:
4740 sub sp, sp, hard_fp_offset
4741 stp x29, x30, [sp, 0]
4742 add x29, sp, 0
4743 stp reg3, reg4, [sp, 16]
4744 sub sp, sp, outgoing_args_size */
4745 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4746 cfun->machine->frame.final_adjust
4747 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4748 }
4749
4750 cfun->machine->frame.laid_out = true;
4751 }
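
/* A small worked example of the first case above (purely illustrative):
   with only X29 and X30 to save, no locals and no outgoing arguments,
   frame_size is 16, which is below max_push_offset, so the whole frame is
   allocated by the single write-back store

	stp	x29, x30, [sp, -16]!

   and initial_adjust, final_adjust and callee_offset all stay 0.  */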
4752
4753 /* Return true if the register REGNO is saved on entry to
4754 the current function. */
4755
4756 static bool
4757 aarch64_register_saved_on_entry (int regno)
4758 {
4759 return cfun->machine->frame.reg_offset[regno] >= 0;
4760 }
4761
4762 /* Starting from REGNO, return the next register up to LIMIT that the
4763 callee needs to save. */
4764
4765 static unsigned
4766 aarch64_next_callee_save (unsigned regno, unsigned limit)
4767 {
4768 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4769 regno ++;
4770 return regno;
4771 }
4772
4773 /* Push the register number REGNO of mode MODE to the stack with write-back
4774 adjusting the stack by ADJUSTMENT. */
4775
4776 static void
4777 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4778 HOST_WIDE_INT adjustment)
4779 {
4780 rtx base_rtx = stack_pointer_rtx;
4781 rtx insn, reg, mem;
4782
4783 reg = gen_rtx_REG (mode, regno);
4784 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4785 plus_constant (Pmode, base_rtx, -adjustment));
4786 mem = gen_frame_mem (mode, mem);
4787
4788 insn = emit_move_insn (mem, reg);
4789 RTX_FRAME_RELATED_P (insn) = 1;
4790 }
4791
4792 /* Generate and return an instruction to store the pair of registers
4793 REG and REG2 of mode MODE to location BASE with write-back adjusting
4794 the stack location BASE by ADJUSTMENT. */
4795
4796 static rtx
4797 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4798 HOST_WIDE_INT adjustment)
4799 {
4800 switch (mode)
4801 {
4802 case E_DImode:
4803 return gen_storewb_pairdi_di (base, base, reg, reg2,
4804 GEN_INT (-adjustment),
4805 GEN_INT (UNITS_PER_WORD - adjustment));
4806 case E_DFmode:
4807 return gen_storewb_pairdf_di (base, base, reg, reg2,
4808 GEN_INT (-adjustment),
4809 GEN_INT (UNITS_PER_WORD - adjustment));
4810 case E_TFmode:
4811 return gen_storewb_pairtf_di (base, base, reg, reg2,
4812 GEN_INT (-adjustment),
4813 GEN_INT (UNITS_PER_VREG - adjustment));
4814 default:
4815 gcc_unreachable ();
4816 }
4817 }
4818
4819 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4820 stack pointer by ADJUSTMENT. */
4821
4822 static void
4823 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4824 {
4825 rtx_insn *insn;
4826 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4827
4828 if (regno2 == INVALID_REGNUM)
4829 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4830
4831 rtx reg1 = gen_rtx_REG (mode, regno1);
4832 rtx reg2 = gen_rtx_REG (mode, regno2);
4833
4834 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4835 reg2, adjustment));
4836 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4837 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4838 RTX_FRAME_RELATED_P (insn) = 1;
4839 }
4840
4841 /* Load the pair of registers REG and REG2 of mode MODE from stack location
4842 BASE, adjusting it by ADJUSTMENT afterwards. */
4843
4844 static rtx
4845 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4846 HOST_WIDE_INT adjustment)
4847 {
4848 switch (mode)
4849 {
4850 case E_DImode:
4851 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4852 GEN_INT (UNITS_PER_WORD));
4853 case E_DFmode:
4854 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4855 GEN_INT (UNITS_PER_WORD));
4856 case E_TFmode:
4857 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4858 GEN_INT (UNITS_PER_VREG));
4859 default:
4860 gcc_unreachable ();
4861 }
4862 }
4863
4864 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4865 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4866 into CFI_OPS. */
4867
4868 static void
4869 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4870 rtx *cfi_ops)
4871 {
4872 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4873 rtx reg1 = gen_rtx_REG (mode, regno1);
4874
4875 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4876
4877 if (regno2 == INVALID_REGNUM)
4878 {
4879 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4880 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4881 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4882 }
4883 else
4884 {
4885 rtx reg2 = gen_rtx_REG (mode, regno2);
4886 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4887 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4888 reg2, adjustment));
4889 }
4890 }
4891
4892 /* Generate and return a store pair instruction of mode MODE to store
4893 register REG1 to MEM1 and register REG2 to MEM2. */
4894
4895 static rtx
4896 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4897 rtx reg2)
4898 {
4899 switch (mode)
4900 {
4901 case E_DImode:
4902 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4903
4904 case E_DFmode:
4905 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4906
4907 case E_TFmode:
4908 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4909
4910 default:
4911 gcc_unreachable ();
4912 }
4913 }
4914
4915 /* Generate and return a load pair instruction of mode MODE to load register
4916 REG1 from MEM1 and register REG2 from MEM2. */
4917
4918 static rtx
4919 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4920 rtx mem2)
4921 {
4922 switch (mode)
4923 {
4924 case E_DImode:
4925 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4926
4927 case E_DFmode:
4928 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4929
4930 case E_TFmode:
4931 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4932
4933 default:
4934 gcc_unreachable ();
4935 }
4936 }
4937
4938 /* Return TRUE if return address signing should be enabled for the current
4939 function, otherwise return FALSE. */
4940
4941 bool
4942 aarch64_return_address_signing_enabled (void)
4943 {
4944 /* This function should only be called after the frame is laid out. */
4945 gcc_assert (cfun->machine->frame.laid_out);
4946
4947 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
4948 function if its LR is pushed onto the stack. */
4949 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4950 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4951 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4952 }
4953
4954 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4955 bool
4956 aarch64_bti_enabled (void)
4957 {
4958 return (aarch64_enable_bti == 1);
4959 }
4960
4961 /* Emit code to save the callee-saved registers from register number START
4962 to LIMIT to the stack at the location starting at offset START_OFFSET,
4963 skipping any write-back candidates if SKIP_WB is true. */
4964
4965 static void
4966 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4967 unsigned start, unsigned limit, bool skip_wb)
4968 {
4969 rtx_insn *insn;
4970 unsigned regno;
4971 unsigned regno2;
4972
4973 for (regno = aarch64_next_callee_save (start, limit);
4974 regno <= limit;
4975 regno = aarch64_next_callee_save (regno + 1, limit))
4976 {
4977 rtx reg, mem;
4978 poly_int64 offset;
4979 int offset_diff;
4980
4981 if (skip_wb
4982 && (regno == cfun->machine->frame.wb_candidate1
4983 || regno == cfun->machine->frame.wb_candidate2))
4984 continue;
4985
4986 if (cfun->machine->reg_is_wrapped_separately[regno])
4987 continue;
4988
4989 reg = gen_rtx_REG (mode, regno);
4990 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4991 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4992 offset));
4993
4994 regno2 = aarch64_next_callee_save (regno + 1, limit);
4995 offset_diff = cfun->machine->frame.reg_offset[regno2]
4996 - cfun->machine->frame.reg_offset[regno];
4997
4998 if (regno2 <= limit
4999 && !cfun->machine->reg_is_wrapped_separately[regno2]
5000 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5001 {
5002 rtx reg2 = gen_rtx_REG (mode, regno2);
5003 rtx mem2;
5004
5005 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5006 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5007 offset));
5008 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5009 reg2));
5010
5011 /* The first part of a frame-related parallel insn is
5012 always assumed to be relevant to the frame
5013 calculations; subsequent parts are only
5014 frame-related if explicitly marked. */
5015 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5016 regno = regno2;
5017 }
5018 else
5019 insn = emit_move_insn (mem, reg);
5020
5021 RTX_FRAME_RELATED_P (insn) = 1;
5022 }
5023 }
5024
5025 /* Emit code to restore the callee registers of mode MODE from register
5026 number START up to and including LIMIT. Restore from the stack offset
5027 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5028 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5029
5030 static void
5031 aarch64_restore_callee_saves (machine_mode mode,
5032 poly_int64 start_offset, unsigned start,
5033 unsigned limit, bool skip_wb, rtx *cfi_ops)
5034 {
5035 rtx base_rtx = stack_pointer_rtx;
5036 unsigned regno;
5037 unsigned regno2;
5038 poly_int64 offset;
5039
5040 for (regno = aarch64_next_callee_save (start, limit);
5041 regno <= limit;
5042 regno = aarch64_next_callee_save (regno + 1, limit))
5043 {
5044 if (cfun->machine->reg_is_wrapped_separately[regno])
5045 continue;
5046
5047 rtx reg, mem;
5048 int offset_diff;
5049
5050 if (skip_wb
5051 && (regno == cfun->machine->frame.wb_candidate1
5052 || regno == cfun->machine->frame.wb_candidate2))
5053 continue;
5054
5055 reg = gen_rtx_REG (mode, regno);
5056 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5057 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5058
5059 regno2 = aarch64_next_callee_save (regno + 1, limit);
5060 offset_diff = cfun->machine->frame.reg_offset[regno2]
5061 - cfun->machine->frame.reg_offset[regno];
5062
5063 if (regno2 <= limit
5064 && !cfun->machine->reg_is_wrapped_separately[regno2]
5065 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5066 {
5067 rtx reg2 = gen_rtx_REG (mode, regno2);
5068 rtx mem2;
5069
5070 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5071 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5072 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5073
5074 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5075 regno = regno2;
5076 }
5077 else
5078 emit_move_insn (reg, mem);
5079 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5080 }
5081 }
5082
5083 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5084 of MODE. */
5085
5086 static inline bool
5087 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5088 {
5089 HOST_WIDE_INT multiple;
5090 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5091 && IN_RANGE (multiple, -8, 7));
5092 }
5093
5094 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5095 of MODE. */
5096
5097 static inline bool
5098 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5099 {
5100 HOST_WIDE_INT multiple;
5101 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5102 && IN_RANGE (multiple, 0, 63));
5103 }
5104
5105 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5106 of MODE. */
5107
5108 bool
5109 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5110 {
5111 HOST_WIDE_INT multiple;
5112 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5113 && IN_RANGE (multiple, -64, 63));
5114 }
5115
5116 /* Return true if OFFSET is a signed 9-bit value. */
5117
5118 bool
5119 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5120 poly_int64 offset)
5121 {
5122 HOST_WIDE_INT const_offset;
5123 return (offset.is_constant (&const_offset)
5124 && IN_RANGE (const_offset, -256, 255));
5125 }
5126
5127 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5128 of MODE. */
5129
5130 static inline bool
5131 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5132 {
5133 HOST_WIDE_INT multiple;
5134 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5135 && IN_RANGE (multiple, -256, 255));
5136 }
5137
5138 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5139 of MODE. */
5140
5141 static inline bool
5142 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5143 {
5144 HOST_WIDE_INT multiple;
5145 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5146 && IN_RANGE (multiple, 0, 4095));
5147 }
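
/* For reference, with MODE == DImode (8-byte units) the predicates above
   accept byte offsets in the following ranges (multiples of 8 unless
   noted otherwise):

	offset_4bit_signed_scaled_p:		-64 .. 56
	offset_6bit_unsigned_scaled_p:		0 .. 504
	aarch64_offset_7bit_signed_scaled_p:	-512 .. 504
	aarch64_offset_9bit_signed_unscaled_p:	-256 .. 255 (any byte offset)
	offset_9bit_signed_scaled_p:		-2048 .. 2040
	offset_12bit_unsigned_scaled_p:		0 .. 32760  */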
5148
5149 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5150
5151 static sbitmap
5152 aarch64_get_separate_components (void)
5153 {
5154 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5155 bitmap_clear (components);
5156
5157 /* The registers we need saved to the frame. */
5158 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5159 if (aarch64_register_saved_on_entry (regno))
5160 {
5161 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5162 if (!frame_pointer_needed)
5163 offset += cfun->machine->frame.frame_size
5164 - cfun->machine->frame.hard_fp_offset;
5165 /* Check that we can access the stack slot of the register with one
5166 direct load with no adjustments needed. */
5167 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5168 bitmap_set_bit (components, regno);
5169 }
5170
5171 /* Don't mess with the hard frame pointer. */
5172 if (frame_pointer_needed)
5173 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5174
5175 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5176 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5177 /* If registers have been chosen to be stored/restored with
5178 writeback don't interfere with them to avoid having to output explicit
5179 stack adjustment instructions. */
5180 if (reg2 != INVALID_REGNUM)
5181 bitmap_clear_bit (components, reg2);
5182 if (reg1 != INVALID_REGNUM)
5183 bitmap_clear_bit (components, reg1);
5184
5185 bitmap_clear_bit (components, LR_REGNUM);
5186 bitmap_clear_bit (components, SP_REGNUM);
5187
5188 return components;
5189 }
5190
5191 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5192
5193 static sbitmap
5194 aarch64_components_for_bb (basic_block bb)
5195 {
5196 bitmap in = DF_LIVE_IN (bb);
5197 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5198 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5199 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5200
5201 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5202 bitmap_clear (components);
5203
5204 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5205 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5206 if ((!call_used_regs[regno]
5207 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5208 && (bitmap_bit_p (in, regno)
5209 || bitmap_bit_p (gen, regno)
5210 || bitmap_bit_p (kill, regno)))
5211 {
5212 unsigned regno2, offset, offset2;
5213 bitmap_set_bit (components, regno);
5214
5215 /* If there is a callee-save at an adjacent offset, add it as well,
5216 to increase the use of LDP/STP. */
5217 offset = cfun->machine->frame.reg_offset[regno];
5218 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5219
5220 if (regno2 <= LAST_SAVED_REGNUM)
5221 {
5222 offset2 = cfun->machine->frame.reg_offset[regno2];
5223 if ((offset & ~8) == (offset2 & ~8))
5224 bitmap_set_bit (components, regno2);
5225 }
5226 }
5227
5228 return components;
5229 }
5230
5231 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5232 Nothing to do for aarch64. */
5233
5234 static void
5235 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5236 {
5237 }
5238
5239 /* Return the next set bit in BMP from START onwards. Return the total number
5240 of bits in BMP if no set bit is found at or after START. */
5241
5242 static unsigned int
5243 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5244 {
5245 unsigned int nbits = SBITMAP_SIZE (bmp);
5246 if (start == nbits)
5247 return start;
5248
5249 gcc_assert (start < nbits);
5250 for (unsigned int i = start; i < nbits; i++)
5251 if (bitmap_bit_p (bmp, i))
5252 return i;
5253
5254 return nbits;
5255 }
5256
5257 /* Do the work for aarch64_emit_prologue_components and
5258 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5259 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5260 for these components or the epilogue sequence. That is, it determines
5261 whether we should emit stores or loads and what kind of CFA notes to attach
5262 to the insns. Otherwise the logic for the two sequences is very
5263 similar. */
5264
5265 static void
5266 aarch64_process_components (sbitmap components, bool prologue_p)
5267 {
5268 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5269 ? HARD_FRAME_POINTER_REGNUM
5270 : STACK_POINTER_REGNUM);
5271
5272 unsigned last_regno = SBITMAP_SIZE (components);
5273 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5274 rtx_insn *insn = NULL;
5275
5276 while (regno != last_regno)
5277 {
5278 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5279 so DFmode for the vector registers is enough. For simd functions
5280 we want to save the low 128 bits. */
5281 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5282
5283 rtx reg = gen_rtx_REG (mode, regno);
5284 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5285 if (!frame_pointer_needed)
5286 offset += cfun->machine->frame.frame_size
5287 - cfun->machine->frame.hard_fp_offset;
5288 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5289 rtx mem = gen_frame_mem (mode, addr);
5290
5291 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5292 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5293 /* No more registers to handle after REGNO.
5294 Emit a single save/restore and exit. */
5295 if (regno2 == last_regno)
5296 {
5297 insn = emit_insn (set);
5298 RTX_FRAME_RELATED_P (insn) = 1;
5299 if (prologue_p)
5300 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5301 else
5302 add_reg_note (insn, REG_CFA_RESTORE, reg);
5303 break;
5304 }
5305
5306 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5307 /* The next register is not of the same class or its offset is not
5308 mergeable with the current one into a pair. */
5309 if (!satisfies_constraint_Ump (mem)
5310 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5311 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5312 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5313 GET_MODE_SIZE (mode)))
5314 {
5315 insn = emit_insn (set);
5316 RTX_FRAME_RELATED_P (insn) = 1;
5317 if (prologue_p)
5318 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5319 else
5320 add_reg_note (insn, REG_CFA_RESTORE, reg);
5321
5322 regno = regno2;
5323 continue;
5324 }
5325
5326 /* REGNO2 can be saved/restored in a pair with REGNO. */
5327 rtx reg2 = gen_rtx_REG (mode, regno2);
5328 if (!frame_pointer_needed)
5329 offset2 += cfun->machine->frame.frame_size
5330 - cfun->machine->frame.hard_fp_offset;
5331 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5332 rtx mem2 = gen_frame_mem (mode, addr2);
5333 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5334 : gen_rtx_SET (reg2, mem2);
5335
5336 if (prologue_p)
5337 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5338 else
5339 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5340
5341 RTX_FRAME_RELATED_P (insn) = 1;
5342 if (prologue_p)
5343 {
5344 add_reg_note (insn, REG_CFA_OFFSET, set);
5345 add_reg_note (insn, REG_CFA_OFFSET, set2);
5346 }
5347 else
5348 {
5349 add_reg_note (insn, REG_CFA_RESTORE, reg);
5350 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5351 }
5352
5353 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5354 }
5355 }
5356
5357 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5358
5359 static void
5360 aarch64_emit_prologue_components (sbitmap components)
5361 {
5362 aarch64_process_components (components, true);
5363 }
5364
5365 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5366
5367 static void
5368 aarch64_emit_epilogue_components (sbitmap components)
5369 {
5370 aarch64_process_components (components, false);
5371 }
5372
5373 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5374
5375 static void
5376 aarch64_set_handled_components (sbitmap components)
5377 {
5378 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5379 if (bitmap_bit_p (components, regno))
5380 cfun->machine->reg_is_wrapped_separately[regno] = true;
5381 }
5382
5383 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
5384 determine the probe offset for alloca. */
5385
5386 static HOST_WIDE_INT
5387 aarch64_stack_clash_protection_alloca_probe_range (void)
5388 {
5389 return STACK_CLASH_CALLER_GUARD;
5390 }
5391
5392
5393 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5394 registers. If POLY_SIZE is not large enough to require a probe this function
5395 will only adjust the stack. When allocating the stack space,
5396 FRAME_RELATED_P is used to indicate whether the allocation is frame related.
5397 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5398 arguments. If we are, then we ensure that any allocation larger than the
5399 ABI-defined buffer needs a probe, so that the invariant of having a 1KB
5400 buffer is maintained.
5401
5402 We emit barriers after each stack adjustment to prevent optimizations from
5403 breaking the invariant that we never drop the stack pointer by more than a
5404 page. This invariant makes it easier to handle asynchronous events
5405 correctly: if we allowed the stack to be dropped by more than a page and
5406 only probed the intervening pages afterwards, then a signal taken in
5407 between would leave the handler not knowing which pages had already been
5408 probed, and it could make no assumptions about the state of the stack. */
5409
5410 static void
5411 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5412 poly_int64 poly_size,
5413 bool frame_related_p,
5414 bool final_adjustment_p)
5415 {
5416 HOST_WIDE_INT guard_size
5417 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5418 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5419 /* When doing the final adjustment for the outgoing argument size we can't
5420 assume that LR was saved at position 0. So subtract its offset from the
5421 ABI safe buffer so that we don't accidentally allow an adjustment that
5422 would result in an allocation larger than the ABI buffer without
5423 probing. */
5424 HOST_WIDE_INT min_probe_threshold
5425 = final_adjustment_p
5426 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5427 : guard_size - guard_used_by_caller;
5428
5429 poly_int64 frame_size = cfun->machine->frame.frame_size;
5430
5431 /* We should always have a positive probe threshold. */
5432 gcc_assert (min_probe_threshold > 0);
5433
5434 if (flag_stack_clash_protection && !final_adjustment_p)
5435 {
5436 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5437 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5438
5439 if (known_eq (frame_size, 0))
5440 {
5441 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5442 }
5443 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5444 && known_lt (final_adjust, guard_used_by_caller))
5445 {
5446 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5447 }
5448 }
5449
5450 /* If SIZE is not large enough to require probing, just adjust the stack and
5451 exit. */
5452 if (known_lt (poly_size, min_probe_threshold)
5453 || !flag_stack_clash_protection)
5454 {
5455 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5456 return;
5457 }
5458
5459 HOST_WIDE_INT size;
5460 /* Handle the SVE non-constant case first. */
5461 if (!poly_size.is_constant (&size))
5462 {
5463 if (dump_file)
5464 {
5465 fprintf (dump_file, "Stack clash SVE prologue: ");
5466 print_dec (poly_size, dump_file);
5467 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5468 }
5469
5470 /* First calculate the amount of bytes we're actually spilling. */
5471 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5472 poly_size, temp1, temp2, false, true);
5473
5474 rtx_insn *insn = get_last_insn ();
5475
5476 if (frame_related_p)
5477 {
5478 /* This is done to provide unwinding information for the stack
5479 adjustments we're about to do, however to prevent the optimizers
5480 from removing the R11 move and leaving the CFA note (which would be
5481 very wrong) we tie the old and new stack pointer together.
5482 The tie will expand to nothing but the optimizers will not touch
5483 the instruction. */
5484 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5485 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5486 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5487
5488 /* We want the CFA independent of the stack pointer for the
5489 duration of the loop. */
5490 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5491 RTX_FRAME_RELATED_P (insn) = 1;
5492 }
5493
5494 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5495 rtx guard_const = gen_int_mode (guard_size, Pmode);
5496
5497 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5498 stack_pointer_rtx, temp1,
5499 probe_const, guard_const));
5500
5501 /* Now reset the CFA register if needed. */
5502 if (frame_related_p)
5503 {
5504 add_reg_note (insn, REG_CFA_DEF_CFA,
5505 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5506 gen_int_mode (poly_size, Pmode)));
5507 RTX_FRAME_RELATED_P (insn) = 1;
5508 }
5509
5510 return;
5511 }
5512
5513 if (dump_file)
5514 fprintf (dump_file,
5515 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5516 " bytes, probing will be required.\n", size);
5517
5518 /* Round size down to a multiple of guard_size, and calculate the
5519 residual as the difference between the original size and the rounded
5520 size. */
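  /* Worked example (numbers are illustrative only): with the default 64kB
     guard (guard_size == 65536) and size == 200000, rounded_size is 196608
     (3 * 65536) and residual is 3392.  Assuming STACK_CLASH_MAX_UNROLL_PAGES
     is 4, the inline path below emits three 64kB adjustments, each followed
     by a probe at guard_used_by_caller, and the 3392 byte residual is then
     handled by the residual code at the end of this function.  */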
5521 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5522 HOST_WIDE_INT residual = size - rounded_size;
5523
5524 /* We can handle a small number of allocations/probes inline. Otherwise
5525 punt to a loop. */
5526 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5527 {
5528 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5529 {
5530 aarch64_sub_sp (NULL, temp2, guard_size, true);
5531 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5532 guard_used_by_caller));
5533 emit_insn (gen_blockage ());
5534 }
5535 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5536 }
5537 else
5538 {
5539 /* Compute the ending address. */
5540 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5541 temp1, NULL, false, true);
5542 rtx_insn *insn = get_last_insn ();
5543
5544 /* For the initial allocation, we don't have a frame pointer
5545 set up, so we always need CFI notes. If we're doing the
5546 final allocation, then we may have a frame pointer, in which
5547 case it is the CFA, otherwise we need CFI notes.
5548
5549 We can determine which allocation we are doing by looking at
5550 the value of FRAME_RELATED_P since the final allocations are not
5551 frame related. */
5552 if (frame_related_p)
5553 {
5554 /* We want the CFA independent of the stack pointer for the
5555 duration of the loop. */
5556 add_reg_note (insn, REG_CFA_DEF_CFA,
5557 plus_constant (Pmode, temp1, rounded_size));
5558 RTX_FRAME_RELATED_P (insn) = 1;
5559 }
5560
5561 /* This allocates and probes the stack. Note that this re-uses some of
5562 the existing Ada stack protection code. However we are guaranteed not
5563 to enter the non-loop or residual branches of that code.
5564
5565 The non-loop part won't be entered because if our allocation amount
5566 doesn't require a loop, the case above would handle it.
5567
5568 The residual amount won't be entered because TEMP1 is a multiple of
5569 the allocation size. The residual will always be 0. As such, the only
5570 part we are actually using from that code is the loop setup. The
5571 actual probing is done in aarch64_output_probe_stack_range. */
5572 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5573 stack_pointer_rtx, temp1));
5574
5575 /* Now reset the CFA register if needed. */
5576 if (frame_related_p)
5577 {
5578 add_reg_note (insn, REG_CFA_DEF_CFA,
5579 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5580 RTX_FRAME_RELATED_P (insn) = 1;
5581 }
5582
5583 emit_insn (gen_blockage ());
5584 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5585 }
5586
5587 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5588 be probed. This maintains the requirement that each page is probed at
5589 least once. For the initial probing we probe only if the allocation is
5590 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5591 if the amount is larger than buffer; GUARD_SIZE - buffer + buffer ==
5592 GUARD_SIZE. The upshot is that any allocation large enough to
5593 trigger a probe here gets at least one, and any allocation too small
5594 for this code to emit anything will have had its page probed already
5595 by the saving of FP/LR, either in this function or in a callee. If
5596 we don't have any callees then we won't have more stack adjustments and so
5597 are still safe. */
5598 if (residual)
5599 {
5600 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5601 /* If we're doing final adjustments, and we've done any full page
5602 allocations then any residual needs to be probed. */
5603 if (final_adjustment_p && rounded_size != 0)
5604 min_probe_threshold = 0;
5605 /* If doing a small final adjustment, we always probe at offset 0.
5606 This is done to avoid issues when LR is not at position 0 or when
5607 the final adjustment is smaller than the probing offset. */
5608 else if (final_adjustment_p && rounded_size == 0)
5609 residual_probe_offset = 0;
5610
5611 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5612 if (residual >= min_probe_threshold)
5613 {
5614 if (dump_file)
5615 fprintf (dump_file,
5616 "Stack clash AArch64 prologue residuals: "
5617 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5618 "\n", residual);
5619
5620 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5621 residual_probe_offset));
5622 emit_insn (gen_blockage ());
5623 }
5624 }
5625 }
5626
5627 /* Return 1 if the register is used by the epilogue. We need to say the
5628 return register is used, but only after epilogue generation is complete.
5629 Note that in the case of sibcalls, the values "used by the epilogue" are
5630 considered live at the start of the called function.
5631
5632 For SIMD functions we need to return 1 for FP registers that are saved and
5633 restored by a function but are not zero in call_used_regs. If we do not do
5634 this, optimizations may remove the restore of the register. */
5635
5636 int
5637 aarch64_epilogue_uses (int regno)
5638 {
5639 if (epilogue_completed)
5640 {
5641 if (regno == LR_REGNUM)
5642 return 1;
5643 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5644 return 1;
5645 }
5646 return 0;
5647 }
5648
5649 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5650 is saved at BASE + OFFSET. */
5651
5652 static void
5653 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5654 rtx base, poly_int64 offset)
5655 {
5656 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5657 add_reg_note (insn, REG_CFA_EXPRESSION,
5658 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5659 }
5660
5661 /* AArch64 stack frames generated by this compiler look like:
5662
5663 +-------------------------------+
5664 | |
5665 | incoming stack arguments |
5666 | |
5667 +-------------------------------+
5668 | | <-- incoming stack pointer (aligned)
5669 | callee-allocated save area |
5670 | for register varargs |
5671 | |
5672 +-------------------------------+
5673 | local variables | <-- frame_pointer_rtx
5674 | |
5675 +-------------------------------+
5676 | padding | \
5677 +-------------------------------+ |
5678 | callee-saved registers | | frame.saved_regs_size
5679 +-------------------------------+ |
5680 | LR' | |
5681 +-------------------------------+ |
5682 | FP' | / <- hard_frame_pointer_rtx (aligned)
5683 +-------------------------------+
5684 | dynamic allocation |
5685 +-------------------------------+
5686 | padding |
5687 +-------------------------------+
5688 | outgoing stack arguments | <-- arg_pointer
5689 | |
5690 +-------------------------------+
5691 | | <-- stack_pointer_rtx (aligned)
5692
5693 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5694 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5695 unchanged.
5696
5697 By default for stack-clash we assume the guard is at least 64KB, but this
5698 value is configurable to either 4KB or 64KB. We also force the guard size to
5699 be the same as the probing interval and both values are kept in sync.
5700
5701 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5702 on the guard size) of stack space without probing.
5703
5704 When probing is needed, we emit a probe at the start of the prologue
5705 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5706
5707 We have to track how much space has been allocated and the only stores
5708 to the stack we track as implicit probes are the FP/LR stores.
5709
5710 For outgoing arguments we probe if the size is larger than 1KB, such that
5711 the ABI specified buffer is maintained for the next callee.
5712
5713 The following registers are reserved during frame layout and should not be
5714 used for any other purpose:
5715
5716 - r11: Used by stack clash protection when SVE is enabled.
5717 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5718 - r14 and r15: Used for speculation tracking.
5719 - r16(IP0), r17(IP1): Used by indirect tailcalls.
5720 - r30(LR), r29(FP): Used by standard frame layout.
5721
5722 These registers must be avoided in frame layout related code unless the
5723 explicit intention is to interact with one of the features listed above. */
5724
5725 /* Generate the prologue instructions for entry into a function.
5726 Establish the stack frame by decreasing the stack pointer with a
5727 properly calculated size and, if necessary, create a frame record
5728 filled with the values of LR and previous frame pointer. The
5729 current FP is also set up if it is in use. */
5730
5731 void
5732 aarch64_expand_prologue (void)
5733 {
5734 poly_int64 frame_size = cfun->machine->frame.frame_size;
5735 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5736 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5737 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5738 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5739 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5740 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5741 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5742 rtx_insn *insn;
5743
5744 /* Sign return address for functions. */
5745 if (aarch64_return_address_signing_enabled ())
5746 {
5747 switch (aarch64_ra_sign_key)
5748 {
5749 case AARCH64_KEY_A:
5750 insn = emit_insn (gen_paciasp ());
5751 break;
5752 case AARCH64_KEY_B:
5753 insn = emit_insn (gen_pacibsp ());
5754 break;
5755 default:
5756 gcc_unreachable ();
5757 }
5758 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5759 RTX_FRAME_RELATED_P (insn) = 1;
5760 }
5761
5762 if (flag_stack_usage_info)
5763 current_function_static_stack_size = constant_lower_bound (frame_size);
5764
5765 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5766 {
5767 if (crtl->is_leaf && !cfun->calls_alloca)
5768 {
5769 if (maybe_gt (frame_size, PROBE_INTERVAL)
5770 && maybe_gt (frame_size, get_stack_check_protect ()))
5771 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5772 (frame_size
5773 - get_stack_check_protect ()));
5774 }
5775 else if (maybe_gt (frame_size, 0))
5776 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5777 }
5778
5779 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5780 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5781
5782 /* In theory we should never have both an initial adjustment
5783 and a callee save adjustment. Verify that is the case since the
5784 code below does not handle it for -fstack-clash-protection. */
5785 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5786
5787 /* Will only probe if the initial adjustment is larger than the guard
5788 less the amount of the guard reserved for use by the caller's
5789 outgoing args. */
5790 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5791 true, false);
5792
5793 if (callee_adjust != 0)
5794 aarch64_push_regs (reg1, reg2, callee_adjust);
5795
5796 if (emit_frame_chain)
5797 {
5798 poly_int64 reg_offset = callee_adjust;
5799 if (callee_adjust == 0)
5800 {
5801 reg1 = R29_REGNUM;
5802 reg2 = R30_REGNUM;
5803 reg_offset = callee_offset;
5804 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5805 }
5806 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5807 stack_pointer_rtx, callee_offset,
5808 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5809 if (frame_pointer_needed && !frame_size.is_constant ())
5810 {
5811 /* Variable-sized frames need to describe the save slot
5812 address using DW_CFA_expression rather than DW_CFA_offset.
5813 This means that, without taking further action, the
5814 locations of the registers that we've already saved would
5815 remain based on the stack pointer even after we redefine
5816 the CFA based on the frame pointer. We therefore need new
5817 DW_CFA_expressions to re-express the save slots with addresses
5818 based on the frame pointer. */
5819 rtx_insn *insn = get_last_insn ();
5820 gcc_assert (RTX_FRAME_RELATED_P (insn));
5821
5822 /* Add an explicit CFA definition if this was previously
5823 implicit. */
5824 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5825 {
5826 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5827 callee_offset);
5828 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5829 gen_rtx_SET (hard_frame_pointer_rtx, src));
5830 }
5831
5832 /* Change the save slot expressions for the registers that
5833 we've already saved. */
5834 reg_offset -= callee_offset;
5835 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5836 reg_offset + UNITS_PER_WORD);
5837 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5838 reg_offset);
5839 }
5840 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5841 }
5842
5843 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5844 callee_adjust != 0 || emit_frame_chain);
5845 if (aarch64_simd_decl_p (cfun->decl))
5846 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5847 callee_adjust != 0 || emit_frame_chain);
5848 else
5849 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5850 callee_adjust != 0 || emit_frame_chain);
5851
5852 /* We may need to probe the final adjustment if it is larger than the guard
5853 that is assumed by the callee. */
5854 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5855 !frame_pointer_needed, true);
5856 }
5857
5858 /* Return TRUE if we can use a simple_return insn.
5859
5860 This function checks whether the callee-saved stack is empty, which
5861 means no restore actions are needed. The pro_and_epilogue pass will use
5862 this to check whether the shrink-wrapping optimization is feasible. */
5863
5864 bool
5865 aarch64_use_return_insn_p (void)
5866 {
5867 if (!reload_completed)
5868 return false;
5869
5870 if (crtl->profile)
5871 return false;
5872
5873 return known_eq (cfun->machine->frame.frame_size, 0);
5874 }
5875
5876 /* Return false for non-leaf SIMD functions in order to avoid
5877 shrink-wrapping them, since shrink-wrapping would lose the necessary
5878 save/restore of the FP registers. */
5879
5880 bool
5881 aarch64_use_simple_return_insn_p (void)
5882 {
5883 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5884 return false;
5885
5886 return true;
5887 }
5888
5889 /* Generate the epilogue instructions for returning from a function.
5890 This is almost exactly the reverse of the prologue sequence, except
5891 that we need to insert barriers to avoid scheduling loads that read
5892 from a deallocated stack, and we optimize the unwind records by
5893 emitting them all together if possible. */
5894 void
5895 aarch64_expand_epilogue (bool for_sibcall)
5896 {
5897 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5898 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5899 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5900 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5901 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5902 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5903 rtx cfi_ops = NULL;
5904 rtx_insn *insn;
5905 /* A stack clash protection prologue may not have left EP0_REGNUM or
5906 EP1_REGNUM in a usable state. The same is true for allocations
5907 with an SVE component, since we then need both temporary registers
5908 for each allocation. For stack clash we are in a usable state if
5909 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5910 HOST_WIDE_INT guard_size
5911 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5912 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5913
5914 /* We can re-use the registers when the allocation amount is smaller than
5915 guard_size - guard_used_by_caller because we won't be doing any probes
5916 then. In such situations the register should remain live with the correct
5917 value. */
5918 bool can_inherit_p = (initial_adjust.is_constant ()
5919 && final_adjust.is_constant ())
5920 && (!flag_stack_clash_protection
5921 || known_lt (initial_adjust,
5922 guard_size - guard_used_by_caller));
5923
5924 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
5925 bool need_barrier_p
5926 = maybe_ne (get_frame_size ()
5927 + cfun->machine->frame.saved_varargs_size, 0);
5928
5929 /* Emit a barrier to prevent loads from a deallocated stack. */
5930 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5931 || cfun->calls_alloca
5932 || crtl->calls_eh_return)
5933 {
5934 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5935 need_barrier_p = false;
5936 }
5937
5938 /* Restore the stack pointer from the frame pointer if it may not
5939 be the same as the stack pointer. */
5940 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5941 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5942 if (frame_pointer_needed
5943 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5944 /* If writeback is used when restoring callee-saves, the CFA
5945 is restored on the instruction doing the writeback. */
5946 aarch64_add_offset (Pmode, stack_pointer_rtx,
5947 hard_frame_pointer_rtx, -callee_offset,
5948 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5949 else
5950 /* The case where we need to re-use the register here is very rare, so
5951 avoid the complicated condition and just always emit a move if the
5952 immediate doesn't fit. */
5953 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5954
5955 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5956 callee_adjust != 0, &cfi_ops);
5957 if (aarch64_simd_decl_p (cfun->decl))
5958 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5959 callee_adjust != 0, &cfi_ops);
5960 else
5961 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5962 callee_adjust != 0, &cfi_ops);
5963
5964 if (need_barrier_p)
5965 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5966
5967 if (callee_adjust != 0)
5968 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5969
5970 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5971 {
5972 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5973 insn = get_last_insn ();
5974 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5975 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5976 RTX_FRAME_RELATED_P (insn) = 1;
5977 cfi_ops = NULL;
5978 }
5979
5980 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
5981 restrict the emit_move optimization to leaf functions. */
5982 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5983 (!can_inherit_p || !crtl->is_leaf
5984 || df_regs_ever_live_p (EP0_REGNUM)));
5985
5986 if (cfi_ops)
5987 {
5988 /* Emit delayed restores and reset the CFA to be SP. */
5989 insn = get_last_insn ();
5990 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5991 REG_NOTES (insn) = cfi_ops;
5992 RTX_FRAME_RELATED_P (insn) = 1;
5993 }
5994
5995 /* We prefer to emit the combined return/authenticate instruction RETAA,
5996 however there are three cases in which we must instead emit an explicit
5997 authentication instruction.
5998
5999 1) Sibcalls don't return in a normal way, so if we're about to call one
6000 we must authenticate.
6001
6002 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6003 generating code for !TARGET_ARMV8_3 we can't use it and must
6004 explicitly authenticate.
6005
6006 3) On an eh_return path we make extra stack adjustments to update the
6007 canonical frame address to be the exception handler's CFA. We want
6008 to authenticate using the CFA of the function which calls eh_return.
6009 */
6010 if (aarch64_return_address_signing_enabled ()
6011 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6012 {
6013 switch (aarch64_ra_sign_key)
6014 {
6015 case AARCH64_KEY_A:
6016 insn = emit_insn (gen_autiasp ());
6017 break;
6018 case AARCH64_KEY_B:
6019 insn = emit_insn (gen_autibsp ());
6020 break;
6021 default:
6022 gcc_unreachable ();
6023 }
6024 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6025 RTX_FRAME_RELATED_P (insn) = 1;
6026 }
6027
6028 /* Stack adjustment for exception handler. */
6029 if (crtl->calls_eh_return && !for_sibcall)
6030 {
6031 /* We need to unwind the stack by the offset computed by
6032 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6033 to be SP; letting the CFA move during this adjustment
6034 is just as correct as retaining the CFA from the body
6035 of the function. Therefore, do nothing special. */
6036 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6037 }
6038
6039 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6040 if (!for_sibcall)
6041 emit_jump_insn (ret_rtx);
6042 }
6043
6044 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6045 normally or return to a previous frame after unwinding.
6046
6047 An EH return uses a single shared return sequence. The epilogue is
6048 exactly like a normal epilogue except that it has an extra input
6049 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6050 that must be applied after the frame has been destroyed. An extra label
6051 is inserted before the epilogue which initializes this register to zero,
6052 and this is the entry point for a normal return.
6053
6054 An actual EH return updates the return address, initializes the stack
6055 adjustment and jumps directly into the epilogue (bypassing the zeroing
6056 of the adjustment). Since the return address is typically saved on the
6057 stack when a function makes a call, the saved LR must be updated outside
6058 the epilogue.
6059
6060 This poses problems as the store is generated well before the epilogue,
6061 so the offset of LR is not known yet. Also optimizations will remove the
6062 store as it appears dead, even after the epilogue is generated (as the
6063 base or offset for loading LR is different in many cases).
6064
6065 To avoid these problems this implementation forces the frame pointer
6066 in eh_return functions so that the location of LR is fixed and known early.
6067 It also marks the store volatile, so no optimization is permitted to
6068 remove the store. */
6069 rtx
6070 aarch64_eh_return_handler_rtx (void)
6071 {
6072 rtx tmp = gen_frame_mem (Pmode,
6073 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6074
6075 /* Mark the store volatile, so no optimization is permitted to remove it. */
6076 MEM_VOLATILE_P (tmp) = true;
6077 return tmp;
6078 }
6079
6080 /* Output code to add DELTA to the first argument, and then jump
6081 to FUNCTION. Used for C++ multiple inheritance. */
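/* For example (output is illustrative, not taken from a particular dump):
   with DELTA == 16 and VCALL_OFFSET == 0 the thunk reduces to roughly

	add	x0, x0, 16
	b	<function>

   while a nonzero VCALL_OFFSET additionally loads the vtable pointer from
   *this and the stored adjustment from the vtable before the tail call.  */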
6082 static void
6083 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6084 HOST_WIDE_INT delta,
6085 HOST_WIDE_INT vcall_offset,
6086 tree function)
6087 {
6088 /* The this pointer is always in x0. Note that this differs from
6089 Arm where the this pointer may be bumped to r1 if r0 is required
6090 to return a pointer to an aggregate. On AArch64 a result value
6091 pointer will be in x8. */
6092 int this_regno = R0_REGNUM;
6093 rtx this_rtx, temp0, temp1, addr, funexp;
6094 rtx_insn *insn;
6095 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6096
6097 if (aarch64_bti_enabled ())
6098 emit_insn (gen_bti_c ());
6099
6100 reload_completed = 1;
6101 emit_note (NOTE_INSN_PROLOGUE_END);
6102
6103 this_rtx = gen_rtx_REG (Pmode, this_regno);
6104 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6105 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6106
6107 if (vcall_offset == 0)
6108 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6109 else
6110 {
6111 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6112
6113 addr = this_rtx;
6114 if (delta != 0)
6115 {
6116 if (delta >= -256 && delta < 256)
6117 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6118 plus_constant (Pmode, this_rtx, delta));
6119 else
6120 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6121 temp1, temp0, false);
6122 }
6123
6124 if (Pmode == ptr_mode)
6125 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6126 else
6127 aarch64_emit_move (temp0,
6128 gen_rtx_ZERO_EXTEND (Pmode,
6129 gen_rtx_MEM (ptr_mode, addr)));
6130
6131 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6132 addr = plus_constant (Pmode, temp0, vcall_offset);
6133 else
6134 {
6135 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6136 Pmode);
6137 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6138 }
6139
6140 if (Pmode == ptr_mode)
6141 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6142 else
6143 aarch64_emit_move (temp1,
6144 gen_rtx_SIGN_EXTEND (Pmode,
6145 gen_rtx_MEM (ptr_mode, addr)));
6146
6147 emit_insn (gen_add2_insn (this_rtx, temp1));
6148 }
6149
6150 /* Generate a tail call to the target function. */
6151 if (!TREE_USED (function))
6152 {
6153 assemble_external (function);
6154 TREE_USED (function) = 1;
6155 }
6156 funexp = XEXP (DECL_RTL (function), 0);
6157 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6158 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6159 SIBLING_CALL_P (insn) = 1;
6160
6161 insn = get_insns ();
6162 shorten_branches (insn);
6163
6164 assemble_start_function (thunk, fnname);
6165 final_start_function (insn, file, 1);
6166 final (insn, file, 1);
6167 final_end_function ();
6168 assemble_end_function (thunk, fnname);
6169
6170 /* Stop pretending to be a post-reload pass. */
6171 reload_completed = 0;
6172 }
6173
6174 static bool
6175 aarch64_tls_referenced_p (rtx x)
6176 {
6177 if (!TARGET_HAVE_TLS)
6178 return false;
6179 subrtx_iterator::array_type array;
6180 FOR_EACH_SUBRTX (iter, array, x, ALL)
6181 {
6182 const_rtx x = *iter;
6183 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6184 return true;
6185 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6186 TLS offsets, not real symbol references. */
6187 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6188 iter.skip_subrtxes ();
6189 }
6190 return false;
6191 }
6192
6193
6194 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6195 a left shift of 0 or 12 bits. */
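/* For example, 0xabc and 0xabc000 are accepted (shift 0 and shift 12
   respectively), while 0xabc001 and 0x1000000 are rejected, since neither
   can be expressed as a 12-bit value shifted left by 0 or 12.  */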
6196 bool
6197 aarch64_uimm12_shift (HOST_WIDE_INT val)
6198 {
6199 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6200 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6201 );
6202 }
6203
6204 /* Return VAL rounded down to the nearest value that will fit as a 12-bit
6205 unsigned immediate with a left shift of 0 or 12. VAL must fit in 24 bits. */
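/* For example, 0x456 is returned unchanged, while 0x123456 is clamped to
   0x123000: the low 12 bits are dropped so that the result fits as a
   12-bit immediate shifted left by 12.  */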
6206 static HOST_WIDE_INT
6207 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6208 {
6209 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6210 handle correctly. */
6211 gcc_assert ((val & 0xffffff) == val);
6212
6213 if (((val & 0xfff) << 0) == val)
6214 return val;
6215
6216 return val & (0xfff << 12);
6217 }
6218
6219 /* Return true if val is an immediate that can be loaded into a
6220 register by a MOVZ instruction. */
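/* For example, in DImode 0xffff0000 and 0x0000123400000000 are accepted
   (a single 16-bit chunk at shift 16 and at shift 32 respectively),
   whereas 0x12345678 is rejected because it spans two 16-bit chunks.  */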
6221 static bool
6222 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6223 {
6224 if (GET_MODE_SIZE (mode) > 4)
6225 {
6226 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6227 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6228 return 1;
6229 }
6230 else
6231 {
6232 /* Ignore sign extension. */
6233 val &= (HOST_WIDE_INT) 0xffffffff;
6234 }
6235 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6236 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6237 }
6238
6239 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6240 64-bit (DImode) integer. */
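/* For example, replicating the HImode value 0x1234 gives
   0x1234123412341234, and replicating the QImode value 0xab gives
   0xabababababababab.  */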
6241
6242 static unsigned HOST_WIDE_INT
6243 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6244 {
6245 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6246 while (size < 64)
6247 {
6248 val &= (HOST_WIDE_INT_1U << size) - 1;
6249 val |= val << size;
6250 size *= 2;
6251 }
6252 return val;
6253 }
6254
6255 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6256
6257 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6258 {
6259 0x0000000100000001ull,
6260 0x0001000100010001ull,
6261 0x0101010101010101ull,
6262 0x1111111111111111ull,
6263 0x5555555555555555ull,
6264 };
6265
6266
6267 /* Return true if val is a valid bitmask immediate. */
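/* Worked examples: 0x3ffc (a single contiguous run of ones) is accepted by
   the quick test below.  0x0f0f0f0f0f0f0f0f (a 4-bit run repeated every
   8 bits) is accepted by the general test: after inverting (the value
   starts with a one bit) the first two runs of ones start at bits 4 and
   12, so bits == 8 and mask == 0xf0, and 0xf0 * 0x0101010101010101
   reproduces the inverted value.  0x12345678 is rejected because it is
   not a repetition of a single contiguous run of ones.  */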
6268
6269 bool
6270 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6271 {
6272 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6273 int bits;
6274
6275 /* Check for a single sequence of one bits and return quickly if so.
6276 The special cases of all ones and all zeroes return false. */
6277 val = aarch64_replicate_bitmask_imm (val_in, mode);
6278 tmp = val + (val & -val);
6279
6280 if (tmp == (tmp & -tmp))
6281 return (val + 1) > 1;
6282
6283 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6284 if (mode == SImode)
6285 val = (val << 32) | (val & 0xffffffff);
6286
6287 /* Invert if the immediate doesn't start with a zero bit - this means we
6288 only need to search for sequences of one bits. */
6289 if (val & 1)
6290 val = ~val;
6291
6292 /* Find the first set bit and set tmp to val with the first sequence of one
6293 bits removed. Return success if there is a single sequence of ones. */
6294 first_one = val & -val;
6295 tmp = val & (val + first_one);
6296
6297 if (tmp == 0)
6298 return true;
6299
6300 /* Find the next set bit and compute the difference in bit position. */
6301 next_one = tmp & -tmp;
6302 bits = clz_hwi (first_one) - clz_hwi (next_one);
6303 mask = val ^ tmp;
6304
6305 /* Check the bit position difference is a power of 2, and that the first
6306 sequence of one bits fits within 'bits' bits. */
6307 if ((mask >> bits) != 0 || bits != (bits & -bits))
6308 return false;
6309
6310 /* Check the sequence of one bits is repeated 64/bits times. */
6311 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6312 }
6313
6314 /* Create a mask of ones covering the range from the lowest to the highest
6315 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
6316
6317 unsigned HOST_WIDE_INT
6318 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6319 {
6320 int lowest_bit_set = ctz_hwi (val_in);
6321 int highest_bit_set = floor_log2 (val_in);
6322 gcc_assert (val_in != 0);
6323
6324 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6325 (HOST_WIDE_INT_1U << lowest_bit_set));
6326 }
6327
6328 /* Create a constant in which all bits outside the range from the lowest to
6329 the highest bit set in VAL_IN are set to 1. */
6330
6331 unsigned HOST_WIDE_INT
6332 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6333 {
6334 return val_in | ~aarch64_and_split_imm1 (val_in);
6335 }
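/* Worked example (DImode, the value is chosen for illustration): for
   VAL_IN == 0xff00ff, aarch64_and_split_imm1 returns 0xffffff (the span
   from bit 0 to bit 23) and aarch64_and_split_imm2 returns
   0xffffffffffff00ff.  Both are valid bitmask immediates, and since
   0xffffff & 0xffffffffffff00ff == 0xff00ff, an AND with 0xff00ff can be
   performed as two ANDs with bitmask immediates.  */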
6336
6337 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6338
6339 bool
6340 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6341 {
6342 scalar_int_mode int_mode;
6343 if (!is_a <scalar_int_mode> (mode, &int_mode))
6344 return false;
6345
6346 if (aarch64_bitmask_imm (val_in, int_mode))
6347 return false;
6348
6349 if (aarch64_move_imm (val_in, int_mode))
6350 return false;
6351
6352 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6353
6354 return aarch64_bitmask_imm (imm2, int_mode);
6355 }
6356
6357 /* Return true if val is an immediate that can be loaded into a
6358 register in a single instruction. */
6359 bool
6360 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6361 {
6362 scalar_int_mode int_mode;
6363 if (!is_a <scalar_int_mode> (mode, &int_mode))
6364 return false;
6365
6366 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6367 return 1;
6368 return aarch64_bitmask_imm (val, int_mode);
6369 }
6370
6371 static bool
6372 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6373 {
6374 rtx base, offset;
6375
6376 if (GET_CODE (x) == HIGH)
6377 return true;
6378
6379 /* There's no way to calculate VL-based values using relocations. */
6380 subrtx_iterator::array_type array;
6381 FOR_EACH_SUBRTX (iter, array, x, ALL)
6382 if (GET_CODE (*iter) == CONST_POLY_INT)
6383 return true;
6384
6385 split_const (x, &base, &offset);
6386 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6387 {
6388 if (aarch64_classify_symbol (base, INTVAL (offset))
6389 != SYMBOL_FORCE_TO_MEM)
6390 return true;
6391 else
6392 /* Avoid generating a 64-bit relocation in ILP32; leave
6393 to aarch64_expand_mov_immediate to handle it properly. */
6394 return mode != ptr_mode;
6395 }
6396
6397 return aarch64_tls_referenced_p (x);
6398 }
6399
6400 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6401 The expansion for a table switch is quite expensive due to the number
6402 of instructions, the table lookup and the hard-to-predict indirect jump.
6403 When optimizing for speed at -O3 or higher, use the per-core tuning if
6404 set, otherwise use tables for more than 16 cases as a tradeoff between
6405 size and performance. When optimizing for size, use the default setting. */
6406
6407 static unsigned int
6408 aarch64_case_values_threshold (void)
6409 {
6410 /* Use the specified limit for the number of cases before using jump
6411 tables at higher optimization levels. */
6412 if (optimize > 2
6413 && selected_cpu->tune->max_case_values != 0)
6414 return selected_cpu->tune->max_case_values;
6415 else
6416 return optimize_size ? default_case_values_threshold () : 17;
6417 }
6418
6419 /* Return true if register REGNO is a valid index register.
6420 STRICT_P is true if REG_OK_STRICT is in effect. */
6421
6422 bool
6423 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6424 {
6425 if (!HARD_REGISTER_NUM_P (regno))
6426 {
6427 if (!strict_p)
6428 return true;
6429
6430 if (!reg_renumber)
6431 return false;
6432
6433 regno = reg_renumber[regno];
6434 }
6435 return GP_REGNUM_P (regno);
6436 }
6437
6438 /* Return true if register REGNO is a valid base register.
6439 STRICT_P is true if REG_OK_STRICT is in effect. */
6440
6441 bool
6442 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6443 {
6444 if (!HARD_REGISTER_NUM_P (regno))
6445 {
6446 if (!strict_p)
6447 return true;
6448
6449 if (!reg_renumber)
6450 return false;
6451
6452 regno = reg_renumber[regno];
6453 }
6454
6455 /* The fake registers will be eliminated to either the stack or
6456 hard frame pointer, both of which are usually valid base registers.
6457 Reload deals with the cases where the eliminated form isn't valid. */
6458 return (GP_REGNUM_P (regno)
6459 || regno == SP_REGNUM
6460 || regno == FRAME_POINTER_REGNUM
6461 || regno == ARG_POINTER_REGNUM);
6462 }
6463
6464 /* Return true if X is a valid base register.
6465 STRICT_P is true if REG_OK_STRICT is in effect. */
6466
6467 static bool
6468 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6469 {
6470 if (!strict_p
6471 && GET_CODE (x) == SUBREG
6472 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6473 x = SUBREG_REG (x);
6474
6475 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6476 }
6477
6478 /* Return true if address offset is a valid index. If it is, fill in INFO
6479 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6480
6481 static bool
6482 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6483 machine_mode mode, bool strict_p)
6484 {
6485 enum aarch64_address_type type;
6486 rtx index;
6487 int shift;
6488
6489 /* (reg:P) */
6490 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6491 && GET_MODE (x) == Pmode)
6492 {
6493 type = ADDRESS_REG_REG;
6494 index = x;
6495 shift = 0;
6496 }
6497 /* (sign_extend:DI (reg:SI)) */
6498 else if ((GET_CODE (x) == SIGN_EXTEND
6499 || GET_CODE (x) == ZERO_EXTEND)
6500 && GET_MODE (x) == DImode
6501 && GET_MODE (XEXP (x, 0)) == SImode)
6502 {
6503 type = (GET_CODE (x) == SIGN_EXTEND)
6504 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6505 index = XEXP (x, 0);
6506 shift = 0;
6507 }
6508 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6509 else if (GET_CODE (x) == MULT
6510 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6511 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6512 && GET_MODE (XEXP (x, 0)) == DImode
6513 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6514 && CONST_INT_P (XEXP (x, 1)))
6515 {
6516 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6517 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6518 index = XEXP (XEXP (x, 0), 0);
6519 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6520 }
6521 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6522 else if (GET_CODE (x) == ASHIFT
6523 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6524 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6525 && GET_MODE (XEXP (x, 0)) == DImode
6526 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6527 && CONST_INT_P (XEXP (x, 1)))
6528 {
6529 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6530 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6531 index = XEXP (XEXP (x, 0), 0);
6532 shift = INTVAL (XEXP (x, 1));
6533 }
6534 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6535 else if ((GET_CODE (x) == SIGN_EXTRACT
6536 || GET_CODE (x) == ZERO_EXTRACT)
6537 && GET_MODE (x) == DImode
6538 && GET_CODE (XEXP (x, 0)) == MULT
6539 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6540 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6541 {
6542 type = (GET_CODE (x) == SIGN_EXTRACT)
6543 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6544 index = XEXP (XEXP (x, 0), 0);
6545 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6546 if (INTVAL (XEXP (x, 1)) != 32 + shift
6547 || INTVAL (XEXP (x, 2)) != 0)
6548 shift = -1;
6549 }
6550 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6551 (const_int 0xffffffff<<shift)) */
6552 else if (GET_CODE (x) == AND
6553 && GET_MODE (x) == DImode
6554 && GET_CODE (XEXP (x, 0)) == MULT
6555 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6556 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6557 && CONST_INT_P (XEXP (x, 1)))
6558 {
6559 type = ADDRESS_REG_UXTW;
6560 index = XEXP (XEXP (x, 0), 0);
6561 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6562 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6563 shift = -1;
6564 }
6565 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6566 else if ((GET_CODE (x) == SIGN_EXTRACT
6567 || GET_CODE (x) == ZERO_EXTRACT)
6568 && GET_MODE (x) == DImode
6569 && GET_CODE (XEXP (x, 0)) == ASHIFT
6570 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6571 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6572 {
6573 type = (GET_CODE (x) == SIGN_EXTRACT)
6574 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6575 index = XEXP (XEXP (x, 0), 0);
6576 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6577 if (INTVAL (XEXP (x, 1)) != 32 + shift
6578 || INTVAL (XEXP (x, 2)) != 0)
6579 shift = -1;
6580 }
6581 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6582 (const_int 0xffffffff<<shift)) */
6583 else if (GET_CODE (x) == AND
6584 && GET_MODE (x) == DImode
6585 && GET_CODE (XEXP (x, 0)) == ASHIFT
6586 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6587 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6588 && CONST_INT_P (XEXP (x, 1)))
6589 {
6590 type = ADDRESS_REG_UXTW;
6591 index = XEXP (XEXP (x, 0), 0);
6592 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6593 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6594 shift = -1;
6595 }
6596 /* (mult:P (reg:P) (const_int scale)) */
6597 else if (GET_CODE (x) == MULT
6598 && GET_MODE (x) == Pmode
6599 && GET_MODE (XEXP (x, 0)) == Pmode
6600 && CONST_INT_P (XEXP (x, 1)))
6601 {
6602 type = ADDRESS_REG_REG;
6603 index = XEXP (x, 0);
6604 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6605 }
6606 /* (ashift:P (reg:P) (const_int shift)) */
6607 else if (GET_CODE (x) == ASHIFT
6608 && GET_MODE (x) == Pmode
6609 && GET_MODE (XEXP (x, 0)) == Pmode
6610 && CONST_INT_P (XEXP (x, 1)))
6611 {
6612 type = ADDRESS_REG_REG;
6613 index = XEXP (x, 0);
6614 shift = INTVAL (XEXP (x, 1));
6615 }
6616 else
6617 return false;
6618
6619 if (!strict_p
6620 && GET_CODE (index) == SUBREG
6621 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6622 index = SUBREG_REG (index);
6623
6624 if (aarch64_sve_data_mode_p (mode))
6625 {
6626 if (type != ADDRESS_REG_REG
6627 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6628 return false;
6629 }
6630 else
6631 {
6632 if (shift != 0
6633 && !(IN_RANGE (shift, 1, 3)
6634 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6635 return false;
6636 }
6637
6638 if (REG_P (index)
6639 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6640 {
6641 info->type = type;
6642 info->offset = index;
6643 info->shift = shift;
6644 return true;
6645 }
6646
6647 return false;
6648 }
6649
6650 /* Return true if MODE is one of the modes for which we
6651 support LDP/STP operations. */
6652
6653 static bool
6654 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6655 {
6656 return mode == SImode || mode == DImode
6657 || mode == SFmode || mode == DFmode
6658 || (aarch64_vector_mode_supported_p (mode)
6659 && (known_eq (GET_MODE_SIZE (mode), 8)
6660 || (known_eq (GET_MODE_SIZE (mode), 16)
6661 && (aarch64_tune_params.extra_tuning_flags
6662 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6663 }
6664
6665 /* Return true if REGNO is a virtual pointer register, or an eliminable
6666 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6667 include stack_pointer or hard_frame_pointer. */
6668 static bool
6669 virt_or_elim_regno_p (unsigned regno)
6670 {
6671 return ((regno >= FIRST_VIRTUAL_REGISTER
6672 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6673 || regno == FRAME_POINTER_REGNUM
6674 || regno == ARG_POINTER_REGNUM);
6675 }
6676
6677 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6678 If it is, fill in INFO appropriately. STRICT_P is true if
6679 REG_OK_STRICT is in effect. */
6680
6681 bool
6682 aarch64_classify_address (struct aarch64_address_info *info,
6683 rtx x, machine_mode mode, bool strict_p,
6684 aarch64_addr_query_type type)
6685 {
6686 enum rtx_code code = GET_CODE (x);
6687 rtx op0, op1;
6688 poly_int64 offset;
6689
6690 HOST_WIDE_INT const_size;
6691
6692 /* On BE, we use load/store pair for all large int mode load/stores.
6693 TI/TFmode may also use a load/store pair. */
6694 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6695 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6696 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6697 || type == ADDR_QUERY_LDP_STP_N
6698 || mode == TImode
6699 || mode == TFmode
6700 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6701
6702 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
6703 corresponds to the actual size of the memory being loaded/stored and the
6704 mode used for the addressing calculation is half of that. */
6705 if (type == ADDR_QUERY_LDP_STP_N
6706 && known_eq (GET_MODE_SIZE (mode), 16))
6707 mode = DFmode;
6708
6709 bool allow_reg_index_p = (!load_store_pair_p
6710 && (known_lt (GET_MODE_SIZE (mode), 16)
6711 || vec_flags == VEC_ADVSIMD
6712 || vec_flags & VEC_SVE_DATA));
6713
6714 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6715 [Rn, #offset, MUL VL]. */
6716 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6717 && (code != REG && code != PLUS))
6718 return false;
6719
6720 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6721 REG addressing. */
6722 if (advsimd_struct_p
6723 && !BYTES_BIG_ENDIAN
6724 && (code != POST_INC && code != REG))
6725 return false;
6726
6727 gcc_checking_assert (GET_MODE (x) == VOIDmode
6728 || SCALAR_INT_MODE_P (GET_MODE (x)));
6729
6730 switch (code)
6731 {
6732 case REG:
6733 case SUBREG:
6734 info->type = ADDRESS_REG_IMM;
6735 info->base = x;
6736 info->offset = const0_rtx;
6737 info->const_offset = 0;
6738 return aarch64_base_register_rtx_p (x, strict_p);
6739
6740 case PLUS:
6741 op0 = XEXP (x, 0);
6742 op1 = XEXP (x, 1);
6743
6744 if (! strict_p
6745 && REG_P (op0)
6746 && virt_or_elim_regno_p (REGNO (op0))
6747 && poly_int_rtx_p (op1, &offset))
6748 {
6749 info->type = ADDRESS_REG_IMM;
6750 info->base = op0;
6751 info->offset = op1;
6752 info->const_offset = offset;
6753
6754 return true;
6755 }
6756
6757 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6758 && aarch64_base_register_rtx_p (op0, strict_p)
6759 && poly_int_rtx_p (op1, &offset))
6760 {
6761 info->type = ADDRESS_REG_IMM;
6762 info->base = op0;
6763 info->offset = op1;
6764 info->const_offset = offset;
6765
6766 /* TImode and TFmode values are allowed in both pairs of X
6767 registers and individual Q registers. The available
6768 address modes are:
6769 X,X: 7-bit signed scaled offset
6770 Q: 9-bit signed offset
6771 We conservatively require an offset representable in either mode.
6772 When performing the check for pairs of X registers i.e. LDP/STP
6773 pass down DImode since that is the natural size of the LDP/STP
6774 instruction memory accesses. */
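	  /* For example, for a TImode access an offset of 256 is accepted:
	     256/8 == 32 is in the signed 7-bit range and 256/16 == 16 is in
	     the unsigned 12-bit range.  An offset of 1024 is rejected
	     because 1024/8 == 128 exceeds the signed 7-bit range.  */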
6775 if (mode == TImode || mode == TFmode)
6776 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6777 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6778 || offset_12bit_unsigned_scaled_p (mode, offset)));
6779
6780 /* A 7-bit offset check because OImode will emit an ldp/stp
6781 instruction (only big endian will get here).
6782 For ldp/stp instructions, the offset is scaled for the size of a
6783 single element of the pair. */
6784 if (mode == OImode)
6785 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6786
6787 /* Three 9/12-bit offset checks because CImode will emit three
6788 ldr/str instructions (only big endian will get here). */
6789 if (mode == CImode)
6790 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6791 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6792 offset + 32)
6793 || offset_12bit_unsigned_scaled_p (V16QImode,
6794 offset + 32)));
6795
6796 /* Two 7-bit offset checks because XImode will emit two ldp/stp
6797 instructions (only big endian will get here). */
6798 if (mode == XImode)
6799 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6800 && aarch64_offset_7bit_signed_scaled_p (TImode,
6801 offset + 32));
6802
6803 /* Make "m" use the LD1 offset range for SVE data modes, so
6804 that pre-RTL optimizers like ivopts will work to that
6805 instead of the wider LDR/STR range. */
6806 if (vec_flags == VEC_SVE_DATA)
6807 return (type == ADDR_QUERY_M
6808 ? offset_4bit_signed_scaled_p (mode, offset)
6809 : offset_9bit_signed_scaled_p (mode, offset));
6810
6811 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6812 {
6813 poly_int64 end_offset = (offset
6814 + GET_MODE_SIZE (mode)
6815 - BYTES_PER_SVE_VECTOR);
6816 return (type == ADDR_QUERY_M
6817 ? offset_4bit_signed_scaled_p (mode, offset)
6818 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6819 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6820 end_offset)));
6821 }
6822
6823 if (vec_flags == VEC_SVE_PRED)
6824 return offset_9bit_signed_scaled_p (mode, offset);
6825
6826 if (load_store_pair_p)
6827 return ((known_eq (GET_MODE_SIZE (mode), 4)
6828 || known_eq (GET_MODE_SIZE (mode), 8)
6829 || known_eq (GET_MODE_SIZE (mode), 16))
6830 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6831 else
6832 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6833 || offset_12bit_unsigned_scaled_p (mode, offset));
6834 }
6835
6836 if (allow_reg_index_p)
6837 {
6838 /* Look for base + (scaled/extended) index register. */
6839 if (aarch64_base_register_rtx_p (op0, strict_p)
6840 && aarch64_classify_index (info, op1, mode, strict_p))
6841 {
6842 info->base = op0;
6843 return true;
6844 }
6845 if (aarch64_base_register_rtx_p (op1, strict_p)
6846 && aarch64_classify_index (info, op0, mode, strict_p))
6847 {
6848 info->base = op1;
6849 return true;
6850 }
6851 }
6852
6853 return false;
6854
6855 case POST_INC:
6856 case POST_DEC:
6857 case PRE_INC:
6858 case PRE_DEC:
6859 info->type = ADDRESS_REG_WB;
6860 info->base = XEXP (x, 0);
6861 info->offset = NULL_RTX;
6862 return aarch64_base_register_rtx_p (info->base, strict_p);
6863
6864 case POST_MODIFY:
6865 case PRE_MODIFY:
6866 info->type = ADDRESS_REG_WB;
6867 info->base = XEXP (x, 0);
6868 if (GET_CODE (XEXP (x, 1)) == PLUS
6869 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6870 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6871 && aarch64_base_register_rtx_p (info->base, strict_p))
6872 {
6873 info->offset = XEXP (XEXP (x, 1), 1);
6874 info->const_offset = offset;
6875
6876 /* TImode and TFmode values are allowed in both pairs of X
6877 registers and individual Q registers. The available
6878 address modes are:
6879 X,X: 7-bit signed scaled offset
6880 Q: 9-bit signed offset
6881 We conservatively require an offset representable in both addressing modes.
6882 */
6883 if (mode == TImode || mode == TFmode)
6884 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6885 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6886
6887 if (load_store_pair_p)
6888 return ((known_eq (GET_MODE_SIZE (mode), 4)
6889 || known_eq (GET_MODE_SIZE (mode), 8)
6890 || known_eq (GET_MODE_SIZE (mode), 16))
6891 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6892 else
6893 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6894 }
6895 return false;
6896
6897 case CONST:
6898 case SYMBOL_REF:
6899 case LABEL_REF:
6900 /* load literal: pc-relative constant pool entry. Only supported
6901 for SI mode or larger. */
6902 info->type = ADDRESS_SYMBOLIC;
6903
6904 if (!load_store_pair_p
6905 && GET_MODE_SIZE (mode).is_constant (&const_size)
6906 && const_size >= 4)
6907 {
6908 rtx sym, addend;
6909
6910 split_const (x, &sym, &addend);
6911 return ((GET_CODE (sym) == LABEL_REF
6912 || (GET_CODE (sym) == SYMBOL_REF
6913 && CONSTANT_POOL_ADDRESS_P (sym)
6914 && aarch64_pcrelative_literal_loads)));
6915 }
6916 return false;
6917
6918 case LO_SUM:
6919 info->type = ADDRESS_LO_SUM;
6920 info->base = XEXP (x, 0);
6921 info->offset = XEXP (x, 1);
6922 if (allow_reg_index_p
6923 && aarch64_base_register_rtx_p (info->base, strict_p))
6924 {
6925 rtx sym, offs;
6926 split_const (info->offset, &sym, &offs);
6927 if (GET_CODE (sym) == SYMBOL_REF
6928 && (aarch64_classify_symbol (sym, INTVAL (offs))
6929 == SYMBOL_SMALL_ABSOLUTE))
6930 {
6931 /* The symbol and offset must be aligned to the access size. */
6932 unsigned int align;
6933
6934 if (CONSTANT_POOL_ADDRESS_P (sym))
6935 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6936 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6937 {
6938 tree exp = SYMBOL_REF_DECL (sym);
6939 align = TYPE_ALIGN (TREE_TYPE (exp));
6940 align = aarch64_constant_alignment (exp, align);
6941 }
6942 else if (SYMBOL_REF_DECL (sym))
6943 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6944 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6945 && SYMBOL_REF_BLOCK (sym) != NULL)
6946 align = SYMBOL_REF_BLOCK (sym)->alignment;
6947 else
6948 align = BITS_PER_UNIT;
6949
6950 poly_int64 ref_size = GET_MODE_SIZE (mode);
6951 if (known_eq (ref_size, 0))
6952 ref_size = GET_MODE_SIZE (DImode);
6953
6954 return (multiple_p (INTVAL (offs), ref_size)
6955 && multiple_p (align / BITS_PER_UNIT, ref_size));
6956 }
6957 }
6958 return false;
6959
6960 default:
6961 return false;
6962 }
6963 }
6964
6965 /* Return true if the address X is valid for a PRFM instruction.
6966 STRICT_P is true if we should do strict checking with
6967 aarch64_classify_address. */
6968
6969 bool
6970 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6971 {
6972 struct aarch64_address_info addr;
6973
6974 /* PRFM accepts the same addresses as DImode... */
6975 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6976 if (!res)
6977 return false;
6978
6979 /* ... except writeback forms. */
6980 return addr.type != ADDRESS_REG_WB;
6981 }
6982
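/* Return true if X is a symbolic address, i.e. a SYMBOL_REF or LABEL_REF,
   possibly wrapped in a CONST with a constant offset.  */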
6983 bool
6984 aarch64_symbolic_address_p (rtx x)
6985 {
6986 rtx offset;
6987
6988 split_const (x, &x, &offset);
6989 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6990 }
6991
6992 /* Classify the base of symbolic expression X. */
6993
6994 enum aarch64_symbol_type
6995 aarch64_classify_symbolic_expression (rtx x)
6996 {
6997 rtx offset;
6998
6999 split_const (x, &x, &offset);
7000 return aarch64_classify_symbol (x, INTVAL (offset));
7001 }
7002
7003
7004 /* Return TRUE if X is a legitimate address for accessing memory in
7005 mode MODE. */
7006 static bool
7007 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7008 {
7009 struct aarch64_address_info addr;
7010
7011 return aarch64_classify_address (&addr, x, mode, strict_p);
7012 }
7013
7014 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7015 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7016 bool
7017 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7018 aarch64_addr_query_type type)
7019 {
7020 struct aarch64_address_info addr;
7021
7022 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7023 }
7024
7025 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7026
7027 static bool
7028 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7029 poly_int64 orig_offset,
7030 machine_mode mode)
7031 {
7032 HOST_WIDE_INT size;
7033 if (GET_MODE_SIZE (mode).is_constant (&size))
7034 {
7035 HOST_WIDE_INT const_offset, second_offset;
7036
7037 /* A general SVE offset is A * VQ + B. Remove the A component from
7038 coefficient 0 in order to get the constant B. */
7039 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7040
7041 /* Split an out-of-range address displacement into a base and
7042 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7043 range otherwise to increase opportunities for sharing the base
7044 address of different sizes. Unaligned accesses use the signed
7045 9-bit range, TImode/TFmode use the intersection of signed
7046 scaled 7-bit and signed 9-bit offset. */
7047 if (mode == TImode || mode == TFmode)
7048 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7049 else if ((const_offset & (size - 1)) != 0)
7050 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7051 else
7052 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7053
7054 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7055 return false;
7056
7057 /* Split the offset into second_offset and the rest. */
7058 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7059 *offset2 = gen_int_mode (second_offset, Pmode);
7060 return true;
7061 }
7062 else
7063 {
7064 /* Get the mode we should use as the basis of the range. For structure
7065 modes this is the mode of one vector. */
7066 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7067 machine_mode step_mode
7068 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7069
7070 /* Get the "mul vl" multiplier we'd like to use. */
7071 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7072 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7073 if (vec_flags & VEC_SVE_DATA)
7074 /* LDR supports a 9-bit range, but the move patterns for
7075 structure modes require all vectors to be in range of the
7076 same base. The simplest way of accommodating that while still
7077 promoting reuse of anchor points between different modes is
7078 to use an 8-bit range unconditionally. */
7079 vnum = ((vnum + 128) & 255) - 128;
7080 else
7081 /* Predicates are only handled singly, so we might as well use
7082 the full range. */
7083 vnum = ((vnum + 256) & 511) - 256;
7084 if (vnum == 0)
7085 return false;
7086
7087 /* Convert the "mul vl" multiplier into a byte offset. */
7088 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7089 if (known_eq (second_offset, orig_offset))
7090 return false;
7091
7092 /* Split the offset into second_offset and the rest. */
7093 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7094 *offset2 = gen_int_mode (second_offset, Pmode);
7095 return true;
7096 }
7097 }
7098
7099 /* Return the binary representation of floating point constant VALUE in INTVAL.
7100 If the value cannot be converted, return false without setting INTVAL.
7101 The conversion is done in the given MODE. */
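/* Illustrative example: for the DFmode constant 1.0 this stores the IEEE
   double-precision bit pattern 0x3ff0000000000000 in *INTVAL and returns
   true; for 0.0 it stores 0 via the early exit below.  */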
7102 bool
7103 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7104 {
7105
7106 /* We make a general exception for 0. */
7107 if (aarch64_float_const_zero_rtx_p (value))
7108 {
7109 *intval = 0;
7110 return true;
7111 }
7112
7113 scalar_float_mode mode;
7114 if (GET_CODE (value) != CONST_DOUBLE
7115 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7116 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7117 /* Only support up to DF mode. */
7118 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7119 return false;
7120
7121 unsigned HOST_WIDE_INT ival = 0;
7122
7123 long res[2];
7124 real_to_target (res,
7125 CONST_DOUBLE_REAL_VALUE (value),
7126 REAL_MODE_FORMAT (mode));
7127
7128 if (mode == DFmode)
7129 {
7130 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7131 ival = zext_hwi (res[order], 32);
7132 ival |= (zext_hwi (res[1 - order], 32) << 32);
7133 }
7134 else
7135 ival = zext_hwi (res[0], 32);
7136
7137 *intval = ival;
7138 return true;
7139 }
7140
7141 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7142 single MOV(+MOVK) followed by an FMOV. */
7143 bool
7144 aarch64_float_const_rtx_p (rtx x)
7145 {
7146 machine_mode mode = GET_MODE (x);
7147 if (mode == VOIDmode)
7148 return false;
7149
7150 /* Determine whether it's cheaper to write float constants as
7151 mov/movk pairs rather than ldr/adrp pairs. */
7152 unsigned HOST_WIDE_INT ival;
7153
7154 if (GET_CODE (x) == CONST_DOUBLE
7155 && SCALAR_FLOAT_MODE_P (mode)
7156 && aarch64_reinterpret_float_as_int (x, &ival))
7157 {
7158 scalar_int_mode imode = (mode == HFmode
7159 ? SImode
7160 : int_mode_for_mode (mode).require ());
7161 int num_instr = aarch64_internal_mov_immediate
7162 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7163 return num_instr < 3;
7164 }
7165
7166 return false;
7167 }
7168
7169 /* Return TRUE if rtx X is the immediate constant 0.0. */
7170 bool
7171 aarch64_float_const_zero_rtx_p (rtx x)
7172 {
7173 if (GET_MODE (x) == VOIDmode)
7174 return false;
7175
7176 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7177 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7178 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7179 }
7180
7181 /* Return TRUE if rtx X is an immediate constant that fits in a single
7182 MOVI immediate operation. */
7183 bool
7184 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7185 {
7186 if (!TARGET_SIMD)
7187 return false;
7188
7189 machine_mode vmode;
7190 scalar_int_mode imode;
7191 unsigned HOST_WIDE_INT ival;
7192
7193 if (GET_CODE (x) == CONST_DOUBLE
7194 && SCALAR_FLOAT_MODE_P (mode))
7195 {
7196 if (!aarch64_reinterpret_float_as_int (x, &ival))
7197 return false;
7198
7199 /* We make a general exception for 0. */
7200 if (aarch64_float_const_zero_rtx_p (x))
7201 return true;
7202
7203 imode = int_mode_for_mode (mode).require ();
7204 }
7205 else if (GET_CODE (x) == CONST_INT
7206 && is_a <scalar_int_mode> (mode, &imode))
7207 ival = INTVAL (x);
7208 else
7209 return false;
7210
7211 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
7212 a 128-bit vector mode. */
7213 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7214
7215 vmode = aarch64_simd_container_mode (imode, width);
7216 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7217
7218 return aarch64_simd_valid_immediate (v_op, NULL);
7219 }
7220
7221
7222 /* Return the fixed registers used for condition codes. */
7223
7224 static bool
7225 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7226 {
7227 *p1 = CC_REGNUM;
7228 *p2 = INVALID_REGNUM;
7229 return true;
7230 }
7231
7232 /* This function is used by the call expanders of the machine description.
7233 RESULT is the register in which the result is returned. It's NULL for
7234 "call" and "sibcall".
7235 MEM is the location of the function call.
7236 SIBCALL indicates whether this function call is normal call or sibling call.
7237 It will generate different pattern accordingly. */
7238
7239 void
7240 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7241 {
7242 rtx call, callee, tmp;
7243 rtvec vec;
7244 machine_mode mode;
7245
7246 gcc_assert (MEM_P (mem));
7247 callee = XEXP (mem, 0);
7248 mode = GET_MODE (callee);
7249 gcc_assert (mode == Pmode);
7250
7251 /* Decide if we should generate indirect calls by loading the
7252 address of the callee into a register before performing
7253 the branch-and-link. */
7254 if (SYMBOL_REF_P (callee)
7255 ? (aarch64_is_long_call_p (callee)
7256 || aarch64_is_noplt_call_p (callee))
7257 : !REG_P (callee))
7258 XEXP (mem, 0) = force_reg (mode, callee);
7259
7260 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7261
7262 if (result != NULL_RTX)
7263 call = gen_rtx_SET (result, call);
7264
7265 if (sibcall)
7266 tmp = ret_rtx;
7267 else
7268 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7269
7270 vec = gen_rtvec (2, call, tmp);
7271 call = gen_rtx_PARALLEL (VOIDmode, vec);
7272
7273 aarch64_emit_call_insn (call);
7274 }
7275
7276 /* Emit call insn with PAT and do aarch64-specific handling. */
7277
7278 void
7279 aarch64_emit_call_insn (rtx pat)
7280 {
7281 rtx insn = emit_call_insn (pat);
7282
7283 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7284 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7285 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7286 }
7287
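/* Choose the condition-code mode to use for a comparison of X with Y using
   comparison code CODE (used via the SELECT_CC_MODE macro).  For example,
   an unsigned-overflow test such as (LTU (plus:DI x y) x) selects CC_Cmode,
   while most general integer comparisons fall through to plain CCmode.  */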
7288 machine_mode
7289 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7290 {
7291 machine_mode mode_x = GET_MODE (x);
7292 rtx_code code_x = GET_CODE (x);
7293
7294 /* All floating point compares return CCFP if it is an equality
7295 comparison, and CCFPE otherwise. */
7296 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7297 {
7298 switch (code)
7299 {
7300 case EQ:
7301 case NE:
7302 case UNORDERED:
7303 case ORDERED:
7304 case UNLT:
7305 case UNLE:
7306 case UNGT:
7307 case UNGE:
7308 case UNEQ:
7309 return CCFPmode;
7310
7311 case LT:
7312 case LE:
7313 case GT:
7314 case GE:
7315 case LTGT:
7316 return CCFPEmode;
7317
7318 default:
7319 gcc_unreachable ();
7320 }
7321 }
7322
7323 /* Equality comparisons of short modes against zero can be performed
7324 using the TST instruction with the appropriate bitmask. */
7325 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7326 && (code == EQ || code == NE)
7327 && (mode_x == HImode || mode_x == QImode))
7328 return CC_NZmode;
7329
7330 /* Similarly, comparisons of zero_extends from shorter modes can
7331 be performed using an ANDS with an immediate mask. */
7332 if (y == const0_rtx && code_x == ZERO_EXTEND
7333 && (mode_x == SImode || mode_x == DImode)
7334 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7335 && (code == EQ || code == NE))
7336 return CC_NZmode;
7337
7338 if ((mode_x == SImode || mode_x == DImode)
7339 && y == const0_rtx
7340 && (code == EQ || code == NE || code == LT || code == GE)
7341 && (code_x == PLUS || code_x == MINUS || code_x == AND
7342 || code_x == NEG
7343 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7344 && CONST_INT_P (XEXP (x, 2)))))
7345 return CC_NZmode;
7346
7347 /* A compare with a shifted operand. Because of canonicalization,
7348 the comparison will have to be swapped when we emit the assembly
7349 code. */
7350 if ((mode_x == SImode || mode_x == DImode)
7351 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7352 && (code_x == ASHIFT || code_x == ASHIFTRT
7353 || code_x == LSHIFTRT
7354 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7355 return CC_SWPmode;
7356
7357 /* Similarly for a negated operand, but we can only do this for
7358 equalities. */
7359 if ((mode_x == SImode || mode_x == DImode)
7360 && (REG_P (y) || GET_CODE (y) == SUBREG)
7361 && (code == EQ || code == NE)
7362 && code_x == NEG)
7363 return CC_Zmode;
7364
7365 /* A test for unsigned overflow from an addition. */
7366 if ((mode_x == DImode || mode_x == TImode)
7367 && (code == LTU || code == GEU)
7368 && code_x == PLUS
7369 && rtx_equal_p (XEXP (x, 0), y))
7370 return CC_Cmode;
7371
7372 /* A test for unsigned overflow from an add with carry. */
7373 if ((mode_x == DImode || mode_x == TImode)
7374 && (code == LTU || code == GEU)
7375 && code_x == PLUS
7376 && CONST_SCALAR_INT_P (y)
7377 && (rtx_mode_t (y, mode_x)
7378 == (wi::shwi (1, mode_x)
7379 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7380 return CC_ADCmode;
7381
7382 /* A test for signed overflow. */
7383 if ((mode_x == DImode || mode_x == TImode)
7384 && code == NE
7385 && code_x == PLUS
7386 && GET_CODE (y) == SIGN_EXTEND)
7387 return CC_Vmode;
7388
7389 /* For everything else, return CCmode. */
7390 return CCmode;
7391 }
7392
7393 static int
7394 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7395
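/* Return the AARCH64_* condition code corresponding to comparison rtx X,
   deriving a CC mode via SELECT_CC_MODE if the operands are not already in
   a MODE_CC mode.  Returns -1 if there is no matching condition code.  */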
7396 int
7397 aarch64_get_condition_code (rtx x)
7398 {
7399 machine_mode mode = GET_MODE (XEXP (x, 0));
7400 enum rtx_code comp_code = GET_CODE (x);
7401
7402 if (GET_MODE_CLASS (mode) != MODE_CC)
7403 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7404 return aarch64_get_condition_code_1 (mode, comp_code);
7405 }
7406
7407 static int
7408 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7409 {
7410 switch (mode)
7411 {
7412 case E_CCFPmode:
7413 case E_CCFPEmode:
7414 switch (comp_code)
7415 {
7416 case GE: return AARCH64_GE;
7417 case GT: return AARCH64_GT;
7418 case LE: return AARCH64_LS;
7419 case LT: return AARCH64_MI;
7420 case NE: return AARCH64_NE;
7421 case EQ: return AARCH64_EQ;
7422 case ORDERED: return AARCH64_VC;
7423 case UNORDERED: return AARCH64_VS;
7424 case UNLT: return AARCH64_LT;
7425 case UNLE: return AARCH64_LE;
7426 case UNGT: return AARCH64_HI;
7427 case UNGE: return AARCH64_PL;
7428 default: return -1;
7429 }
7430 break;
7431
7432 case E_CCmode:
7433 switch (comp_code)
7434 {
7435 case NE: return AARCH64_NE;
7436 case EQ: return AARCH64_EQ;
7437 case GE: return AARCH64_GE;
7438 case GT: return AARCH64_GT;
7439 case LE: return AARCH64_LE;
7440 case LT: return AARCH64_LT;
7441 case GEU: return AARCH64_CS;
7442 case GTU: return AARCH64_HI;
7443 case LEU: return AARCH64_LS;
7444 case LTU: return AARCH64_CC;
7445 default: return -1;
7446 }
7447 break;
7448
7449 case E_CC_SWPmode:
7450 switch (comp_code)
7451 {
7452 case NE: return AARCH64_NE;
7453 case EQ: return AARCH64_EQ;
7454 case GE: return AARCH64_LE;
7455 case GT: return AARCH64_LT;
7456 case LE: return AARCH64_GE;
7457 case LT: return AARCH64_GT;
7458 case GEU: return AARCH64_LS;
7459 case GTU: return AARCH64_CC;
7460 case LEU: return AARCH64_CS;
7461 case LTU: return AARCH64_HI;
7462 default: return -1;
7463 }
7464 break;
7465
7466 case E_CC_NZCmode:
7467 switch (comp_code)
7468 {
7469 case NE: return AARCH64_NE; /* = any */
7470 case EQ: return AARCH64_EQ; /* = none */
7471 case GE: return AARCH64_PL; /* = nfrst */
7472 case LT: return AARCH64_MI; /* = first */
7473 case GEU: return AARCH64_CS; /* = nlast */
7474 case GTU: return AARCH64_HI; /* = pmore */
7475 case LEU: return AARCH64_LS; /* = plast */
7476 case LTU: return AARCH64_CC; /* = last */
7477 default: return -1;
7478 }
7479 break;
7480
7481 case E_CC_NZmode:
7482 switch (comp_code)
7483 {
7484 case NE: return AARCH64_NE;
7485 case EQ: return AARCH64_EQ;
7486 case GE: return AARCH64_PL;
7487 case LT: return AARCH64_MI;
7488 default: return -1;
7489 }
7490 break;
7491
7492 case E_CC_Zmode:
7493 switch (comp_code)
7494 {
7495 case NE: return AARCH64_NE;
7496 case EQ: return AARCH64_EQ;
7497 default: return -1;
7498 }
7499 break;
7500
7501 case E_CC_Cmode:
7502 switch (comp_code)
7503 {
7504 case LTU: return AARCH64_CS;
7505 case GEU: return AARCH64_CC;
7506 default: return -1;
7507 }
7508 break;
7509
7510 case E_CC_ADCmode:
7511 switch (comp_code)
7512 {
7513 case GEU: return AARCH64_CS;
7514 case LTU: return AARCH64_CC;
7515 default: return -1;
7516 }
7517 break;
7518
7519 case E_CC_Vmode:
7520 switch (comp_code)
7521 {
7522 case NE: return AARCH64_VS;
7523 case EQ: return AARCH64_VC;
7524 default: return -1;
7525 }
7526 break;
7527
7528 default:
7529 return -1;
7530 }
7531
7532 return -1;
7533 }
7534
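/* Return true if X is a duplicated constant vector whose (single) element
   is a CONST_INT in the range [MINVAL, MAXVAL].  */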
7535 bool
7536 aarch64_const_vec_all_same_in_range_p (rtx x,
7537 HOST_WIDE_INT minval,
7538 HOST_WIDE_INT maxval)
7539 {
7540 rtx elt;
7541 return (const_vec_duplicate_p (x, &elt)
7542 && CONST_INT_P (elt)
7543 && IN_RANGE (INTVAL (elt), minval, maxval));
7544 }
7545
7546 bool
7547 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7548 {
7549 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7550 }
7551
7552 /* Return true if VEC is a constant in which every element is in the range
7553 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7554
7555 static bool
7556 aarch64_const_vec_all_in_range_p (rtx vec,
7557 HOST_WIDE_INT minval,
7558 HOST_WIDE_INT maxval)
7559 {
7560 if (GET_CODE (vec) != CONST_VECTOR
7561 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7562 return false;
7563
7564 int nunits;
7565 if (!CONST_VECTOR_STEPPED_P (vec))
7566 nunits = const_vector_encoded_nelts (vec);
7567 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7568 return false;
7569
7570 for (int i = 0; i < nunits; i++)
7571 {
7572 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7573 if (!CONST_INT_P (vec_elem)
7574 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7575 return false;
7576 }
7577 return true;
7578 }
7579
7580 /* N Z C V. */
7581 #define AARCH64_CC_V 1
7582 #define AARCH64_CC_C (1 << 1)
7583 #define AARCH64_CC_Z (1 << 2)
7584 #define AARCH64_CC_N (1 << 3)
7585
7586 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7587 static const int aarch64_nzcv_codes[] =
7588 {
7589 0, /* EQ, Z == 1. */
7590 AARCH64_CC_Z, /* NE, Z == 0. */
7591 0, /* CS, C == 1. */
7592 AARCH64_CC_C, /* CC, C == 0. */
7593 0, /* MI, N == 1. */
7594 AARCH64_CC_N, /* PL, N == 0. */
7595 0, /* VS, V == 1. */
7596 AARCH64_CC_V, /* VC, V == 0. */
7597 0, /* HI, C == 1 && Z == 0. */
7598 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7599 AARCH64_CC_V, /* GE, N == V. */
7600 0, /* LT, N != V. */
7601 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7602 0, /* LE, !(Z == 0 && N == V). */
7603 0, /* AL, Any. */
7604 0 /* NV, Any. */
7605 };
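/* Each entry above encodes a flag setting under which the corresponding
   condition is false (e.g. the GE entry sets V, making N != V, and the NE
   entry sets Z); the 'k' operand modifier below prints these values for
   conditional compare instructions.  */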
7606
7607 /* Print floating-point vector immediate operand X to F, negating it
7608 first if NEGATE is true. Return true on success, false if it isn't
7609 a constant we can handle. */
7610
7611 static bool
7612 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7613 {
7614 rtx elt;
7615
7616 if (!const_vec_duplicate_p (x, &elt))
7617 return false;
7618
7619 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7620 if (negate)
7621 r = real_value_negate (&r);
7622
7623 /* We only handle the SVE single-bit immediates here. */
7624 if (real_equal (&r, &dconst0))
7625 asm_fprintf (f, "0.0");
7626 else if (real_equal (&r, &dconst1))
7627 asm_fprintf (f, "1.0");
7628 else if (real_equal (&r, &dconsthalf))
7629 asm_fprintf (f, "0.5");
7630 else
7631 return false;
7632
7633 return true;
7634 }
7635
7636 /* Return the equivalent letter for size. */
7637 static char
7638 sizetochar (int size)
7639 {
7640 switch (size)
7641 {
7642 case 64: return 'd';
7643 case 32: return 's';
7644 case 16: return 'h';
7645 case 8 : return 'b';
7646 default: gcc_unreachable ();
7647 }
7648 }
7649
7650 /* Print operand X to file F in a target specific manner according to CODE.
7651 The acceptable formatting commands given by CODE are:
7652 'c': An integer or symbol address without a preceding #
7653 sign.
7654 'C': Take the duplicated element in a vector constant
7655 and print it in hex.
7656 'D': Take the duplicated element in a vector constant
7657 and print it as an unsigned integer, in decimal.
7658 'e': Print the sign/zero-extend size as a character 8->b,
7659 16->h, 32->w.
7660 'p': Prints N such that 2^N == X (X must be power of 2 and
7661 const int).
7662 'P': Print the number of non-zero bits in X (a const_int).
7663 'H': Print the higher numbered register of a pair (TImode)
7664 of regs.
7665 'm': Print a condition (eq, ne, etc).
7666 'M': Same as 'm', but invert condition.
7667 'N': Take the duplicated element in a vector constant
7668 and print the negative of it in decimal.
7669 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7670 'S/T/U/V': Print a FP/SIMD register name for a register list.
7671 The register printed is the FP/SIMD register name
7672 of X + 0/1/2/3 for S/T/U/V.
7673 'R': Print a scalar FP/SIMD register name + 1.
7674 'X': Print bottom 16 bits of integer constant in hex.
7675 'w/x': Print a general register name or the zero register
7676 (32-bit or 64-bit).
7677 '0': Print a normal operand, if it's a general register,
7678 then we assume DImode.
7679 'k': Print NZCV for conditional compare instructions.
7680 'A': Output address constant representing the first
7681 argument of X, specifying a relocation offset
7682 if appropriate.
7683 'L': Output constant address specified by X
7684 with a relocation offset if appropriate.
7685 'G': Prints address of X, specifying a PC relative
7686 relocation mode if appropriate.
7687 'y': Output address of LDP or STP - this is used for
7688 some LDP/STPs which don't use a PARALLEL in their
7689 pattern (so the mode needs to be adjusted).
7690 'z': Output address of a typical LDP or STP. */
7691
7692 static void
7693 aarch64_print_operand (FILE *f, rtx x, int code)
7694 {
7695 rtx elt;
7696 switch (code)
7697 {
7698 case 'c':
7699 switch (GET_CODE (x))
7700 {
7701 case CONST_INT:
7702 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7703 break;
7704
7705 case SYMBOL_REF:
7706 output_addr_const (f, x);
7707 break;
7708
7709 case CONST:
7710 if (GET_CODE (XEXP (x, 0)) == PLUS
7711 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7712 {
7713 output_addr_const (f, x);
7714 break;
7715 }
7716 /* Fall through. */
7717
7718 default:
7719 output_operand_lossage ("unsupported operand for code '%c'", code);
7720 }
7721 break;
7722
7723 case 'e':
7724 {
7725 int n;
7726
7727 if (!CONST_INT_P (x)
7728 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7729 {
7730 output_operand_lossage ("invalid operand for '%%%c'", code);
7731 return;
7732 }
7733
7734 switch (n)
7735 {
7736 case 3:
7737 fputc ('b', f);
7738 break;
7739 case 4:
7740 fputc ('h', f);
7741 break;
7742 case 5:
7743 fputc ('w', f);
7744 break;
7745 default:
7746 output_operand_lossage ("invalid operand for '%%%c'", code);
7747 return;
7748 }
7749 }
7750 break;
7751
7752 case 'p':
7753 {
7754 int n;
7755
7756 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7757 {
7758 output_operand_lossage ("invalid operand for '%%%c'", code);
7759 return;
7760 }
7761
7762 asm_fprintf (f, "%d", n);
7763 }
7764 break;
7765
7766 case 'P':
7767 if (!CONST_INT_P (x))
7768 {
7769 output_operand_lossage ("invalid operand for '%%%c'", code);
7770 return;
7771 }
7772
7773 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7774 break;
7775
7776 case 'H':
7777 if (x == const0_rtx)
7778 {
7779 asm_fprintf (f, "xzr");
7780 break;
7781 }
7782
7783 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7784 {
7785 output_operand_lossage ("invalid operand for '%%%c'", code);
7786 return;
7787 }
7788
7789 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7790 break;
7791
7792 case 'M':
7793 case 'm':
7794 {
7795 int cond_code;
7796 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7797 if (x == const_true_rtx)
7798 {
7799 if (code == 'M')
7800 fputs ("nv", f);
7801 return;
7802 }
7803
7804 if (!COMPARISON_P (x))
7805 {
7806 output_operand_lossage ("invalid operand for '%%%c'", code);
7807 return;
7808 }
7809
7810 cond_code = aarch64_get_condition_code (x);
7811 gcc_assert (cond_code >= 0);
7812 if (code == 'M')
7813 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7814 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
7815 fputs (aarch64_sve_condition_codes[cond_code], f);
7816 else
7817 fputs (aarch64_condition_codes[cond_code], f);
7818 }
7819 break;
7820
7821 case 'N':
7822 if (!const_vec_duplicate_p (x, &elt))
7823 {
7824 output_operand_lossage ("invalid vector constant");
7825 return;
7826 }
7827
7828 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7829 asm_fprintf (f, "%wd", -INTVAL (elt));
7830 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7831 && aarch64_print_vector_float_operand (f, x, true))
7832 ;
7833 else
7834 {
7835 output_operand_lossage ("invalid vector constant");
7836 return;
7837 }
7838 break;
7839
7840 case 'b':
7841 case 'h':
7842 case 's':
7843 case 'd':
7844 case 'q':
7845 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7846 {
7847 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7848 return;
7849 }
7850 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7851 break;
7852
7853 case 'S':
7854 case 'T':
7855 case 'U':
7856 case 'V':
7857 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7858 {
7859 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7860 return;
7861 }
7862 asm_fprintf (f, "%c%d",
7863 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7864 REGNO (x) - V0_REGNUM + (code - 'S'));
7865 break;
7866
7867 case 'R':
7868 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7869 {
7870 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7871 return;
7872 }
7873 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7874 break;
7875
7876 case 'X':
7877 if (!CONST_INT_P (x))
7878 {
7879 output_operand_lossage ("invalid operand for '%%%c'", code);
7880 return;
7881 }
7882 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7883 break;
7884
7885 case 'C':
7886 {
7887 /* Print a replicated constant in hex. */
7888 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7889 {
7890 output_operand_lossage ("invalid operand for '%%%c'", code);
7891 return;
7892 }
7893 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7894 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7895 }
7896 break;
7897
7898 case 'D':
7899 {
7900 /* Print a replicated constant in decimal, treating it as
7901 unsigned. */
7902 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7903 {
7904 output_operand_lossage ("invalid operand for '%%%c'", code);
7905 return;
7906 }
7907 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7908 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7909 }
7910 break;
7911
7912 case 'w':
7913 case 'x':
7914 if (x == const0_rtx
7915 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7916 {
7917 asm_fprintf (f, "%czr", code);
7918 break;
7919 }
7920
7921 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7922 {
7923 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7924 break;
7925 }
7926
7927 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7928 {
7929 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7930 break;
7931 }
7932
7933 /* Fall through */
7934
7935 case 0:
7936 if (x == NULL)
7937 {
7938 output_operand_lossage ("missing operand");
7939 return;
7940 }
7941
7942 switch (GET_CODE (x))
7943 {
7944 case REG:
7945 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7946 {
7947 if (REG_NREGS (x) == 1)
7948 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7949 else
7950 {
7951 char suffix
7952 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7953 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7954 REGNO (x) - V0_REGNUM, suffix,
7955 END_REGNO (x) - V0_REGNUM - 1, suffix);
7956 }
7957 }
7958 else
7959 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7960 break;
7961
7962 case MEM:
7963 output_address (GET_MODE (x), XEXP (x, 0));
7964 break;
7965
7966 case LABEL_REF:
7967 case SYMBOL_REF:
7968 output_addr_const (asm_out_file, x);
7969 break;
7970
7971 case CONST_INT:
7972 asm_fprintf (f, "%wd", INTVAL (x));
7973 break;
7974
7975 case CONST:
7976 if (!VECTOR_MODE_P (GET_MODE (x)))
7977 {
7978 output_addr_const (asm_out_file, x);
7979 break;
7980 }
7981 /* fall through */
7982
7983 case CONST_VECTOR:
7984 if (!const_vec_duplicate_p (x, &elt))
7985 {
7986 output_operand_lossage ("invalid vector constant");
7987 return;
7988 }
7989
7990 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7991 asm_fprintf (f, "%wd", INTVAL (elt));
7992 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7993 && aarch64_print_vector_float_operand (f, x, false))
7994 ;
7995 else
7996 {
7997 output_operand_lossage ("invalid vector constant");
7998 return;
7999 }
8000 break;
8001
8002 case CONST_DOUBLE:
8003 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8004 be getting CONST_DOUBLEs holding integers. */
8005 gcc_assert (GET_MODE (x) != VOIDmode);
8006 if (aarch64_float_const_zero_rtx_p (x))
8007 {
8008 fputc ('0', f);
8009 break;
8010 }
8011 else if (aarch64_float_const_representable_p (x))
8012 {
8013 #define buf_size 20
8014 char float_buf[buf_size] = {'\0'};
8015 real_to_decimal_for_mode (float_buf,
8016 CONST_DOUBLE_REAL_VALUE (x),
8017 buf_size, buf_size,
8018 1, GET_MODE (x));
8019 asm_fprintf (asm_out_file, "%s", float_buf);
8020 break;
8021 #undef buf_size
8022 }
8023 output_operand_lossage ("invalid constant");
8024 return;
8025 default:
8026 output_operand_lossage ("invalid operand");
8027 return;
8028 }
8029 break;
8030
8031 case 'A':
8032 if (GET_CODE (x) == HIGH)
8033 x = XEXP (x, 0);
8034
8035 switch (aarch64_classify_symbolic_expression (x))
8036 {
8037 case SYMBOL_SMALL_GOT_4G:
8038 asm_fprintf (asm_out_file, ":got:");
8039 break;
8040
8041 case SYMBOL_SMALL_TLSGD:
8042 asm_fprintf (asm_out_file, ":tlsgd:");
8043 break;
8044
8045 case SYMBOL_SMALL_TLSDESC:
8046 asm_fprintf (asm_out_file, ":tlsdesc:");
8047 break;
8048
8049 case SYMBOL_SMALL_TLSIE:
8050 asm_fprintf (asm_out_file, ":gottprel:");
8051 break;
8052
8053 case SYMBOL_TLSLE24:
8054 asm_fprintf (asm_out_file, ":tprel:");
8055 break;
8056
8057 case SYMBOL_TINY_GOT:
8058 gcc_unreachable ();
8059 break;
8060
8061 default:
8062 break;
8063 }
8064 output_addr_const (asm_out_file, x);
8065 break;
8066
8067 case 'L':
8068 switch (aarch64_classify_symbolic_expression (x))
8069 {
8070 case SYMBOL_SMALL_GOT_4G:
8071 asm_fprintf (asm_out_file, ":lo12:");
8072 break;
8073
8074 case SYMBOL_SMALL_TLSGD:
8075 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8076 break;
8077
8078 case SYMBOL_SMALL_TLSDESC:
8079 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8080 break;
8081
8082 case SYMBOL_SMALL_TLSIE:
8083 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8084 break;
8085
8086 case SYMBOL_TLSLE12:
8087 asm_fprintf (asm_out_file, ":tprel_lo12:");
8088 break;
8089
8090 case SYMBOL_TLSLE24:
8091 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8092 break;
8093
8094 case SYMBOL_TINY_GOT:
8095 asm_fprintf (asm_out_file, ":got:");
8096 break;
8097
8098 case SYMBOL_TINY_TLSIE:
8099 asm_fprintf (asm_out_file, ":gottprel:");
8100 break;
8101
8102 default:
8103 break;
8104 }
8105 output_addr_const (asm_out_file, x);
8106 break;
8107
8108 case 'G':
8109 switch (aarch64_classify_symbolic_expression (x))
8110 {
8111 case SYMBOL_TLSLE24:
8112 asm_fprintf (asm_out_file, ":tprel_hi12:");
8113 break;
8114 default:
8115 break;
8116 }
8117 output_addr_const (asm_out_file, x);
8118 break;
8119
8120 case 'k':
8121 {
8122 HOST_WIDE_INT cond_code;
8123
8124 if (!CONST_INT_P (x))
8125 {
8126 output_operand_lossage ("invalid operand for '%%%c'", code);
8127 return;
8128 }
8129
8130 cond_code = INTVAL (x);
8131 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8132 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8133 }
8134 break;
8135
8136 case 'y':
8137 case 'z':
8138 {
8139 machine_mode mode = GET_MODE (x);
8140
8141 if (GET_CODE (x) != MEM
8142 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8143 {
8144 output_operand_lossage ("invalid operand for '%%%c'", code);
8145 return;
8146 }
8147
8148 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8149 code == 'y'
8150 ? ADDR_QUERY_LDP_STP_N
8151 : ADDR_QUERY_LDP_STP))
8152 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8153 }
8154 break;
8155
8156 default:
8157 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8158 return;
8159 }
8160 }
8161
8162 /* Print address 'x' of a memory access with mode 'mode'.
8163 TYPE is the aarch64_addr_query_type context required by aarch64_classify_address,
8164 e.g. ADDR_QUERY_ANY for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
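/* Illustrative outputs: a register-plus-immediate address prints as
   "[x0, 16]", a scaled register offset as "[x0, x1, lsl 3]", and an SVE
   vector offset as "[x0, #2, mul vl]".  */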
8165 static bool
8166 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8167 aarch64_addr_query_type type)
8168 {
8169 struct aarch64_address_info addr;
8170 unsigned int size;
8171
8172 /* Check all addresses are Pmode - including ILP32. */
8173 if (GET_MODE (x) != Pmode
8174 && (!CONST_INT_P (x)
8175 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8176 {
8177 output_operand_lossage ("invalid address mode");
8178 return false;
8179 }
8180
8181 if (aarch64_classify_address (&addr, x, mode, true, type))
8182 switch (addr.type)
8183 {
8184 case ADDRESS_REG_IMM:
8185 if (known_eq (addr.const_offset, 0))
8186 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8187 else if (aarch64_sve_data_mode_p (mode))
8188 {
8189 HOST_WIDE_INT vnum
8190 = exact_div (addr.const_offset,
8191 BYTES_PER_SVE_VECTOR).to_constant ();
8192 asm_fprintf (f, "[%s, #%wd, mul vl]",
8193 reg_names[REGNO (addr.base)], vnum);
8194 }
8195 else if (aarch64_sve_pred_mode_p (mode))
8196 {
8197 HOST_WIDE_INT vnum
8198 = exact_div (addr.const_offset,
8199 BYTES_PER_SVE_PRED).to_constant ();
8200 asm_fprintf (f, "[%s, #%wd, mul vl]",
8201 reg_names[REGNO (addr.base)], vnum);
8202 }
8203 else
8204 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8205 INTVAL (addr.offset));
8206 return true;
8207
8208 case ADDRESS_REG_REG:
8209 if (addr.shift == 0)
8210 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8211 reg_names [REGNO (addr.offset)]);
8212 else
8213 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8214 reg_names [REGNO (addr.offset)], addr.shift);
8215 return true;
8216
8217 case ADDRESS_REG_UXTW:
8218 if (addr.shift == 0)
8219 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8220 REGNO (addr.offset) - R0_REGNUM);
8221 else
8222 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8223 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8224 return true;
8225
8226 case ADDRESS_REG_SXTW:
8227 if (addr.shift == 0)
8228 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8229 REGNO (addr.offset) - R0_REGNUM);
8230 else
8231 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8232 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8233 return true;
8234
8235 case ADDRESS_REG_WB:
8236 /* Writeback is only supported for fixed-width modes. */
8237 size = GET_MODE_SIZE (mode).to_constant ();
8238 switch (GET_CODE (x))
8239 {
8240 case PRE_INC:
8241 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8242 return true;
8243 case POST_INC:
8244 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8245 return true;
8246 case PRE_DEC:
8247 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8248 return true;
8249 case POST_DEC:
8250 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8251 return true;
8252 case PRE_MODIFY:
8253 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8254 INTVAL (addr.offset));
8255 return true;
8256 case POST_MODIFY:
8257 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8258 INTVAL (addr.offset));
8259 return true;
8260 default:
8261 break;
8262 }
8263 break;
8264
8265 case ADDRESS_LO_SUM:
8266 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8267 output_addr_const (f, addr.offset);
8268 asm_fprintf (f, "]");
8269 return true;
8270
8271 case ADDRESS_SYMBOLIC:
8272 output_addr_const (f, x);
8273 return true;
8274 }
8275
8276 return false;
8277 }
8278
8279 /* Print address 'x' of a memory access with mode 'mode'. */
8280 static void
8281 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8282 {
8283 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8284 output_addr_const (f, x);
8285 }
8286
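/* Return nonzero if X mentions a label via a LABEL_REF, recursing through
   its operands but ignoring the LABEL_REFs found in UNSPEC_TLS operands,
   which are constant offsets rather than symbols.  */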
8287 bool
8288 aarch64_label_mentioned_p (rtx x)
8289 {
8290 const char *fmt;
8291 int i;
8292
8293 if (GET_CODE (x) == LABEL_REF)
8294 return true;
8295
8296 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8297 referencing instruction, but they are constant offsets, not
8298 symbols. */
8299 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8300 return false;
8301
8302 fmt = GET_RTX_FORMAT (GET_CODE (x));
8303 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8304 {
8305 if (fmt[i] == 'E')
8306 {
8307 int j;
8308
8309 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8310 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8311 return 1;
8312 }
8313 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8314 return 1;
8315 }
8316
8317 return 0;
8318 }
8319
8320 /* Implement REGNO_REG_CLASS. */
8321
8322 enum reg_class
8323 aarch64_regno_regclass (unsigned regno)
8324 {
8325 if (GP_REGNUM_P (regno))
8326 return GENERAL_REGS;
8327
8328 if (regno == SP_REGNUM)
8329 return STACK_REG;
8330
8331 if (regno == FRAME_POINTER_REGNUM
8332 || regno == ARG_POINTER_REGNUM)
8333 return POINTER_REGS;
8334
8335 if (FP_REGNUM_P (regno))
8336 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
8337 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
8338
8339 if (PR_REGNUM_P (regno))
8340 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8341
8342 return NO_REGS;
8343 }
8344
8345 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8346 If OFFSET is out of range, return an offset of an anchor point
8347 that is in range. Return 0 otherwise. */
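/* Illustrative example: for an SImode access at OFFSET 0x12345 (not a
   multiple of the access size), the anchor returned is
   (0x12345 + 0x100) & ~0x1ff = 0x12400, leaving a residual offset of
   -0xbb, which fits the signed 9-bit unscaled range.  */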
8348
8349 static HOST_WIDE_INT
8350 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8351 machine_mode mode)
8352 {
8353 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8354 if (size > 16)
8355 return (offset + 0x400) & ~0x7f0;
8356
8357 /* For offsets that aren't a multiple of the access size, the limit is
8358 -256...255. */
8359 if (offset & (size - 1))
8360 {
8361 /* BLKmode typically uses LDP of X-registers. */
8362 if (mode == BLKmode)
8363 return (offset + 512) & ~0x3ff;
8364 return (offset + 0x100) & ~0x1ff;
8365 }
8366
8367 /* Small negative offsets are supported. */
8368 if (IN_RANGE (offset, -256, 0))
8369 return 0;
8370
8371 if (mode == TImode || mode == TFmode)
8372 return (offset + 0x100) & ~0x1ff;
8373
8374 /* Otherwise use the 12-bit unsigned offset range, scaled by the access size. */
8375 return offset & (~0xfff * size);
8376 }
8377
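/* Implement TARGET_LEGITIMIZE_ADDRESS.  */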
8378 static rtx
8379 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8380 {
8381 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8382 where mask is selected by alignment and size of the offset.
8383 We try to pick as large a range for the offset as possible to
8384 maximize the chance of a CSE. However, for aligned addresses
8385 we limit the range to 4k so that structures with different sized
8386 elements are likely to use the same base. We need to be careful
8387 not to split a CONST for some forms of address expression, otherwise
8388 it will generate sub-optimal code. */
8389
8390 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8391 {
8392 rtx base = XEXP (x, 0);
8393 rtx offset_rtx = XEXP (x, 1);
8394 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8395
8396 if (GET_CODE (base) == PLUS)
8397 {
8398 rtx op0 = XEXP (base, 0);
8399 rtx op1 = XEXP (base, 1);
8400
8401 /* Force any scaling into a temp for CSE. */
8402 op0 = force_reg (Pmode, op0);
8403 op1 = force_reg (Pmode, op1);
8404
8405 /* Let the pointer register be in op0. */
8406 if (REG_POINTER (op1))
8407 std::swap (op0, op1);
8408
8409 /* If the pointer is virtual or frame related, then we know that
8410 virtual register instantiation or register elimination is going
8411 to apply a second constant. We want the two constants folded
8412 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8413 if (virt_or_elim_regno_p (REGNO (op0)))
8414 {
8415 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8416 NULL_RTX, true, OPTAB_DIRECT);
8417 return gen_rtx_PLUS (Pmode, base, op1);
8418 }
8419
8420 /* Otherwise, in order to encourage CSE (and thence loop strength
8421 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8422 base = expand_binop (Pmode, add_optab, op0, op1,
8423 NULL_RTX, true, OPTAB_DIRECT);
8424 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8425 }
8426
8427 HOST_WIDE_INT size;
8428 if (GET_MODE_SIZE (mode).is_constant (&size))
8429 {
8430 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8431 mode);
8432 if (base_offset != 0)
8433 {
8434 base = plus_constant (Pmode, base, base_offset);
8435 base = force_operand (base, NULL_RTX);
8436 return plus_constant (Pmode, base, offset - base_offset);
8437 }
8438 }
8439 }
8440
8441 return x;
8442 }
8443
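/* Implement TARGET_SECONDARY_RELOAD.  Return the register class required as
   an intermediate (e.g. FP_REGS for a 16-byte memory access into
   GENERAL_REGS), or NO_REGS, setting SRI->icode when a dedicated reload
   pattern is needed instead.  */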
8444 static reg_class_t
8445 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8446 reg_class_t rclass,
8447 machine_mode mode,
8448 secondary_reload_info *sri)
8449 {
8450 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8451 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8452 comment at the head of aarch64-sve.md for more details about the
8453 big-endian handling. */
8454 if (BYTES_BIG_ENDIAN
8455 && reg_class_subset_p (rclass, FP_REGS)
8456 && !((REG_P (x) && HARD_REGISTER_P (x))
8457 || aarch64_simd_valid_immediate (x, NULL))
8458 && aarch64_sve_data_mode_p (mode))
8459 {
8460 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8461 return NO_REGS;
8462 }
8463
8464 /* If we have to disable direct literal pool loads and stores because the
8465 function is too big, then we need a scratch register. */
8466 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8467 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8468 || targetm.vector_mode_supported_p (GET_MODE (x)))
8469 && !aarch64_pcrelative_literal_loads)
8470 {
8471 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8472 return NO_REGS;
8473 }
8474
8475 /* Without the TARGET_SIMD instructions we cannot move a Q register
8476 to a Q register directly. We need a scratch. */
8477 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8478 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8479 && reg_class_subset_p (rclass, FP_REGS))
8480 {
8481 sri->icode = code_for_aarch64_reload_mov (mode);
8482 return NO_REGS;
8483 }
8484
8485 /* A TFmode or TImode memory access should be handled via an FP register
8486 because AArch64 has richer addressing modes for LDR/STR instructions
8487 than for LDP/STP instructions. */
8488 if (TARGET_FLOAT && rclass == GENERAL_REGS
8489 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8490 return FP_REGS;
8491
8492 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8493 return GENERAL_REGS;
8494
8495 return NO_REGS;
8496 }
8497
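/* Implement TARGET_CAN_ELIMINATE.  */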
8498 static bool
8499 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8500 {
8501 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8502
8503 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8504 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8505 if (frame_pointer_needed)
8506 return to == HARD_FRAME_POINTER_REGNUM;
8507 return true;
8508 }
8509
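/* Return the offset between eliminable registers FROM and TO, based on the
   frame layout recorded in cfun->machine->frame (used by
   INITIAL_ELIMINATION_OFFSET).  */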
8510 poly_int64
8511 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8512 {
8513 if (to == HARD_FRAME_POINTER_REGNUM)
8514 {
8515 if (from == ARG_POINTER_REGNUM)
8516 return cfun->machine->frame.hard_fp_offset;
8517
8518 if (from == FRAME_POINTER_REGNUM)
8519 return cfun->machine->frame.hard_fp_offset
8520 - cfun->machine->frame.locals_offset;
8521 }
8522
8523 if (to == STACK_POINTER_REGNUM)
8524 {
8525 if (from == FRAME_POINTER_REGNUM)
8526 return cfun->machine->frame.frame_size
8527 - cfun->machine->frame.locals_offset;
8528 }
8529
8530 return cfun->machine->frame.frame_size;
8531 }
8532
8533 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8534 previous frame. */
8535
8536 rtx
8537 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8538 {
8539 if (count != 0)
8540 return const0_rtx;
8541 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8542 }
8543
8544
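/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.  Output the trampoline code:
   an optional BTI landing pad, loads of IP1 and the static chain register
   from the literal words that follow, and an indirect branch through IP1.  */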
8545 static void
8546 aarch64_asm_trampoline_template (FILE *f)
8547 {
8548 int offset1 = 16;
8549 int offset2 = 20;
8550
8551 if (aarch64_bti_enabled ())
8552 {
8553 asm_fprintf (f, "\thint\t34 // bti c\n");
8554 offset1 -= 4;
8555 offset2 -= 4;
8556 }
8557
8558 if (TARGET_ILP32)
8559 {
8560 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8561 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8562 offset1);
8563 }
8564 else
8565 {
8566 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8567 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8568 offset2);
8569 }
8570 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8571
8572 /* The trampoline needs an extra padding instruction. If BTI is
8573 enabled, the padding instruction is replaced by the BTI instruction at
8574 the beginning. */
8575 if (!aarch64_bti_enabled ())
8576 assemble_aligned_integer (4, const0_rtx);
8577
8578 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8579 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8580 }
8581
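/* Implement TARGET_TRAMPOLINE_INIT.  Copy the code template into M_TRAMP,
   store FNADDR and CHAIN_VALUE in the two trailing pointer-sized slots, then
   call __clear_cache on the whole trampoline.  */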
8582 static void
8583 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8584 {
8585 rtx fnaddr, mem, a_tramp;
8586 const int tramp_code_sz = 16;
8587
8588 /* Don't need to copy the trailing D-words, we fill those in below. */
8589 emit_block_move (m_tramp, assemble_trampoline_template (),
8590 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8591 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8592 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8593 if (GET_MODE (fnaddr) != ptr_mode)
8594 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8595 emit_move_insn (mem, fnaddr);
8596
8597 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8598 emit_move_insn (mem, chain_value);
8599
8600 /* XXX We should really define a "clear_cache" pattern and use
8601 gen_clear_cache(). */
8602 a_tramp = XEXP (m_tramp, 0);
8603 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8604 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8605 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8606 ptr_mode);
8607 }
8608
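/* Implement TARGET_CLASS_MAX_NREGS: return the maximum number of registers
   of class REGCLASS needed to hold a value of mode MODE.  */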
8609 static unsigned char
8610 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8611 {
8612 /* ??? Logically we should only need to provide a value when
8613 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8614 can hold MODE, but at the moment we need to handle all modes.
8615 Just ignore any runtime parts for registers that can't store them. */
8616 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8617 unsigned int nregs;
8618 switch (regclass)
8619 {
8620 case TAILCALL_ADDR_REGS:
8621 case POINTER_REGS:
8622 case GENERAL_REGS:
8623 case ALL_REGS:
8624 case POINTER_AND_FP_REGS:
8625 case FP_REGS:
8626 case FP_LO_REGS:
8627 case FP_LO8_REGS:
8628 if (aarch64_sve_data_mode_p (mode)
8629 && constant_multiple_p (GET_MODE_SIZE (mode),
8630 BYTES_PER_SVE_VECTOR, &nregs))
8631 return nregs;
8632 return (aarch64_vector_data_mode_p (mode)
8633 ? CEIL (lowest_size, UNITS_PER_VREG)
8634 : CEIL (lowest_size, UNITS_PER_WORD));
8635 case STACK_REG:
8636 case PR_REGS:
8637 case PR_LO_REGS:
8638 case PR_HI_REGS:
8639 return 1;
8640
8641 case NO_REGS:
8642 return 0;
8643
8644 default:
8645 break;
8646 }
8647 gcc_unreachable ();
8648 }
8649
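/* Implement TARGET_PREFERRED_RELOAD_CLASS.  */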
8650 static reg_class_t
8651 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8652 {
8653 if (regclass == POINTER_REGS)
8654 return GENERAL_REGS;
8655
8656 if (regclass == STACK_REG)
8657 {
8658 if (REG_P(x)
8659 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8660 return regclass;
8661
8662 return NO_REGS;
8663 }
8664
8665 /* Register elimination can result in a request for
8666 SP+constant->FP_REGS. We cannot support such operations, which
8667 use SP as source and an FP_REG as destination, so reject them
8668 right away. */
8669 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8670 {
8671 rtx lhs = XEXP (x, 0);
8672
8673 /* Look through a possible SUBREG introduced by ILP32. */
8674 if (GET_CODE (lhs) == SUBREG)
8675 lhs = SUBREG_REG (lhs);
8676
8677 gcc_assert (REG_P (lhs));
8678 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8679 POINTER_REGS));
8680 return NO_REGS;
8681 }
8682
8683 return regclass;
8684 }
8685
8686 void
8687 aarch64_asm_output_labelref (FILE* f, const char *name)
8688 {
8689 asm_fprintf (f, "%U%s", name);
8690 }
8691
8692 static void
8693 aarch64_elf_asm_constructor (rtx symbol, int priority)
8694 {
8695 if (priority == DEFAULT_INIT_PRIORITY)
8696 default_ctor_section_asm_out_constructor (symbol, priority);
8697 else
8698 {
8699 section *s;
8700 /* While priority is known to be in range [0, 65535], so 18 bytes
8701 would be enough, the compiler might not know that. To avoid
8702 -Wformat-truncation false positive, use a larger size. */
8703 char buf[23];
8704 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8705 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8706 switch_to_section (s);
8707 assemble_align (POINTER_SIZE);
8708 assemble_aligned_integer (POINTER_BYTES, symbol);
8709 }
8710 }
8711
8712 static void
8713 aarch64_elf_asm_destructor (rtx symbol, int priority)
8714 {
8715 if (priority == DEFAULT_INIT_PRIORITY)
8716 default_dtor_section_asm_out_destructor (symbol, priority);
8717 else
8718 {
8719 section *s;
8720 /* While priority is known to be in range [0, 65535], so 18 bytes
8721 would be enough, the compiler might not know that. To avoid
8722 -Wformat-truncation false positive, use a larger size. */
8723 char buf[23];
8724 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8725 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8726 switch_to_section (s);
8727 assemble_align (POINTER_SIZE);
8728 assemble_aligned_integer (POINTER_BYTES, symbol);
8729 }
8730 }
8731
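/* Output the assembly for a casesi jump-table dispatch: load the table entry
   selected by the index in operand 1, form the target address relative to an
   internal label placed at the table, and branch to it.  Illustrative output
   for a 1-byte table (register numbers depend on the operands):
       ldrb    w3, [x0, w1, uxtw]
       adr     x4, .Lrtx<N>
       add     x3, x4, w3, sxtb #2
       br      x3  */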
8732 const char*
8733 aarch64_output_casesi (rtx *operands)
8734 {
8735 char buf[100];
8736 char label[100];
8737 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8738 int index;
8739 static const char *const patterns[4][2] =
8740 {
8741 {
8742 "ldrb\t%w3, [%0,%w1,uxtw]",
8743 "add\t%3, %4, %w3, sxtb #2"
8744 },
8745 {
8746 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8747 "add\t%3, %4, %w3, sxth #2"
8748 },
8749 {
8750 "ldr\t%w3, [%0,%w1,uxtw #2]",
8751 "add\t%3, %4, %w3, sxtw #2"
8752 },
8753 /* We assume that DImode is only generated when not optimizing and
8754 that we don't really need 64-bit address offsets. That would
8755 imply an object file with 8GB of code in a single function! */
8756 {
8757 "ldr\t%w3, [%0,%w1,uxtw #2]",
8758 "add\t%3, %4, %w3, sxtw #2"
8759 }
8760 };
8761
8762 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8763
8764 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8765 index = exact_log2 (GET_MODE_SIZE (mode));
8766
8767 gcc_assert (index >= 0 && index <= 3);
8768
8769 /* Need to implement table size reduction, by changing the code below. */
8770 output_asm_insn (patterns[index][0], operands);
8771 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8772 snprintf (buf, sizeof (buf),
8773 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8774 output_asm_insn (buf, operands);
8775 output_asm_insn (patterns[index][1], operands);
8776 output_asm_insn ("br\t%3", operands);
8777 assemble_label (asm_out_file, label);
8778 return "";
8779 }
8780
8781
8782 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8783 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8784 operator. */
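/* For instance, aarch64_uxt_size (2, 0x3fc) returns 8, since 0xff << 2
   == 0x3fc describes a UXTB operand shifted left by 2.  */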
8785
8786 int
8787 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8788 {
8789 if (shift >= 0 && shift <= 3)
8790 {
8791 int size;
8792 for (size = 8; size <= 32; size *= 2)
8793 {
8794 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8795 if (mask == bits << shift)
8796 return size;
8797 }
8798 }
8799 return 0;
8800 }
8801
8802 /* Constant pools are per function only when PC-relative
8803 literal loads are enabled or we are in the large memory
8804 model. */
8805
8806 static inline bool
8807 aarch64_can_use_per_function_literal_pools_p (void)
8808 {
8809 return (aarch64_pcrelative_literal_loads
8810 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8811 }
8812
8813 static bool
8814 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8815 {
8816 /* We can't use blocks for constants when we're using a per-function
8817 constant pool. */
8818 return !aarch64_can_use_per_function_literal_pools_p ();
8819 }
8820
8821 /* Select appropriate section for constants depending
8822 on where we place literal pools. */
8823
8824 static section *
8825 aarch64_select_rtx_section (machine_mode mode,
8826 rtx x,
8827 unsigned HOST_WIDE_INT align)
8828 {
8829 if (aarch64_can_use_per_function_literal_pools_p ())
8830 return function_section (current_function_decl);
8831
8832 return default_elf_select_rtx_section (mode, x, align);
8833 }
8834
8835 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8836 void
8837 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8838 HOST_WIDE_INT offset)
8839 {
8840 /* When using per-function literal pools, we must ensure that any code
8841 section is aligned to the minimal instruction length, lest we get
8842 errors from the assembler about "unaligned instructions". */
8843 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8844 ASM_OUTPUT_ALIGN (f, 2);
8845 }
8846
8847 /* Costs. */
8848
8849 /* Helper function for rtx cost calculation. Strip a shift expression
8850 from X. Returns the inner operand if successful, or the original
8851 expression on failure. */
8852 static rtx
8853 aarch64_strip_shift (rtx x)
8854 {
8855 rtx op = x;
8856
8857 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8858 we can convert both to ROR during final output. */
8859 if ((GET_CODE (op) == ASHIFT
8860 || GET_CODE (op) == ASHIFTRT
8861 || GET_CODE (op) == LSHIFTRT
8862 || GET_CODE (op) == ROTATERT
8863 || GET_CODE (op) == ROTATE)
8864 && CONST_INT_P (XEXP (op, 1)))
8865 return XEXP (op, 0);
8866
8867 if (GET_CODE (op) == MULT
8868 && CONST_INT_P (XEXP (op, 1))
8869 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8870 return XEXP (op, 0);
8871
8872 return x;
8873 }
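/* For illustration (pseudo-RTL, hypothetical operands): both
     (ashift:DI (reg:DI x0) (const_int 3))
   and its canonical multiply form
     (mult:DI (reg:DI x0) (const_int 8))
   strip down to (reg:DI x0), while a shift by a register amount is
   returned unchanged. */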
8874
8875 /* Helper function for rtx cost calculation. Strip an extend
8876 expression from X. Returns the inner operand if successful, or the
8877 original expression on failure. We deal with a number of possible
8878 canonicalization variations here. If STRIP_SHIFT is true, then
8879 we can strip off a shift also. */
8880 static rtx
8881 aarch64_strip_extend (rtx x, bool strip_shift)
8882 {
8883 scalar_int_mode mode;
8884 rtx op = x;
8885
8886 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8887 return op;
8888
8889 /* Zero and sign extraction of a widened value. */
8890 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8891 && XEXP (op, 2) == const0_rtx
8892 && GET_CODE (XEXP (op, 0)) == MULT
8893 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8894 XEXP (op, 1)))
8895 return XEXP (XEXP (op, 0), 0);
8896
8897 /* It can also be represented (for zero-extend) as an AND with an
8898 immediate. */
8899 if (GET_CODE (op) == AND
8900 && GET_CODE (XEXP (op, 0)) == MULT
8901 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8902 && CONST_INT_P (XEXP (op, 1))
8903 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8904 INTVAL (XEXP (op, 1))) != 0)
8905 return XEXP (XEXP (op, 0), 0);
8906
8907 /* Now handle extended register, as this may also have an optional
8908 left shift by 1..4. */
8909 if (strip_shift
8910 && GET_CODE (op) == ASHIFT
8911 && CONST_INT_P (XEXP (op, 1))
8912 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8913 op = XEXP (op, 0);
8914
8915 if (GET_CODE (op) == ZERO_EXTEND
8916 || GET_CODE (op) == SIGN_EXTEND)
8917 op = XEXP (op, 0);
8918
8919 if (op != x)
8920 return op;
8921
8922 return x;
8923 }
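/* A sketch of the extended-register case (pseudo-RTL, hypothetical
   operands): with STRIP_SHIFT true,
     (ashift:DI (zero_extend:DI (reg:SI w1)) (const_int 2))
   strips down to (reg:SI w1), matching the "extended register, LSL #2"
   operand form of ADD/SUB; with STRIP_SHIFT false the shift is kept
   and the expression is returned unchanged. */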
8924
8925 /* Return true iff CODE is a shift supported in combination
8926 with arithmetic instructions. */
8927
8928 static bool
8929 aarch64_shift_p (enum rtx_code code)
8930 {
8931 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8932 }
8933
8934
8935 /* Return true iff X is a cheap shift without a sign extend. */
8936
8937 static bool
8938 aarch64_cheap_mult_shift_p (rtx x)
8939 {
8940 rtx op0, op1;
8941
8942 op0 = XEXP (x, 0);
8943 op1 = XEXP (x, 1);
8944
8945 if (!(aarch64_tune_params.extra_tuning_flags
8946 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8947 return false;
8948
8949 if (GET_CODE (op0) == SIGN_EXTEND)
8950 return false;
8951
8952 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8953 && UINTVAL (op1) <= 4)
8954 return true;
8955
8956 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8957 return false;
8958
8959 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8960
8961 if (l2 > 0 && l2 <= 4)
8962 return true;
8963
8964 return false;
8965 }
8966
8967 /* Helper function for rtx cost calculation. Calculate the cost of
8968 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8969 Return the calculated cost of the expression, recursing manually in to
8970 operands where needed. */
8971
8972 static int
8973 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8974 {
8975 rtx op0, op1;
8976 const struct cpu_cost_table *extra_cost
8977 = aarch64_tune_params.insn_extra_cost;
8978 int cost = 0;
8979 bool compound_p = (outer == PLUS || outer == MINUS);
8980 machine_mode mode = GET_MODE (x);
8981
8982 gcc_checking_assert (code == MULT);
8983
8984 op0 = XEXP (x, 0);
8985 op1 = XEXP (x, 1);
8986
8987 if (VECTOR_MODE_P (mode))
8988 mode = GET_MODE_INNER (mode);
8989
8990 /* Integer multiply/fma. */
8991 if (GET_MODE_CLASS (mode) == MODE_INT)
8992 {
8993 /* The multiply will be canonicalized as a shift, cost it as such. */
8994 if (aarch64_shift_p (GET_CODE (x))
8995 || (CONST_INT_P (op1)
8996 && exact_log2 (INTVAL (op1)) > 0))
8997 {
8998 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8999 || GET_CODE (op0) == SIGN_EXTEND;
9000 if (speed)
9001 {
9002 if (compound_p)
9003 {
9004 /* If the shift is considered cheap,
9005 then don't add any cost. */
9006 if (aarch64_cheap_mult_shift_p (x))
9007 ;
9008 else if (REG_P (op1))
9009 /* ARITH + shift-by-register. */
9010 cost += extra_cost->alu.arith_shift_reg;
9011 else if (is_extend)
9012 /* ARITH + extended register. We don't have a cost field
9013 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9014 cost += extra_cost->alu.extend_arith;
9015 else
9016 /* ARITH + shift-by-immediate. */
9017 cost += extra_cost->alu.arith_shift;
9018 }
9019 else
9020 /* LSL (immediate). */
9021 cost += extra_cost->alu.shift;
9022
9023 }
9024 /* Strip extends as we will have costed them in the case above. */
9025 if (is_extend)
9026 op0 = aarch64_strip_extend (op0, true);
9027
9028 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9029
9030 return cost;
9031 }
9032
9033 /* MNEG or [US]MNEGL. Extract the NEG operand, mark the operation as
9034 a compound and let the cases below handle it. After all, MNEG is a
9035 special-case alias of MSUB. */
9036 if (GET_CODE (op0) == NEG)
9037 {
9038 op0 = XEXP (op0, 0);
9039 compound_p = true;
9040 }
9041
9042 /* Integer multiplies or FMAs have zero/sign extending variants. */
9043 if ((GET_CODE (op0) == ZERO_EXTEND
9044 && GET_CODE (op1) == ZERO_EXTEND)
9045 || (GET_CODE (op0) == SIGN_EXTEND
9046 && GET_CODE (op1) == SIGN_EXTEND))
9047 {
9048 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9049 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9050
9051 if (speed)
9052 {
9053 if (compound_p)
9054 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9055 cost += extra_cost->mult[0].extend_add;
9056 else
9057 /* MUL/SMULL/UMULL. */
9058 cost += extra_cost->mult[0].extend;
9059 }
9060
9061 return cost;
9062 }
9063
9064 /* This is either an integer multiply or a MADD. In both cases
9065 we want to recurse and cost the operands. */
9066 cost += rtx_cost (op0, mode, MULT, 0, speed);
9067 cost += rtx_cost (op1, mode, MULT, 1, speed);
9068
9069 if (speed)
9070 {
9071 if (compound_p)
9072 /* MADD/MSUB. */
9073 cost += extra_cost->mult[mode == DImode].add;
9074 else
9075 /* MUL. */
9076 cost += extra_cost->mult[mode == DImode].simple;
9077 }
9078
9079 return cost;
9080 }
9081 else
9082 {
9083 if (speed)
9084 {
9085 /* Floating-point FMA/FMUL can also support negations of the
9086 operands, unless the rounding mode is upward or downward in
9087 which case FNMUL is different from FMUL with operand negation. */
9088 bool neg0 = GET_CODE (op0) == NEG;
9089 bool neg1 = GET_CODE (op1) == NEG;
9090 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9091 {
9092 if (neg0)
9093 op0 = XEXP (op0, 0);
9094 if (neg1)
9095 op1 = XEXP (op1, 0);
9096 }
9097
9098 if (compound_p)
9099 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9100 cost += extra_cost->fp[mode == DFmode].fma;
9101 else
9102 /* FMUL/FNMUL. */
9103 cost += extra_cost->fp[mode == DFmode].mult;
9104 }
9105
9106 cost += rtx_cost (op0, mode, MULT, 0, speed);
9107 cost += rtx_cost (op1, mode, MULT, 1, speed);
9108 return cost;
9109 }
9110 }
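/* A worked example of the integer path above (field names refer to the
   tuning's cost table, so the actual numbers depend on the selected
   CPU): for (plus (mult (reg) (const_int 4)) (reg)) the MULT is treated
   as a shift by two inside a compound PLUS, so when optimizing for
   speed it is charged extra_cost->alu.arith_shift (costed like
   ADD Rd, Rn, Rm, LSL #2) plus the cost of the unshifted operand,
   assuming the tuning does not mark such shifts as free via
   AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND. */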
9111
9112 static int
9113 aarch64_address_cost (rtx x,
9114 machine_mode mode,
9115 addr_space_t as ATTRIBUTE_UNUSED,
9116 bool speed)
9117 {
9118 enum rtx_code c = GET_CODE (x);
9119 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9120 struct aarch64_address_info info;
9121 int cost = 0;
9122 info.shift = 0;
9123
9124 if (!aarch64_classify_address (&info, x, mode, false))
9125 {
9126 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9127 {
9128 /* This is a CONST or SYMBOL ref which will be split
9129 in a different way depending on the code model in use.
9130 Cost it through the generic infrastructure. */
9131 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9132 /* Divide through by the cost of one instruction to
9133 bring it to the same units as the address costs. */
9134 cost_symbol_ref /= COSTS_N_INSNS (1);
9135 /* The cost is then the cost of preparing the address,
9136 followed by an immediate (possibly 0) offset. */
9137 return cost_symbol_ref + addr_cost->imm_offset;
9138 }
9139 else
9140 {
9141 /* This is most likely a jump table from a case
9142 statement. */
9143 return addr_cost->register_offset;
9144 }
9145 }
9146
9147 switch (info.type)
9148 {
9149 case ADDRESS_LO_SUM:
9150 case ADDRESS_SYMBOLIC:
9151 case ADDRESS_REG_IMM:
9152 cost += addr_cost->imm_offset;
9153 break;
9154
9155 case ADDRESS_REG_WB:
9156 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9157 cost += addr_cost->pre_modify;
9158 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9159 cost += addr_cost->post_modify;
9160 else
9161 gcc_unreachable ();
9162
9163 break;
9164
9165 case ADDRESS_REG_REG:
9166 cost += addr_cost->register_offset;
9167 break;
9168
9169 case ADDRESS_REG_SXTW:
9170 cost += addr_cost->register_sextend;
9171 break;
9172
9173 case ADDRESS_REG_UXTW:
9174 cost += addr_cost->register_zextend;
9175 break;
9176
9177 default:
9178 gcc_unreachable ();
9179 }
9180
9181
9182 if (info.shift > 0)
9183 {
9184 /* For the sake of calculating the cost of the shifted register
9185 component, we can treat same sized modes in the same way. */
9186 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9187 cost += addr_cost->addr_scale_costs.hi;
9188 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9189 cost += addr_cost->addr_scale_costs.si;
9190 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9191 cost += addr_cost->addr_scale_costs.di;
9192 else
9193 /* We can't tell, or this is a 128-bit vector. */
9194 cost += addr_cost->addr_scale_costs.ti;
9195 }
9196
9197 return cost;
9198 }
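/* Two illustrative cases (hypothetical addresses; the numbers come from
   the tuning's addr_cost table): a register plus small immediate such
   as (plus (reg:DI) (const_int 16)) classifies as ADDRESS_REG_IMM and
   costs imm_offset, while a scaled index such as
   (plus (mult (reg:DI) (const_int 4)) (reg:DI)) used for an SImode
   access classifies as ADDRESS_REG_REG with a shift of two and costs
   register_offset + addr_scale_costs.si. */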
9199
9200 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9201 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9202 to be taken. */
9203
9204 int
9205 aarch64_branch_cost (bool speed_p, bool predictable_p)
9206 {
9207 /* When optimizing for speed, use the cost of unpredictable branches. */
9208 const struct cpu_branch_cost *branch_costs =
9209 aarch64_tune_params.branch_costs;
9210
9211 if (!speed_p || predictable_p)
9212 return branch_costs->predictable;
9213 else
9214 return branch_costs->unpredictable;
9215 }
9216
9217 /* Return true if the RTX X in mode MODE is a zero or sign extract
9218 usable in an ADD or SUB (extended register) instruction. */
9219 static bool
9220 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9221 {
9222 /* Catch add with a sign extract.
9223 This is add_<optab><mode>_multp2. */
9224 if (GET_CODE (x) == SIGN_EXTRACT
9225 || GET_CODE (x) == ZERO_EXTRACT)
9226 {
9227 rtx op0 = XEXP (x, 0);
9228 rtx op1 = XEXP (x, 1);
9229 rtx op2 = XEXP (x, 2);
9230
9231 if (GET_CODE (op0) == MULT
9232 && CONST_INT_P (op1)
9233 && op2 == const0_rtx
9234 && CONST_INT_P (XEXP (op0, 1))
9235 && aarch64_is_extend_from_extract (mode,
9236 XEXP (op0, 1),
9237 op1))
9238 {
9239 return true;
9240 }
9241 }
9242 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9243 No shift. */
9244 else if (GET_CODE (x) == SIGN_EXTEND
9245 || GET_CODE (x) == ZERO_EXTEND)
9246 return REG_P (XEXP (x, 0));
9247
9248 return false;
9249 }
9250
9251 static bool
9252 aarch64_frint_unspec_p (unsigned int u)
9253 {
9254 switch (u)
9255 {
9256 case UNSPEC_FRINTZ:
9257 case UNSPEC_FRINTP:
9258 case UNSPEC_FRINTM:
9259 case UNSPEC_FRINTA:
9260 case UNSPEC_FRINTN:
9261 case UNSPEC_FRINTX:
9262 case UNSPEC_FRINTI:
9263 return true;
9264
9265 default:
9266 return false;
9267 }
9268 }
9269
9270 /* Return true iff X is an rtx that will match an extr instruction
9271 i.e. as described in the *extr<mode>5_insn family of patterns.
9272 OP0 and OP1 will be set to the operands of the shifts involved
9273 on success and will be NULL_RTX otherwise. */
9274
9275 static bool
9276 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9277 {
9278 rtx op0, op1;
9279 scalar_int_mode mode;
9280 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9281 return false;
9282
9283 *res_op0 = NULL_RTX;
9284 *res_op1 = NULL_RTX;
9285
9286 if (GET_CODE (x) != IOR)
9287 return false;
9288
9289 op0 = XEXP (x, 0);
9290 op1 = XEXP (x, 1);
9291
9292 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9293 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9294 {
9295 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9296 if (GET_CODE (op1) == ASHIFT)
9297 std::swap (op0, op1);
9298
9299 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9300 return false;
9301
9302 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9303 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9304
9305 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9306 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9307 {
9308 *res_op0 = XEXP (op0, 0);
9309 *res_op1 = XEXP (op1, 0);
9310 return true;
9311 }
9312 }
9313
9314 return false;
9315 }
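/* For example (pseudo-RTL, hypothetical registers): in DImode
     (ior (ashift (reg x0) (const_int 48))
	  (lshiftrt (reg x1) (const_int 16)))
   matches because 48 + 16 == 64; it is equivalent to
   EXTR Xd, X0, X1, #16, and *RES_OP0 and *RES_OP1 are set to x0 and
   x1 respectively. */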
9316
9317 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9318 storing it in *COST. Result is true if the total cost of the operation
9319 has now been calculated. */
9320 static bool
9321 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9322 {
9323 rtx inner;
9324 rtx comparator;
9325 enum rtx_code cmpcode;
9326
9327 if (COMPARISON_P (op0))
9328 {
9329 inner = XEXP (op0, 0);
9330 comparator = XEXP (op0, 1);
9331 cmpcode = GET_CODE (op0);
9332 }
9333 else
9334 {
9335 inner = op0;
9336 comparator = const0_rtx;
9337 cmpcode = NE;
9338 }
9339
9340 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9341 {
9342 /* Conditional branch. */
9343 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9344 return true;
9345 else
9346 {
9347 if (cmpcode == NE || cmpcode == EQ)
9348 {
9349 if (comparator == const0_rtx)
9350 {
9351 /* TBZ/TBNZ/CBZ/CBNZ. */
9352 if (GET_CODE (inner) == ZERO_EXTRACT)
9353 /* TBZ/TBNZ. */
9354 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9355 ZERO_EXTRACT, 0, speed);
9356 else
9357 /* CBZ/CBNZ. */
9358 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9359
9360 return true;
9361 }
9362 }
9363 else if (cmpcode == LT || cmpcode == GE)
9364 {
9365 /* TBZ/TBNZ. */
9366 if (comparator == const0_rtx)
9367 return true;
9368 }
9369 }
9370 }
9371 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9372 {
9373 /* CCMP. */
9374 if (GET_CODE (op1) == COMPARE)
9375 {
9376 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9377 if (XEXP (op1, 1) == const0_rtx)
9378 *cost += 1;
9379 if (speed)
9380 {
9381 machine_mode mode = GET_MODE (XEXP (op1, 0));
9382 const struct cpu_cost_table *extra_cost
9383 = aarch64_tune_params.insn_extra_cost;
9384
9385 if (GET_MODE_CLASS (mode) == MODE_INT)
9386 *cost += extra_cost->alu.arith;
9387 else
9388 *cost += extra_cost->fp[mode == DFmode].compare;
9389 }
9390 return true;
9391 }
9392
9393 /* It's a conditional operation based on the status flags,
9394 so it must be some flavor of CSEL. */
9395
9396 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9397 if (GET_CODE (op1) == NEG
9398 || GET_CODE (op1) == NOT
9399 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9400 op1 = XEXP (op1, 0);
9401 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9402 {
9403 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9404 op1 = XEXP (op1, 0);
9405 op2 = XEXP (op2, 0);
9406 }
9407
9408 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9409 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9410 return true;
9411 }
9412
9413 /* We don't know what this is, cost all operands. */
9414 return false;
9415 }
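/* Illustrative cases (pseudo-RTL): a branch such as
     (if_then_else (eq (zero_extract (reg) (const_int 1) (const_int 5))
		       (const_int 0))
		   (label_ref ...) (pc))
   is costed as a TBZ, so only the register inside the ZERO_EXTRACT is
   recursed into; for a conditional select on the CC flags whose first
   arm is (neg (reg)), (not (reg)) or (plus (reg) (const_int 1)), the
   wrapper is stripped because CSNEG/CSINV/CSINC cost the same as a
   plain CSEL. */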
9416
9417 /* Check whether X is a bitfield operation of the form shift + extend that
9418 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9419 operand to which the bitfield operation is applied. Otherwise return
9420 NULL_RTX. */
9421
9422 static rtx
9423 aarch64_extend_bitfield_pattern_p (rtx x)
9424 {
9425 rtx_code outer_code = GET_CODE (x);
9426 machine_mode outer_mode = GET_MODE (x);
9427
9428 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9429 && outer_mode != SImode && outer_mode != DImode)
9430 return NULL_RTX;
9431
9432 rtx inner = XEXP (x, 0);
9433 rtx_code inner_code = GET_CODE (inner);
9434 machine_mode inner_mode = GET_MODE (inner);
9435 rtx op = NULL_RTX;
9436
9437 switch (inner_code)
9438 {
9439 case ASHIFT:
9440 if (CONST_INT_P (XEXP (inner, 1))
9441 && (inner_mode == QImode || inner_mode == HImode))
9442 op = XEXP (inner, 0);
9443 break;
9444 case LSHIFTRT:
9445 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9446 && (inner_mode == QImode || inner_mode == HImode))
9447 op = XEXP (inner, 0);
9448 break;
9449 case ASHIFTRT:
9450 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9451 && (inner_mode == QImode || inner_mode == HImode))
9452 op = XEXP (inner, 0);
9453 break;
9454 default:
9455 break;
9456 }
9457
9458 return op;
9459 }
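/* For example (pseudo-RTL):
     (zero_extend:SI (ashift:HI (reg:HI) (const_int 3)))
   returns the inner register, since the whole expression can be a
   single UBFIZ Wd, Ws, #3, #13, and
     (sign_extend:SI (ashiftrt:QI (reg:QI) (const_int 2)))
   maps onto SBFX. If the inner shift is already in SImode or DImode,
   NULL_RTX is returned and the extend is costed separately. */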
9460
9461 /* Return true if the mask and a shift amount from an RTX of the form
9462 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9463 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9464
9465 bool
9466 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9467 rtx shft_amnt)
9468 {
9469 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9470 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9471 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9472 && (INTVAL (mask)
9473 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9474 }
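/* Worked example (SImode, illustrative values): mask = 0xff00 with
   shft_amnt = 8 is accepted, since (0xff00 >> 8) + 1 == 0x100 is a
   power of two and no mask bit lies below bit 8; (x << 8) & 0xff00 is
   then a single UBFIZ Wd, Ws, #8, #8. mask = 0xff80 with the same
   shift is rejected because bit 7 of the mask lies below the shift
   amount. */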
9475
9476 /* Return true if the masks and a shift amount from an RTX of the form
9477 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9478 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
9479
9480 bool
9481 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
9482 unsigned HOST_WIDE_INT mask1,
9483 unsigned HOST_WIDE_INT shft_amnt,
9484 unsigned HOST_WIDE_INT mask2)
9485 {
9486 unsigned HOST_WIDE_INT t;
9487
9488 /* Verify that there is no overlap in what bits are set in the two masks. */
9489 if (mask1 != ~mask2)
9490 return false;
9491
9492 /* Verify that mask2 is not all zeros or ones. */
9493 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
9494 return false;
9495
9496 /* The shift amount should always be less than the mode size. */
9497 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
9498
9499 /* Verify that the mask being shifted is contiguous and would be in the
9500 least significant bits after shifting by shft_amnt. */
9501 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
9502 return (t == (t & -t));
9503 }
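/* Worked example (DImode, illustrative values): mask2 = 0xff00,
   shft_amnt = 8 and mask1 = ~0xff00 pass all three checks, and
   mask2 + (1 << 8) == 0x10000 is a power of two, so the masks describe
   a contiguous 8-bit field at bit 8 and the combination matches
   BFI Xd, Xn, #8, #8. A non-contiguous mask2 such as 0xf0f0 fails the
   final power-of-two test. */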
9504
9505 /* Calculate the cost of calculating X, storing it in *COST. Result
9506 is true if the total cost of the operation has now been calculated. */
9507 static bool
9508 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9509 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9510 {
9511 rtx op0, op1, op2;
9512 const struct cpu_cost_table *extra_cost
9513 = aarch64_tune_params.insn_extra_cost;
9514 int code = GET_CODE (x);
9515 scalar_int_mode int_mode;
9516
9517 /* By default, assume that everything has equivalent cost to the
9518 cheapest instruction. Any additional costs are applied as a delta
9519 above this default. */
9520 *cost = COSTS_N_INSNS (1);
9521
9522 switch (code)
9523 {
9524 case SET:
9525 /* The cost depends entirely on the operands to SET. */
9526 *cost = 0;
9527 op0 = SET_DEST (x);
9528 op1 = SET_SRC (x);
9529
9530 switch (GET_CODE (op0))
9531 {
9532 case MEM:
9533 if (speed)
9534 {
9535 rtx address = XEXP (op0, 0);
9536 if (VECTOR_MODE_P (mode))
9537 *cost += extra_cost->ldst.storev;
9538 else if (GET_MODE_CLASS (mode) == MODE_INT)
9539 *cost += extra_cost->ldst.store;
9540 else if (mode == SFmode)
9541 *cost += extra_cost->ldst.storef;
9542 else if (mode == DFmode)
9543 *cost += extra_cost->ldst.stored;
9544
9545 *cost +=
9546 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9547 0, speed));
9548 }
9549
9550 *cost += rtx_cost (op1, mode, SET, 1, speed);
9551 return true;
9552
9553 case SUBREG:
9554 if (! REG_P (SUBREG_REG (op0)))
9555 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9556
9557 /* Fall through. */
9558 case REG:
9559 /* The cost is one per vector-register copied. */
9560 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9561 {
9562 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9563 *cost = COSTS_N_INSNS (nregs);
9564 }
9565 /* const0_rtx is in general free, but we will use an
9566 instruction to set a register to 0. */
9567 else if (REG_P (op1) || op1 == const0_rtx)
9568 {
9569 /* The cost is 1 per register copied. */
9570 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9571 *cost = COSTS_N_INSNS (nregs);
9572 }
9573 else
9574 /* Cost is just the cost of the RHS of the set. */
9575 *cost += rtx_cost (op1, mode, SET, 1, speed);
9576 return true;
9577
9578 case ZERO_EXTRACT:
9579 case SIGN_EXTRACT:
9580 /* Bit-field insertion. Strip any redundant widening of
9581 the RHS to meet the width of the target. */
9582 if (GET_CODE (op1) == SUBREG)
9583 op1 = SUBREG_REG (op1);
9584 if ((GET_CODE (op1) == ZERO_EXTEND
9585 || GET_CODE (op1) == SIGN_EXTEND)
9586 && CONST_INT_P (XEXP (op0, 1))
9587 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9588 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9589 op1 = XEXP (op1, 0);
9590
9591 if (CONST_INT_P (op1))
9592 {
9593 /* MOV immediate is assumed to always be cheap. */
9594 *cost = COSTS_N_INSNS (1);
9595 }
9596 else
9597 {
9598 /* BFM. */
9599 if (speed)
9600 *cost += extra_cost->alu.bfi;
9601 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9602 }
9603
9604 return true;
9605
9606 default:
9607 /* We can't make sense of this, assume default cost. */
9608 *cost = COSTS_N_INSNS (1);
9609 return false;
9610 }
9611 return false;
9612
9613 case CONST_INT:
9614 /* If an instruction can incorporate a constant within the
9615 instruction, the instruction's expression avoids calling
9616 rtx_cost() on the constant. If rtx_cost() is called on a
9617 constant, then it is usually because the constant must be
9618 moved into a register by one or more instructions.
9619
9620 The exception is constant 0, which can be expressed
9621 as XZR/WZR and is therefore free. The exception to this is
9622 if we have (set (reg) (const0_rtx)) in which case we must cost
9623 the move. However, we can catch that when we cost the SET, so
9624 we don't need to consider that here. */
9625 if (x == const0_rtx)
9626 *cost = 0;
9627 else
9628 {
9629 /* To an approximation, building any other constant is
9630 proportionally expensive to the number of instructions
9631 required to build that constant. This is true whether we
9632 are compiling for SPEED or otherwise. */
9633 if (!is_a <scalar_int_mode> (mode, &int_mode))
9634 int_mode = word_mode;
9635 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9636 (NULL_RTX, x, false, int_mode));
9637 }
9638 return true;
9639
9640 case CONST_DOUBLE:
9641
9642 /* First determine number of instructions to do the move
9643 as an integer constant. */
9644 if (!aarch64_float_const_representable_p (x)
9645 && !aarch64_can_const_movi_rtx_p (x, mode)
9646 && aarch64_float_const_rtx_p (x))
9647 {
9648 unsigned HOST_WIDE_INT ival;
9649 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9650 gcc_assert (succeed);
9651
9652 scalar_int_mode imode = (mode == HFmode
9653 ? SImode
9654 : int_mode_for_mode (mode).require ());
9655 int ncost = aarch64_internal_mov_immediate
9656 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9657 *cost += COSTS_N_INSNS (ncost);
9658 return true;
9659 }
9660
9661 if (speed)
9662 {
9663 /* mov[df,sf]_aarch64. */
9664 if (aarch64_float_const_representable_p (x))
9665 /* FMOV (scalar immediate). */
9666 *cost += extra_cost->fp[mode == DFmode].fpconst;
9667 else if (!aarch64_float_const_zero_rtx_p (x))
9668 {
9669 /* This will be a load from memory. */
9670 if (mode == DFmode)
9671 *cost += extra_cost->ldst.loadd;
9672 else
9673 *cost += extra_cost->ldst.loadf;
9674 }
9675 else
9676 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9677 or MOV v0.s[0], wzr - neither of which are modeled by the
9678 cost tables. Just use the default cost. */
9679 {
9680 }
9681 }
9682
9683 return true;
9684
9685 case MEM:
9686 if (speed)
9687 {
9688 /* For loads we want the base cost of a load, plus an
9689 approximation for the additional cost of the addressing
9690 mode. */
9691 rtx address = XEXP (x, 0);
9692 if (VECTOR_MODE_P (mode))
9693 *cost += extra_cost->ldst.loadv;
9694 else if (GET_MODE_CLASS (mode) == MODE_INT)
9695 *cost += extra_cost->ldst.load;
9696 else if (mode == SFmode)
9697 *cost += extra_cost->ldst.loadf;
9698 else if (mode == DFmode)
9699 *cost += extra_cost->ldst.loadd;
9700
9701 *cost +=
9702 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9703 0, speed));
9704 }
9705
9706 return true;
9707
9708 case NEG:
9709 op0 = XEXP (x, 0);
9710
9711 if (VECTOR_MODE_P (mode))
9712 {
9713 if (speed)
9714 {
9715 /* FNEG. */
9716 *cost += extra_cost->vect.alu;
9717 }
9718 return false;
9719 }
9720
9721 if (GET_MODE_CLASS (mode) == MODE_INT)
9722 {
9723 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9724 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9725 {
9726 /* CSETM. */
9727 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9728 return true;
9729 }
9730
9731 /* Cost this as SUB wzr, X. */
9732 op0 = CONST0_RTX (mode);
9733 op1 = XEXP (x, 0);
9734 goto cost_minus;
9735 }
9736
9737 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9738 {
9739 /* Support (neg(fma...)) as a single instruction only if
9740 sign of zeros is unimportant. This matches the decision
9741 making in aarch64.md. */
9742 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9743 {
9744 /* FNMADD. */
9745 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9746 return true;
9747 }
9748 if (GET_CODE (op0) == MULT)
9749 {
9750 /* FNMUL. */
9751 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9752 return true;
9753 }
9754 if (speed)
9755 /* FNEG. */
9756 *cost += extra_cost->fp[mode == DFmode].neg;
9757 return false;
9758 }
9759
9760 return false;
9761
9762 case CLRSB:
9763 case CLZ:
9764 if (speed)
9765 {
9766 if (VECTOR_MODE_P (mode))
9767 *cost += extra_cost->vect.alu;
9768 else
9769 *cost += extra_cost->alu.clz;
9770 }
9771
9772 return false;
9773
9774 case COMPARE:
9775 op0 = XEXP (x, 0);
9776 op1 = XEXP (x, 1);
9777
9778 if (op1 == const0_rtx
9779 && GET_CODE (op0) == AND)
9780 {
9781 x = op0;
9782 mode = GET_MODE (op0);
9783 goto cost_logic;
9784 }
9785
9786 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9787 {
9788 /* TODO: A write to the CC flags possibly costs extra, this
9789 needs encoding in the cost tables. */
9790
9791 mode = GET_MODE (op0);
9792 /* ANDS. */
9793 if (GET_CODE (op0) == AND)
9794 {
9795 x = op0;
9796 goto cost_logic;
9797 }
9798
9799 if (GET_CODE (op0) == PLUS)
9800 {
9801 /* ADDS (and CMN alias). */
9802 x = op0;
9803 goto cost_plus;
9804 }
9805
9806 if (GET_CODE (op0) == MINUS)
9807 {
9808 /* SUBS. */
9809 x = op0;
9810 goto cost_minus;
9811 }
9812
9813 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9814 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9815 && CONST_INT_P (XEXP (op0, 2)))
9816 {
9817 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9818 Handle it here directly rather than going to cost_logic
9819 since we know the immediate generated for the TST is valid
9820 so we can avoid creating an intermediate rtx for it only
9821 for costing purposes. */
9822 if (speed)
9823 *cost += extra_cost->alu.logical;
9824
9825 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9826 ZERO_EXTRACT, 0, speed);
9827 return true;
9828 }
9829
9830 if (GET_CODE (op1) == NEG)
9831 {
9832 /* CMN. */
9833 if (speed)
9834 *cost += extra_cost->alu.arith;
9835
9836 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9837 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9838 return true;
9839 }
9840
9841 /* CMP.
9842
9843 Compare can freely swap the order of operands, and
9844 canonicalization puts the more complex operation first.
9845 But the integer MINUS logic expects the shift/extend
9846 operation in op1. */
9847 if (! (REG_P (op0)
9848 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9849 {
9850 op0 = XEXP (x, 1);
9851 op1 = XEXP (x, 0);
9852 }
9853 goto cost_minus;
9854 }
9855
9856 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9857 {
9858 /* FCMP. */
9859 if (speed)
9860 *cost += extra_cost->fp[mode == DFmode].compare;
9861
9862 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9863 {
9864 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9865 /* FCMP supports constant 0.0 for no extra cost. */
9866 return true;
9867 }
9868 return false;
9869 }
9870
9871 if (VECTOR_MODE_P (mode))
9872 {
9873 /* Vector compare. */
9874 if (speed)
9875 *cost += extra_cost->vect.alu;
9876
9877 if (aarch64_float_const_zero_rtx_p (op1))
9878 {
9879 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9880 cost. */
9881 return true;
9882 }
9883 return false;
9884 }
9885 return false;
9886
9887 case MINUS:
9888 {
9889 op0 = XEXP (x, 0);
9890 op1 = XEXP (x, 1);
9891
9892 cost_minus:
9893 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9894
9895 /* Detect valid immediates. */
9896 if ((GET_MODE_CLASS (mode) == MODE_INT
9897 || (GET_MODE_CLASS (mode) == MODE_CC
9898 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9899 && CONST_INT_P (op1)
9900 && aarch64_uimm12_shift (INTVAL (op1)))
9901 {
9902 if (speed)
9903 /* SUB(S) (immediate). */
9904 *cost += extra_cost->alu.arith;
9905 return true;
9906 }
9907
9908 /* Look for SUB (extended register). */
9909 if (is_a <scalar_int_mode> (mode, &int_mode)
9910 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9911 {
9912 if (speed)
9913 *cost += extra_cost->alu.extend_arith;
9914
9915 op1 = aarch64_strip_extend (op1, true);
9916 *cost += rtx_cost (op1, VOIDmode,
9917 (enum rtx_code) GET_CODE (op1), 0, speed);
9918 return true;
9919 }
9920
9921 rtx new_op1 = aarch64_strip_extend (op1, false);
9922
9923 /* Cost this as an FMA-alike operation. */
9924 if ((GET_CODE (new_op1) == MULT
9925 || aarch64_shift_p (GET_CODE (new_op1)))
9926 && code != COMPARE)
9927 {
9928 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9929 (enum rtx_code) code,
9930 speed);
9931 return true;
9932 }
9933
9934 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9935
9936 if (speed)
9937 {
9938 if (VECTOR_MODE_P (mode))
9939 {
9940 /* Vector SUB. */
9941 *cost += extra_cost->vect.alu;
9942 }
9943 else if (GET_MODE_CLASS (mode) == MODE_INT)
9944 {
9945 /* SUB(S). */
9946 *cost += extra_cost->alu.arith;
9947 }
9948 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9949 {
9950 /* FSUB. */
9951 *cost += extra_cost->fp[mode == DFmode].addsub;
9952 }
9953 }
9954 return true;
9955 }
9956
9957 case PLUS:
9958 {
9959 rtx new_op0;
9960
9961 op0 = XEXP (x, 0);
9962 op1 = XEXP (x, 1);
9963
9964 cost_plus:
9965 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9966 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9967 {
9968 /* CSINC. */
9969 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9970 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9971 return true;
9972 }
9973
9974 if (GET_MODE_CLASS (mode) == MODE_INT
9975 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9976 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9977 {
9978 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9979
9980 if (speed)
9981 /* ADD (immediate). */
9982 *cost += extra_cost->alu.arith;
9983 return true;
9984 }
9985
9986 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9987
9988 /* Look for ADD (extended register). */
9989 if (is_a <scalar_int_mode> (mode, &int_mode)
9990 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9991 {
9992 if (speed)
9993 *cost += extra_cost->alu.extend_arith;
9994
9995 op0 = aarch64_strip_extend (op0, true);
9996 *cost += rtx_cost (op0, VOIDmode,
9997 (enum rtx_code) GET_CODE (op0), 0, speed);
9998 return true;
9999 }
10000
10001 /* Strip any extend, leave shifts behind as we will
10002 cost them through mult_cost. */
10003 new_op0 = aarch64_strip_extend (op0, false);
10004
10005 if (GET_CODE (new_op0) == MULT
10006 || aarch64_shift_p (GET_CODE (new_op0)))
10007 {
10008 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10009 speed);
10010 return true;
10011 }
10012
10013 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10014
10015 if (speed)
10016 {
10017 if (VECTOR_MODE_P (mode))
10018 {
10019 /* Vector ADD. */
10020 *cost += extra_cost->vect.alu;
10021 }
10022 else if (GET_MODE_CLASS (mode) == MODE_INT)
10023 {
10024 /* ADD. */
10025 *cost += extra_cost->alu.arith;
10026 }
10027 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10028 {
10029 /* FADD. */
10030 *cost += extra_cost->fp[mode == DFmode].addsub;
10031 }
10032 }
10033 return true;
10034 }
10035
10036 case BSWAP:
10037 *cost = COSTS_N_INSNS (1);
10038
10039 if (speed)
10040 {
10041 if (VECTOR_MODE_P (mode))
10042 *cost += extra_cost->vect.alu;
10043 else
10044 *cost += extra_cost->alu.rev;
10045 }
10046 return false;
10047
10048 case IOR:
10049 if (aarch_rev16_p (x))
10050 {
10051 *cost = COSTS_N_INSNS (1);
10052
10053 if (speed)
10054 {
10055 if (VECTOR_MODE_P (mode))
10056 *cost += extra_cost->vect.alu;
10057 else
10058 *cost += extra_cost->alu.rev;
10059 }
10060 return true;
10061 }
10062
10063 if (aarch64_extr_rtx_p (x, &op0, &op1))
10064 {
10065 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10066 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10067 if (speed)
10068 *cost += extra_cost->alu.shift;
10069
10070 return true;
10071 }
10072 /* Fall through. */
10073 case XOR:
10074 case AND:
10075 cost_logic:
10076 op0 = XEXP (x, 0);
10077 op1 = XEXP (x, 1);
10078
10079 if (VECTOR_MODE_P (mode))
10080 {
10081 if (speed)
10082 *cost += extra_cost->vect.alu;
10083 return true;
10084 }
10085
10086 if (code == AND
10087 && GET_CODE (op0) == MULT
10088 && CONST_INT_P (XEXP (op0, 1))
10089 && CONST_INT_P (op1)
10090 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10091 INTVAL (op1)) != 0)
10092 {
10093 /* This is a UBFM/SBFM. */
10094 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10095 if (speed)
10096 *cost += extra_cost->alu.bfx;
10097 return true;
10098 }
10099
10100 if (is_int_mode (mode, &int_mode))
10101 {
10102 if (CONST_INT_P (op1))
10103 {
10104 /* We have a mask + shift version of a UBFIZ
10105 i.e. the *andim_ashift<mode>_bfiz pattern. */
10106 if (GET_CODE (op0) == ASHIFT
10107 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10108 XEXP (op0, 1)))
10109 {
10110 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10111 (enum rtx_code) code, 0, speed);
10112 if (speed)
10113 *cost += extra_cost->alu.bfx;
10114
10115 return true;
10116 }
10117 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10118 {
10119 /* We possibly get the immediate for free; this is not
10120 modelled. */
10121 *cost += rtx_cost (op0, int_mode,
10122 (enum rtx_code) code, 0, speed);
10123 if (speed)
10124 *cost += extra_cost->alu.logical;
10125
10126 return true;
10127 }
10128 }
10129 else
10130 {
10131 rtx new_op0 = op0;
10132
10133 /* Handle ORN, EON, or BIC. */
10134 if (GET_CODE (op0) == NOT)
10135 op0 = XEXP (op0, 0);
10136
10137 new_op0 = aarch64_strip_shift (op0);
10138
10139 /* If we had a shift on op0 then this is a logical-shift-
10140 by-register/immediate operation. Otherwise, this is just
10141 a logical operation. */
10142 if (speed)
10143 {
10144 if (new_op0 != op0)
10145 {
10146 /* Shift by immediate. */
10147 if (CONST_INT_P (XEXP (op0, 1)))
10148 *cost += extra_cost->alu.log_shift;
10149 else
10150 *cost += extra_cost->alu.log_shift_reg;
10151 }
10152 else
10153 *cost += extra_cost->alu.logical;
10154 }
10155
10156 /* In both cases we want to cost both operands. */
10157 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10158 0, speed);
10159 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10160 1, speed);
10161
10162 return true;
10163 }
10164 }
10165 return false;
10166
10167 case NOT:
10168 x = XEXP (x, 0);
10169 op0 = aarch64_strip_shift (x);
10170
10171 if (VECTOR_MODE_P (mode))
10172 {
10173 /* Vector NOT. */
10174 *cost += extra_cost->vect.alu;
10175 return false;
10176 }
10177
10178 /* MVN-shifted-reg. */
10179 if (op0 != x)
10180 {
10181 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10182
10183 if (speed)
10184 *cost += extra_cost->alu.log_shift;
10185
10186 return true;
10187 }
10188 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10189 Handle the second form here taking care that 'a' in the above can
10190 be a shift. */
10191 else if (GET_CODE (op0) == XOR)
10192 {
10193 rtx newop0 = XEXP (op0, 0);
10194 rtx newop1 = XEXP (op0, 1);
10195 rtx op0_stripped = aarch64_strip_shift (newop0);
10196
10197 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10198 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10199
10200 if (speed)
10201 {
10202 if (op0_stripped != newop0)
10203 *cost += extra_cost->alu.log_shift;
10204 else
10205 *cost += extra_cost->alu.logical;
10206 }
10207
10208 return true;
10209 }
10210 /* MVN. */
10211 if (speed)
10212 *cost += extra_cost->alu.logical;
10213
10214 return false;
10215
10216 case ZERO_EXTEND:
10217
10218 op0 = XEXP (x, 0);
10219 /* If a value is written in SI mode, then zero extended to DI
10220 mode, the operation will in general be free as a write to
10221 a 'w' register implicitly zeroes the upper bits of an 'x'
10222 register. However, if this is
10223
10224 (set (reg) (zero_extend (reg)))
10225
10226 we must cost the explicit register move. */
10227 if (mode == DImode
10228 && GET_MODE (op0) == SImode
10229 && outer == SET)
10230 {
10231 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10232
10233 /* If OP_COST is non-zero, then the cost of the zero extend
10234 is effectively the cost of the inner operation. Otherwise
10235 we have a MOV instruction and we take the cost from the MOV
10236 itself. This is true independently of whether we are
10237 optimizing for space or time. */
10238 if (op_cost)
10239 *cost = op_cost;
10240
10241 return true;
10242 }
10243 else if (MEM_P (op0))
10244 {
10245 /* All loads can zero extend to any size for free. */
10246 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10247 return true;
10248 }
10249
10250 op0 = aarch64_extend_bitfield_pattern_p (x);
10251 if (op0)
10252 {
10253 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10254 if (speed)
10255 *cost += extra_cost->alu.bfx;
10256 return true;
10257 }
10258
10259 if (speed)
10260 {
10261 if (VECTOR_MODE_P (mode))
10262 {
10263 /* UMOV. */
10264 *cost += extra_cost->vect.alu;
10265 }
10266 else
10267 {
10268 /* We generate an AND instead of UXTB/UXTH. */
10269 *cost += extra_cost->alu.logical;
10270 }
10271 }
10272 return false;
10273
10274 case SIGN_EXTEND:
10275 if (MEM_P (XEXP (x, 0)))
10276 {
10277 /* LDRSH. */
10278 if (speed)
10279 {
10280 rtx address = XEXP (XEXP (x, 0), 0);
10281 *cost += extra_cost->ldst.load_sign_extend;
10282
10283 *cost +=
10284 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10285 0, speed));
10286 }
10287 return true;
10288 }
10289
10290 op0 = aarch64_extend_bitfield_pattern_p (x);
10291 if (op0)
10292 {
10293 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10294 if (speed)
10295 *cost += extra_cost->alu.bfx;
10296 return true;
10297 }
10298
10299 if (speed)
10300 {
10301 if (VECTOR_MODE_P (mode))
10302 *cost += extra_cost->vect.alu;
10303 else
10304 *cost += extra_cost->alu.extend;
10305 }
10306 return false;
10307
10308 case ASHIFT:
10309 op0 = XEXP (x, 0);
10310 op1 = XEXP (x, 1);
10311
10312 if (CONST_INT_P (op1))
10313 {
10314 if (speed)
10315 {
10316 if (VECTOR_MODE_P (mode))
10317 {
10318 /* Vector shift (immediate). */
10319 *cost += extra_cost->vect.alu;
10320 }
10321 else
10322 {
10323 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
10324 aliases. */
10325 *cost += extra_cost->alu.shift;
10326 }
10327 }
10328
10329 /* We can incorporate zero/sign extend for free. */
10330 if (GET_CODE (op0) == ZERO_EXTEND
10331 || GET_CODE (op0) == SIGN_EXTEND)
10332 op0 = XEXP (op0, 0);
10333
10334 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10335 return true;
10336 }
10337 else
10338 {
10339 if (VECTOR_MODE_P (mode))
10340 {
10341 if (speed)
10342 /* Vector shift (register). */
10343 *cost += extra_cost->vect.alu;
10344 }
10345 else
10346 {
10347 if (speed)
10348 /* LSLV. */
10349 *cost += extra_cost->alu.shift_reg;
10350
10351 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10352 && CONST_INT_P (XEXP (op1, 1))
10353 && known_eq (INTVAL (XEXP (op1, 1)),
10354 GET_MODE_BITSIZE (mode) - 1))
10355 {
10356 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10357 /* We already demanded XEXP (op1, 0) to be REG_P, so
10358 don't recurse into it. */
10359 return true;
10360 }
10361 }
10362 return false; /* All arguments need to be in registers. */
10363 }
10364
10365 case ROTATE:
10366 case ROTATERT:
10367 case LSHIFTRT:
10368 case ASHIFTRT:
10369 op0 = XEXP (x, 0);
10370 op1 = XEXP (x, 1);
10371
10372 if (CONST_INT_P (op1))
10373 {
10374 /* ASR (immediate) and friends. */
10375 if (speed)
10376 {
10377 if (VECTOR_MODE_P (mode))
10378 *cost += extra_cost->vect.alu;
10379 else
10380 *cost += extra_cost->alu.shift;
10381 }
10382
10383 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10384 return true;
10385 }
10386 else
10387 {
10388 if (VECTOR_MODE_P (mode))
10389 {
10390 if (speed)
10391 /* Vector shift (register). */
10392 *cost += extra_cost->vect.alu;
10393 }
10394 else
10395 {
10396 if (speed)
10397 /* ASR (register) and friends. */
10398 *cost += extra_cost->alu.shift_reg;
10399
10400 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10401 && CONST_INT_P (XEXP (op1, 1))
10402 && known_eq (INTVAL (XEXP (op1, 1)),
10403 GET_MODE_BITSIZE (mode) - 1))
10404 {
10405 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10406 /* We already demanded XEXP (op1, 0) to be REG_P, so
10407 don't recurse into it. */
10408 return true;
10409 }
10410 }
10411 return false; /* All arguments need to be in registers. */
10412 }
10413
10414 case SYMBOL_REF:
10415
10416 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10417 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10418 {
10419 /* LDR. */
10420 if (speed)
10421 *cost += extra_cost->ldst.load;
10422 }
10423 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10424 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10425 {
10426 /* ADRP, followed by ADD. */
10427 *cost += COSTS_N_INSNS (1);
10428 if (speed)
10429 *cost += 2 * extra_cost->alu.arith;
10430 }
10431 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10432 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10433 {
10434 /* ADR. */
10435 if (speed)
10436 *cost += extra_cost->alu.arith;
10437 }
10438
10439 if (flag_pic)
10440 {
10441 /* One extra load instruction, after accessing the GOT. */
10442 *cost += COSTS_N_INSNS (1);
10443 if (speed)
10444 *cost += extra_cost->ldst.load;
10445 }
10446 return true;
10447
10448 case HIGH:
10449 case LO_SUM:
10450 /* ADRP/ADD (immediate). */
10451 if (speed)
10452 *cost += extra_cost->alu.arith;
10453 return true;
10454
10455 case ZERO_EXTRACT:
10456 case SIGN_EXTRACT:
10457 /* UBFX/SBFX. */
10458 if (speed)
10459 {
10460 if (VECTOR_MODE_P (mode))
10461 *cost += extra_cost->vect.alu;
10462 else
10463 *cost += extra_cost->alu.bfx;
10464 }
10465
10466 /* We can trust that the immediates used will be correct (there
10467 are no by-register forms), so we need only cost op0. */
10468 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10469 return true;
10470
10471 case MULT:
10472 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10473 /* aarch64_rtx_mult_cost always handles recursion to its
10474 operands. */
10475 return true;
10476
10477 case MOD:
10478 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10479 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
10480 an unconditional negate. This case should only ever be reached through
10481 the set_smod_pow2_cheap check in expmed.c. */
10482 if (CONST_INT_P (XEXP (x, 1))
10483 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10484 && (mode == SImode || mode == DImode))
10485 {
10486 /* We expand to 4 instructions. Reset the baseline. */
10487 *cost = COSTS_N_INSNS (4);
10488
10489 if (speed)
10490 *cost += 2 * extra_cost->alu.logical
10491 + 2 * extra_cost->alu.arith;
10492
10493 return true;
10494 }
10495
10496 /* Fall through. */
10497 case UMOD:
10498 if (speed)
10499 {
10500 /* Slightly prefer UMOD over SMOD. */
10501 if (VECTOR_MODE_P (mode))
10502 *cost += extra_cost->vect.alu;
10503 else if (GET_MODE_CLASS (mode) == MODE_INT)
10504 *cost += (extra_cost->mult[mode == DImode].add
10505 + extra_cost->mult[mode == DImode].idiv
10506 + (code == MOD ? 1 : 0));
10507 }
10508 return false; /* All arguments need to be in registers. */
10509
10510 case DIV:
10511 case UDIV:
10512 case SQRT:
10513 if (speed)
10514 {
10515 if (VECTOR_MODE_P (mode))
10516 *cost += extra_cost->vect.alu;
10517 else if (GET_MODE_CLASS (mode) == MODE_INT)
10518 /* There is no integer SQRT, so only DIV and UDIV can get
10519 here. */
10520 *cost += (extra_cost->mult[mode == DImode].idiv
10521 /* Slightly prefer UDIV over SDIV. */
10522 + (code == DIV ? 1 : 0));
10523 else
10524 *cost += extra_cost->fp[mode == DFmode].div;
10525 }
10526 return false; /* All arguments need to be in registers. */
10527
10528 case IF_THEN_ELSE:
10529 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10530 XEXP (x, 2), cost, speed);
10531
10532 case EQ:
10533 case NE:
10534 case GT:
10535 case GTU:
10536 case LT:
10537 case LTU:
10538 case GE:
10539 case GEU:
10540 case LE:
10541 case LEU:
10542
10543 return false; /* All arguments must be in registers. */
10544
10545 case FMA:
10546 op0 = XEXP (x, 0);
10547 op1 = XEXP (x, 1);
10548 op2 = XEXP (x, 2);
10549
10550 if (speed)
10551 {
10552 if (VECTOR_MODE_P (mode))
10553 *cost += extra_cost->vect.alu;
10554 else
10555 *cost += extra_cost->fp[mode == DFmode].fma;
10556 }
10557
10558 /* FMSUB, FNMADD, and FNMSUB are free. */
10559 if (GET_CODE (op0) == NEG)
10560 op0 = XEXP (op0, 0);
10561
10562 if (GET_CODE (op2) == NEG)
10563 op2 = XEXP (op2, 0);
10564
10565 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10566 and the by-element operand as operand 0. */
10567 if (GET_CODE (op1) == NEG)
10568 op1 = XEXP (op1, 0);
10569
10570 /* Catch vector-by-element operations. The by-element operand can
10571 either be (vec_duplicate (vec_select (x))) or just
10572 (vec_select (x)), depending on whether we are multiplying by
10573 a vector or a scalar.
10574
10575 Canonicalization is not very good in these cases, FMA4 will put the
10576 by-element operand as operand 0, FNMA4 will have it as operand 1. */
10577 if (GET_CODE (op0) == VEC_DUPLICATE)
10578 op0 = XEXP (op0, 0);
10579 else if (GET_CODE (op1) == VEC_DUPLICATE)
10580 op1 = XEXP (op1, 0);
10581
10582 if (GET_CODE (op0) == VEC_SELECT)
10583 op0 = XEXP (op0, 0);
10584 else if (GET_CODE (op1) == VEC_SELECT)
10585 op1 = XEXP (op1, 0);
10586
10587 /* If the remaining parameters are not registers,
10588 get the cost to put them into registers. */
10589 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10590 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10591 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10592 return true;
10593
10594 case FLOAT:
10595 case UNSIGNED_FLOAT:
10596 if (speed)
10597 *cost += extra_cost->fp[mode == DFmode].fromint;
10598 return false;
10599
10600 case FLOAT_EXTEND:
10601 if (speed)
10602 {
10603 if (VECTOR_MODE_P (mode))
10604 {
10605 /* Vector widening conversion. */
10606 *cost += extra_cost->vect.alu;
10607 }
10608 else
10609 *cost += extra_cost->fp[mode == DFmode].widen;
10610 }
10611 return false;
10612
10613 case FLOAT_TRUNCATE:
10614 if (speed)
10615 {
10616 if (VECTOR_MODE_P (mode))
10617 {
10618 /* Vector conversion. */
10619 *cost += extra_cost->vect.alu;
10620 }
10621 else
10622 *cost += extra_cost->fp[mode == DFmode].narrow;
10623 }
10624 return false;
10625
10626 case FIX:
10627 case UNSIGNED_FIX:
10628 x = XEXP (x, 0);
10629 /* Strip the rounding part. They will all be implemented
10630 by the fcvt* family of instructions anyway. */
10631 if (GET_CODE (x) == UNSPEC)
10632 {
10633 unsigned int uns_code = XINT (x, 1);
10634
10635 if (uns_code == UNSPEC_FRINTA
10636 || uns_code == UNSPEC_FRINTM
10637 || uns_code == UNSPEC_FRINTN
10638 || uns_code == UNSPEC_FRINTP
10639 || uns_code == UNSPEC_FRINTZ)
10640 x = XVECEXP (x, 0, 0);
10641 }
10642
10643 if (speed)
10644 {
10645 if (VECTOR_MODE_P (mode))
10646 *cost += extra_cost->vect.alu;
10647 else
10648 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10649 }
10650
10651 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10652 fixed-point fcvt. */
10653 if (GET_CODE (x) == MULT
10654 && ((VECTOR_MODE_P (mode)
10655 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10656 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10657 {
10658 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10659 0, speed);
10660 return true;
10661 }
10662
10663 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10664 return true;
10665
10666 case ABS:
10667 if (VECTOR_MODE_P (mode))
10668 {
10669 /* ABS (vector). */
10670 if (speed)
10671 *cost += extra_cost->vect.alu;
10672 }
10673 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10674 {
10675 op0 = XEXP (x, 0);
10676
10677 /* FABD, which is analogous to FADD. */
10678 if (GET_CODE (op0) == MINUS)
10679 {
10680 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10681 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10682 if (speed)
10683 *cost += extra_cost->fp[mode == DFmode].addsub;
10684
10685 return true;
10686 }
10687 /* Simple FABS is analogous to FNEG. */
10688 if (speed)
10689 *cost += extra_cost->fp[mode == DFmode].neg;
10690 }
10691 else
10692 {
10693 /* Integer ABS will either be split to
10694 two arithmetic instructions, or will be an ABS
10695 (scalar), which we don't model. */
10696 *cost = COSTS_N_INSNS (2);
10697 if (speed)
10698 *cost += 2 * extra_cost->alu.arith;
10699 }
10700 return false;
10701
10702 case SMAX:
10703 case SMIN:
10704 if (speed)
10705 {
10706 if (VECTOR_MODE_P (mode))
10707 *cost += extra_cost->vect.alu;
10708 else
10709 {
10710 /* FMAXNM/FMINNM/FMAX/FMIN.
10711 TODO: This may not be accurate for all implementations, but
10712 we do not model this in the cost tables. */
10713 *cost += extra_cost->fp[mode == DFmode].addsub;
10714 }
10715 }
10716 return false;
10717
10718 case UNSPEC:
10719 /* The floating point round to integer frint* instructions. */
10720 if (aarch64_frint_unspec_p (XINT (x, 1)))
10721 {
10722 if (speed)
10723 *cost += extra_cost->fp[mode == DFmode].roundint;
10724
10725 return false;
10726 }
10727
10728 if (XINT (x, 1) == UNSPEC_RBIT)
10729 {
10730 if (speed)
10731 *cost += extra_cost->alu.rev;
10732
10733 return false;
10734 }
10735 break;
10736
10737 case TRUNCATE:
10738
10739 /* Decompose <su>muldi3_highpart. */
10740 if (/* (truncate:DI */
10741 mode == DImode
10742 /* (lshiftrt:TI */
10743 && GET_MODE (XEXP (x, 0)) == TImode
10744 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10745 /* (mult:TI */
10746 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10747 /* (ANY_EXTEND:TI (reg:DI))
10748 (ANY_EXTEND:TI (reg:DI))) */
10749 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10750 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10751 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10752 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10753 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10754 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10755 /* (const_int 64) */
10756 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10757 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10758 {
10759 /* UMULH/SMULH. */
10760 if (speed)
10761 *cost += extra_cost->mult[mode == DImode].extend;
10762 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10763 mode, MULT, 0, speed);
10764 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10765 mode, MULT, 1, speed);
10766 return true;
10767 }
10768
10769 /* Fall through. */
10770 default:
10771 break;
10772 }
10773
10774 if (dump_file
10775 && flag_aarch64_verbose_cost)
10776 fprintf (dump_file,
10777 "\nFailed to cost RTX. Assuming default cost.\n");
10778
10779 return true;
10780 }
10781
10782 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10783 calculated for X. This cost is stored in *COST. Returns true
10784 if the total cost of X was calculated. */
10785 static bool
10786 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10787 int param, int *cost, bool speed)
10788 {
10789 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10790
10791 if (dump_file
10792 && flag_aarch64_verbose_cost)
10793 {
10794 print_rtl_single (dump_file, x);
10795 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10796 speed ? "Hot" : "Cold",
10797 *cost, result ? "final" : "partial");
10798 }
10799
10800 return result;
10801 }
10802
10803 static int
10804 aarch64_register_move_cost (machine_mode mode,
10805 reg_class_t from_i, reg_class_t to_i)
10806 {
10807 enum reg_class from = (enum reg_class) from_i;
10808 enum reg_class to = (enum reg_class) to_i;
10809 const struct cpu_regmove_cost *regmove_cost
10810 = aarch64_tune_params.regmove_cost;
10811
10812 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10813 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10814 to = GENERAL_REGS;
10815
10816 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10817 from = GENERAL_REGS;
10818
10819 /* Moving between GPR and stack cost is the same as GP2GP. */
10820 if ((from == GENERAL_REGS && to == STACK_REG)
10821 || (to == GENERAL_REGS && from == STACK_REG))
10822 return regmove_cost->GP2GP;
10823
10824 /* To/From the stack register, we move via the gprs. */
10825 if (to == STACK_REG || from == STACK_REG)
10826 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10827 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10828
10829 if (known_eq (GET_MODE_SIZE (mode), 16))
10830 {
10831 /* 128-bit operations on general registers require 2 instructions. */
10832 if (from == GENERAL_REGS && to == GENERAL_REGS)
10833 return regmove_cost->GP2GP * 2;
10834 else if (from == GENERAL_REGS)
10835 return regmove_cost->GP2FP * 2;
10836 else if (to == GENERAL_REGS)
10837 return regmove_cost->FP2GP * 2;
10838
10839 /* When AdvSIMD instructions are disabled it is not possible to move
10840 a 128-bit value directly between Q registers. This is handled in
10841 secondary reload. A general register is used as a scratch to move
10842 the upper DI value and the lower DI value is moved directly,
10843 hence the cost is the sum of three moves. */
10844 if (! TARGET_SIMD)
10845 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10846
10847 return regmove_cost->FP2FP;
10848 }
10849
10850 if (from == GENERAL_REGS && to == GENERAL_REGS)
10851 return regmove_cost->GP2GP;
10852 else if (from == GENERAL_REGS)
10853 return regmove_cost->GP2FP;
10854 else if (to == GENERAL_REGS)
10855 return regmove_cost->FP2GP;
10856
10857 return regmove_cost->FP2FP;
10858 }
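/* For instance (the values come from the tuning's regmove_cost table):
   a TImode copy between two general registers is costed as two GP2GP
   moves, since it needs a pair of X-register MOVs, while a TImode copy
   between two FP/SIMD registers with AdvSIMD disabled is costed
   GP2FP + FP2GP + FP2FP, matching the secondary-reload sequence that
   bounces half of the value through a general register. */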
10859
10860 static int
10861 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10862 reg_class_t rclass ATTRIBUTE_UNUSED,
10863 bool in ATTRIBUTE_UNUSED)
10864 {
10865 return aarch64_tune_params.memmov_cost;
10866 }
10867
10868 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10869 to optimize 1.0/sqrt. */
10870
10871 static bool
10872 use_rsqrt_p (machine_mode mode)
10873 {
10874 return (!flag_trapping_math
10875 && flag_unsafe_math_optimizations
10876 && ((aarch64_tune_params.approx_modes->recip_sqrt
10877 & AARCH64_APPROX_MODE (mode))
10878 || flag_mrecip_low_precision_sqrt));
10879 }
10880
10881 /* Function to decide when to use the approximate reciprocal square root
10882 builtin. */
10883
10884 static tree
10885 aarch64_builtin_reciprocal (tree fndecl)
10886 {
10887 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10888
10889 if (!use_rsqrt_p (mode))
10890 return NULL_TREE;
10891 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10892 }
10893
10894 /* Emit instruction sequence to compute either the approximate square root
10895 or its approximate reciprocal, depending on the flag RECP, and return
10896 whether the sequence was emitted or not. */
10897
10898 bool
10899 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10900 {
10901 machine_mode mode = GET_MODE (dst);
10902
10903 if (GET_MODE_INNER (mode) == HFmode)
10904 {
10905 gcc_assert (!recp);
10906 return false;
10907 }
10908
10909 if (!recp)
10910 {
10911 if (!(flag_mlow_precision_sqrt
10912 || (aarch64_tune_params.approx_modes->sqrt
10913 & AARCH64_APPROX_MODE (mode))))
10914 return false;
10915
10916 if (flag_finite_math_only
10917 || flag_trapping_math
10918 || !flag_unsafe_math_optimizations
10919 || optimize_function_for_size_p (cfun))
10920 return false;
10921 }
10922 else
10923 /* Caller assumes we cannot fail. */
10924 gcc_assert (use_rsqrt_p (mode));
10925
10926 machine_mode mmsk = mode_for_int_vector (mode).require ();
10927 rtx xmsk = gen_reg_rtx (mmsk);
10928 if (!recp)
10929 /* When calculating the approximate square root, compare the
10930 argument with 0.0 and create a mask. */
10931 emit_insn (gen_rtx_SET (xmsk,
10932 gen_rtx_NEG (mmsk,
10933 gen_rtx_EQ (mmsk, src,
10934 CONST0_RTX (mode)))));
10935
10936 /* Estimate the approximate reciprocal square root. */
10937 rtx xdst = gen_reg_rtx (mode);
10938 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10939
10940 /* Iterate over the series twice for SF and thrice for DF. */
10941 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10942
10943 /* Optionally iterate over the series once less for faster performance,
10944 at the cost of some accuracy. */
10945 if ((recp && flag_mrecip_low_precision_sqrt)
10946 || (!recp && flag_mlow_precision_sqrt))
10947 iterations--;
10948
10949 /* Iterate over the series to calculate the approximate reciprocal square
10950 root. */
10951 rtx x1 = gen_reg_rtx (mode);
10952 while (iterations--)
10953 {
10954 rtx x2 = gen_reg_rtx (mode);
10955 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10956
10957 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10958
10959 if (iterations > 0)
10960 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10961 }
10962
10963 if (!recp)
10964 {
10965 /* Qualify the approximate reciprocal square root when the argument is
10966 0.0 by squashing the intermediary result to 0.0. */
10967 rtx xtmp = gen_reg_rtx (mmsk);
10968 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10969 gen_rtx_SUBREG (mmsk, xdst, 0)));
10970 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10971
10972 /* Calculate the approximate square root. */
10973 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10974 }
10975
10976 /* Finalize the approximation. */
10977 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10978
10979 return true;
10980 }
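/* Illustrative, standalone sketch (not part of this file): a scalar C model
   of the Newton-Raphson scheme that aarch64_emit_approx_sqrt builds out of
   FRSQRTE/FRSQRTS above.  FRSQRTS computes (3 - d * x * x) / 2, giving the
   refinement x' = x * (3 - d * x * x) / 2; sqrt (d) is then recovered as
   d * rsqrt (d).  The seed below is deliberately crude -- FRSQRTE would
   supply roughly 8 significant bits -- so this only models the recurrence,
   not the emitted instructions.  */
#if 0
#include <math.h>
#include <stdio.h>

static double
rsqrt_step (double d, double x)
{
  /* Scalar equivalent of one FRSQRTS refinement folded into the product.  */
  return x * (3.0 - d * x * x) / 2.0;
}

int
main (void)
{
  double d = 2.0;
  double x = 0.5;		/* Crude seed for 1/sqrt (2).  */
  for (int i = 0; i < 3; i++)	/* Two steps for SF, three for DF.  */
    x = rsqrt_step (d, x);
  printf ("rsqrt (2) ~= %.9f (exact %.9f)\n", x, 1.0 / sqrt (d));
  printf ("sqrt (2)  ~= %.9f\n", d * x);	/* sqrt (d) = d * rsqrt (d).  */
  return 0;
}
#endif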
10981
10982 /* Emit the instruction sequence to compute the approximation for the division
10983 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10984
10985 bool
10986 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10987 {
10988 machine_mode mode = GET_MODE (quo);
10989
10990 if (GET_MODE_INNER (mode) == HFmode)
10991 return false;
10992
10993 bool use_approx_division_p = (flag_mlow_precision_div
10994 || (aarch64_tune_params.approx_modes->division
10995 & AARCH64_APPROX_MODE (mode)));
10996
10997 if (!flag_finite_math_only
10998 || flag_trapping_math
10999 || !flag_unsafe_math_optimizations
11000 || optimize_function_for_size_p (cfun)
11001 || !use_approx_division_p)
11002 return false;
11003
11004 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11005 return false;
11006
11007 /* Estimate the approximate reciprocal. */
11008 rtx xrcp = gen_reg_rtx (mode);
11009 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11010
11011 /* Iterate over the series twice for SF and thrice for DF. */
11012 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11013
11014 /* Optionally iterate over the series once less for faster performance,
11015 at the cost of some accuracy. */
11016 if (flag_mlow_precision_div)
11017 iterations--;
11018
11019 /* Iterate over the series to calculate the approximate reciprocal. */
11020 rtx xtmp = gen_reg_rtx (mode);
11021 while (iterations--)
11022 {
11023 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11024
11025 if (iterations > 0)
11026 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11027 }
11028
11029 if (num != CONST1_RTX (mode))
11030 {
11031 /* As the approximate reciprocal of DEN is already calculated, only
11032 calculate the approximate division when NUM is not 1.0. */
11033 rtx xnum = force_reg (mode, num);
11034 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11035 }
11036
11037 /* Finalize the approximation. */
11038 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11039 return true;
11040 }
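/* Illustrative, standalone sketch (not part of this file): the scalar
   recurrence behind aarch64_emit_approx_div above.  FRECPS computes
   2 - d * x, so each refinement is x' = x * (2 - d * x), and NUM / DEN is
   approximated as NUM * (1 / DEN).  The seed here is made up; FRECPE would
   supply the initial estimate.  */
#if 0
#include <stdio.h>

static double
recip_step (double d, double x)
{
  /* One FRECPS-style refinement folded into the product.  */
  return x * (2.0 - d * x);
}

int
main (void)
{
  double den = 3.0, num = 1.5;
  double x = 0.3;		/* Crude seed for 1/3.  */
  for (int i = 0; i < 3; i++)	/* Two steps for SF, three for DF.  */
    x = recip_step (den, x);
  printf ("1/3 ~= %.9f, 1.5/3 ~= %.9f\n", x, num * x);
  return 0;
}
#endif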
11041
11042 /* Return the number of instructions that can be issued per cycle. */
11043 static int
11044 aarch64_sched_issue_rate (void)
11045 {
11046 return aarch64_tune_params.issue_rate;
11047 }
11048
11049 static int
11050 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11051 {
11052 int issue_rate = aarch64_sched_issue_rate ();
11053
11054 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11055 }
11056
11057
11058 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11059 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11060 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11061
11062 static int
11063 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11064 int ready_index)
11065 {
11066 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11067 }
11068
11069
11070 /* Vectorizer cost model target hooks. */
11071
11072 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11073 static int
11074 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11075 tree vectype,
11076 int misalign ATTRIBUTE_UNUSED)
11077 {
11078 unsigned elements;
11079 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11080 bool fp = false;
11081
11082 if (vectype != NULL)
11083 fp = FLOAT_TYPE_P (vectype);
11084
11085 switch (type_of_cost)
11086 {
11087 case scalar_stmt:
11088 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11089
11090 case scalar_load:
11091 return costs->scalar_load_cost;
11092
11093 case scalar_store:
11094 return costs->scalar_store_cost;
11095
11096 case vector_stmt:
11097 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11098
11099 case vector_load:
11100 return costs->vec_align_load_cost;
11101
11102 case vector_store:
11103 return costs->vec_store_cost;
11104
11105 case vec_to_scalar:
11106 return costs->vec_to_scalar_cost;
11107
11108 case scalar_to_vec:
11109 return costs->scalar_to_vec_cost;
11110
11111 case unaligned_load:
11112 case vector_gather_load:
11113 return costs->vec_unalign_load_cost;
11114
11115 case unaligned_store:
11116 case vector_scatter_store:
11117 return costs->vec_unalign_store_cost;
11118
11119 case cond_branch_taken:
11120 return costs->cond_taken_branch_cost;
11121
11122 case cond_branch_not_taken:
11123 return costs->cond_not_taken_branch_cost;
11124
11125 case vec_perm:
11126 return costs->vec_permute_cost;
11127
11128 case vec_promote_demote:
11129 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11130
11131 case vec_construct:
11132 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11133 return elements / 2 + 1;
11134
11135 default:
11136 gcc_unreachable ();
11137 }
11138 }
11139
11140 /* Implement targetm.vectorize.add_stmt_cost. */
11141 static unsigned
11142 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11143 struct _stmt_vec_info *stmt_info, int misalign,
11144 enum vect_cost_model_location where)
11145 {
11146 unsigned *cost = (unsigned *) data;
11147 unsigned retval = 0;
11148
11149 if (flag_vect_cost_model)
11150 {
11151 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11152 int stmt_cost =
11153 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11154
11155 /* Statements in an inner loop relative to the loop being
11156 vectorized are weighted more heavily. The value here is
11157 arbitrary and could potentially be improved with analysis. */
11158 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11159 count *= 50; /* FIXME */
11160
11161 retval = (unsigned) (count * stmt_cost);
11162 cost[where] += retval;
11163 }
11164
11165 return retval;
11166 }
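/* Illustrative, standalone sketch (not part of this file) of the
   accumulation done by aarch64_add_stmt_cost above: costs are totalled per
   location, and statements in an inner loop are weighted by the same
   arbitrary factor of 50.  The enum and array names here are hypothetical
   stand-ins for the vectorizer's bookkeeping.  */
#if 0
#include <stdio.h>

enum cost_location { PROLOGUE, BODY, EPILOGUE, NUM_LOCATIONS };

static unsigned costs[NUM_LOCATIONS];

static unsigned
add_stmt_cost (enum cost_location where, int count, int stmt_cost,
	       int in_inner_loop)
{
  if (where == BODY && in_inner_loop)
    count *= 50;	/* Same arbitrary inner-loop weight as above.  */
  unsigned retval = (unsigned) (count * stmt_cost);
  costs[where] += retval;
  return retval;
}

int
main (void)
{
  add_stmt_cost (BODY, 1, 4, 0);	/* Ordinary vector statement.  */
  add_stmt_cost (BODY, 1, 4, 1);	/* Same statement in an inner loop.  */
  printf ("body cost: %u\n", costs[BODY]);	/* 4 + 200.  */
  return 0;
}
#endif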
11167
11168 static void initialize_aarch64_code_model (struct gcc_options *);
11169
11170 /* Parse the TO_PARSE string and put the architecture struct that it
11171 selects into RES and the architectural features into ISA_FLAGS.
11172 Return an aarch64_parse_opt_result describing the parse result.
11173 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11174 When the TO_PARSE string contains an invalid extension,
11175 a copy of the string is created and stored to INVALID_EXTENSION. */
11176
11177 static enum aarch64_parse_opt_result
11178 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11179 uint64_t *isa_flags, std::string *invalid_extension)
11180 {
11181 const char *ext;
11182 const struct processor *arch;
11183 size_t len;
11184
11185 ext = strchr (to_parse, '+');
11186
11187 if (ext != NULL)
11188 len = ext - to_parse;
11189 else
11190 len = strlen (to_parse);
11191
11192 if (len == 0)
11193 return AARCH64_PARSE_MISSING_ARG;
11194
11195
11196 /* Loop through the list of supported ARCHes to find a match. */
11197 for (arch = all_architectures; arch->name != NULL; arch++)
11198 {
11199 if (strlen (arch->name) == len
11200 && strncmp (arch->name, to_parse, len) == 0)
11201 {
11202 uint64_t isa_temp = arch->flags;
11203
11204 if (ext != NULL)
11205 {
11206 /* TO_PARSE string contains at least one extension. */
11207 enum aarch64_parse_opt_result ext_res
11208 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11209
11210 if (ext_res != AARCH64_PARSE_OK)
11211 return ext_res;
11212 }
11213 /* Extension parsing was successful. Confirm the result
11214 arch and ISA flags. */
11215 *res = arch;
11216 *isa_flags = isa_temp;
11217 return AARCH64_PARSE_OK;
11218 }
11219 }
11220
11221 /* ARCH name not found in list. */
11222 return AARCH64_PARSE_INVALID_ARG;
11223 }
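/* Illustrative, standalone sketch (not part of this file) of the string
   split performed by aarch64_parse_arch and aarch64_parse_cpu above: the
   name before the first '+' is matched against the table, and everything
   from the '+' onwards is handed to the extension parser.  The example
   string is hypothetical.  */
#if 0
#include <stdio.h>
#include <string.h>

int
main (void)
{
  const char *to_parse = "armv8.2-a+crypto+nofp";
  const char *ext = strchr (to_parse, '+');
  size_t len = ext ? (size_t) (ext - to_parse) : strlen (to_parse);
  printf ("arch name:  %.*s\n", (int) len, to_parse);
  printf ("extensions: %s\n", ext ? ext : "(none)");
  return 0;
}
#endif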
11224
11225 /* Parse the TO_PARSE string and put the CPU that it selects into RES and
11226 the architectural features into ISA_FLAGS. Return an aarch64_parse_opt_result
11227 describing the parse result. If there is an error parsing, RES and
11228 ISA_FLAGS are left unchanged.
11229 When the TO_PARSE string contains an invalid extension,
11230 a copy of the string is created and stored to INVALID_EXTENSION. */
11231
11232 static enum aarch64_parse_opt_result
11233 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11234 uint64_t *isa_flags, std::string *invalid_extension)
11235 {
11236 const char *ext;
11237 const struct processor *cpu;
11238 size_t len;
11239
11240 ext = strchr (to_parse, '+');
11241
11242 if (ext != NULL)
11243 len = ext - to_parse;
11244 else
11245 len = strlen (to_parse);
11246
11247 if (len == 0)
11248 return AARCH64_PARSE_MISSING_ARG;
11249
11250
11251 /* Loop through the list of supported CPUs to find a match. */
11252 for (cpu = all_cores; cpu->name != NULL; cpu++)
11253 {
11254 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11255 {
11256 uint64_t isa_temp = cpu->flags;
11257
11258
11259 if (ext != NULL)
11260 {
11261 /* TO_PARSE string contains at least one extension. */
11262 enum aarch64_parse_opt_result ext_res
11263 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11264
11265 if (ext_res != AARCH64_PARSE_OK)
11266 return ext_res;
11267 }
11268 /* Extension parsing was successful. Confirm the result
11269 cpu and ISA flags. */
11270 *res = cpu;
11271 *isa_flags = isa_temp;
11272 return AARCH64_PARSE_OK;
11273 }
11274 }
11275
11276 /* CPU name not found in list. */
11277 return AARCH64_PARSE_INVALID_ARG;
11278 }
11279
11280 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11281 Return an aarch64_parse_opt_result describing the parse result.
11282 If the parsing fails, RES does not change. */
11283
11284 static enum aarch64_parse_opt_result
11285 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11286 {
11287 const struct processor *cpu;
11288
11289 /* Loop through the list of supported CPUs to find a match. */
11290 for (cpu = all_cores; cpu->name != NULL; cpu++)
11291 {
11292 if (strcmp (cpu->name, to_parse) == 0)
11293 {
11294 *res = cpu;
11295 return AARCH64_PARSE_OK;
11296 }
11297 }
11298
11299 /* CPU name not found in list. */
11300 return AARCH64_PARSE_INVALID_ARG;
11301 }
11302
11303 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11304 described in FLAG. If it is, return the index bit for that fusion type.
11305 If not, error (printing OPTION_NAME) and return zero. */
11306
11307 static unsigned int
11308 aarch64_parse_one_option_token (const char *token,
11309 size_t length,
11310 const struct aarch64_flag_desc *flag,
11311 const char *option_name)
11312 {
11313 for (; flag->name != NULL; flag++)
11314 {
11315 if (length == strlen (flag->name)
11316 && !strncmp (flag->name, token, length))
11317 return flag->flag;
11318 }
11319
11320 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11321 return 0;
11322 }
11323
11324 /* Parse OPTION which is a comma-separated list of flags to enable.
11325 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11326 default state we inherit from the CPU tuning structures. OPTION_NAME
11327 gives the top-level option we are parsing in the -moverride string,
11328 for use in error messages. */
11329
11330 static unsigned int
11331 aarch64_parse_boolean_options (const char *option,
11332 const struct aarch64_flag_desc *flags,
11333 unsigned int initial_state,
11334 const char *option_name)
11335 {
11336 const char separator = '.';
11337 const char* specs = option;
11338 const char* ntoken = option;
11339 unsigned int found_flags = initial_state;
11340
11341 while ((ntoken = strchr (specs, separator)))
11342 {
11343 size_t token_length = ntoken - specs;
11344 unsigned token_ops = aarch64_parse_one_option_token (specs,
11345 token_length,
11346 flags,
11347 option_name);
11348 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11349 in the token stream, reset the supported operations. So:
11350
11351 adrp+add.cmp+branch.none.adrp+add
11352
11353 would have the result of turning on only adrp+add fusion. */
11354 if (!token_ops)
11355 found_flags = 0;
11356
11357 found_flags |= token_ops;
11358 specs = ++ntoken;
11359 }
11360
11361 /* The string ended with a trailing separator; report the ill-formed input. */
11362 if (!(*specs))
11363 {
11364 error ("%s string ill-formed\n", option_name);
11365 return 0;
11366 }
11367
11368 /* We still have one more token to parse. */
11369 size_t token_length = strlen (specs);
11370 unsigned token_ops = aarch64_parse_one_option_token (specs,
11371 token_length,
11372 flags,
11373 option_name);
11374 if (!token_ops)
11375 found_flags = 0;
11376
11377 found_flags |= token_ops;
11378 return found_flags;
11379 }
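/* Illustrative, standalone sketch (not part of this file) of the
   '.'-separated parsing above, reproducing the documented behaviour of
   "adrp+add.cmp+branch.none.adrp+add": "none" (or an unknown token) resets
   the accumulated set, so only adrp+add fusion survives.  The flag bits and
   token table are hypothetical stand-ins for aarch64_fusible_pairs.  */
#if 0
#include <stdio.h>
#include <string.h>

#define FUSE_ADRP_ADD	(1u << 0)
#define FUSE_CMP_BRANCH	(1u << 1)

static unsigned
token_to_flag (const char *tok, size_t len)
{
  if (len == strlen ("adrp+add") && !strncmp (tok, "adrp+add", len))
    return FUSE_ADRP_ADD;
  if (len == strlen ("cmp+branch") && !strncmp (tok, "cmp+branch", len))
    return FUSE_CMP_BRANCH;
  return 0;	/* "none" or an unknown token.  */
}

int
main (void)
{
  const char *option = "adrp+add.cmp+branch.none.adrp+add";
  const char *specs = option, *ntoken;
  unsigned found = 0, ops;
  while ((ntoken = strchr (specs, '.')))
    {
      ops = token_to_flag (specs, (size_t) (ntoken - specs));
      found = ops ? (found | ops) : 0;	/* A zero token resets the set.  */
      specs = ntoken + 1;
    }
  ops = token_to_flag (specs, strlen (specs));
  found = ops ? (found | ops) : 0;
  printf ("final flags: %#x\n", found);	/* 0x1: only adrp+add remains.  */
  return 0;
}
#endif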
11380
11381 /* Support for overriding instruction fusion. */
11382
11383 static void
11384 aarch64_parse_fuse_string (const char *fuse_string,
11385 struct tune_params *tune)
11386 {
11387 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11388 aarch64_fusible_pairs,
11389 tune->fusible_ops,
11390 "fuse=");
11391 }
11392
11393 /* Support for overriding other tuning flags. */
11394
11395 static void
11396 aarch64_parse_tune_string (const char *tune_string,
11397 struct tune_params *tune)
11398 {
11399 tune->extra_tuning_flags
11400 = aarch64_parse_boolean_options (tune_string,
11401 aarch64_tuning_flags,
11402 tune->extra_tuning_flags,
11403 "tune=");
11404 }
11405
11406 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11407 Accept the valid SVE vector widths allowed by
11408 aarch64_sve_vector_bits_enum and use it to override sve_width
11409 in TUNE. */
11410
11411 static void
11412 aarch64_parse_sve_width_string (const char *tune_string,
11413 struct tune_params *tune)
11414 {
11415 int width = -1;
11416
11417 int n = sscanf (tune_string, "%d", &width);
11418 if (n == EOF)
11419 {
11420 error ("invalid format for sve_width");
11421 return;
11422 }
11423 switch (width)
11424 {
11425 case SVE_128:
11426 case SVE_256:
11427 case SVE_512:
11428 case SVE_1024:
11429 case SVE_2048:
11430 break;
11431 default:
11432 error ("invalid sve_width value: %d", width);
11433 }
11434 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11435 }
11436
11437 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11438 we understand. If it is, extract the option string and hand it off to
11439 the appropriate function. */
11440
11441 void
11442 aarch64_parse_one_override_token (const char* token,
11443 size_t length,
11444 struct tune_params *tune)
11445 {
11446 const struct aarch64_tuning_override_function *fn
11447 = aarch64_tuning_override_functions;
11448
11449 const char *option_part = strchr (token, '=');
11450 if (!option_part)
11451 {
11452 error ("tuning string missing in option (%s)", token);
11453 return;
11454 }
11455
11456 /* Get the length of the option name. */
11457 length = option_part - token;
11458 /* Skip the '=' to get to the option string. */
11459 option_part++;
11460
11461 for (; fn->name != NULL; fn++)
11462 {
11463 if (!strncmp (fn->name, token, length))
11464 {
11465 fn->parse_override (option_part, tune);
11466 return;
11467 }
11468 }
11469
11470 error ("unknown tuning option (%s)",token);
11471 return;
11472 }
11473
11474 /* Validate and clamp the TLS size according to the selected code model. */
11475
11476 static void
11477 initialize_aarch64_tls_size (struct gcc_options *opts)
11478 {
11479 if (aarch64_tls_size == 0)
11480 aarch64_tls_size = 24;
11481
11482 switch (opts->x_aarch64_cmodel_var)
11483 {
11484 case AARCH64_CMODEL_TINY:
11485 /* Both the default and the maximum TLS size allowed under tiny are 1M,
11486 which needs two instructions to address, so we clamp the size to 24. */
11487 if (aarch64_tls_size > 24)
11488 aarch64_tls_size = 24;
11489 break;
11490 case AARCH64_CMODEL_SMALL:
11491 /* The maximum TLS size allowed under small is 4G. */
11492 if (aarch64_tls_size > 32)
11493 aarch64_tls_size = 32;
11494 break;
11495 case AARCH64_CMODEL_LARGE:
11496 /* The maximum TLS size allowed under large is 16E.
11497 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset now. */
11498 if (aarch64_tls_size > 48)
11499 aarch64_tls_size = 48;
11500 break;
11501 default:
11502 gcc_unreachable ();
11503 }
11504
11505 return;
11506 }
11507
11508 /* Parse STRING looking for options in the format:
11509 string :: option:string
11510 option :: name=substring
11511 name :: {a-z}
11512 substring :: defined by option. */
11513
11514 static void
11515 aarch64_parse_override_string (const char* input_string,
11516 struct tune_params* tune)
11517 {
11518 const char separator = ':';
11519 size_t string_length = strlen (input_string) + 1;
11520 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11521 char *string = string_root;
11522 strncpy (string, input_string, string_length);
11523 string[string_length - 1] = '\0';
11524
11525 char* ntoken = string;
11526
11527 while ((ntoken = strchr (string, separator)))
11528 {
11529 size_t token_length = ntoken - string;
11530 /* Make this substring look like a string. */
11531 *ntoken = '\0';
11532 aarch64_parse_one_override_token (string, token_length, tune);
11533 string = ++ntoken;
11534 }
11535
11536 /* One last option to parse. */
11537 aarch64_parse_one_override_token (string, strlen (string), tune);
11538 free (string_root);
11539 }
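/* Illustrative, standalone sketch (not part of this file) of the
   ':'-separated grammar described above, splitting a hypothetical
   -moverride value into its name=value tokens.  Each token would then be
   dispatched through aarch64_tuning_override_functions.  */
#if 0
#include <stdio.h>
#include <string.h>

int
main (void)
{
  char buf[] = "sve_width=256:fuse=adrp+add.cmp+branch";
  char *string = buf, *ntoken;
  while ((ntoken = strchr (string, ':')))
    {
      *ntoken = '\0';	/* Make this substring look like a string.  */
      printf ("override token: %s\n", string);
      string = ntoken + 1;
    }
  printf ("override token: %s\n", string);	/* One last token to parse.  */
  return 0;
}
#endif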
11540
11541
11542 static void
11543 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11544 {
11545 if (accepted_branch_protection_string)
11546 {
11547 opts->x_aarch64_branch_protection_string
11548 = xstrdup (accepted_branch_protection_string);
11549 }
11550
11551 /* PR 70044: We have to be careful about being called multiple times for the
11552 same function. This means all changes should be repeatable. */
11553
11554 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11555 Disable the frame pointer flag so the mid-end will not use a frame
11556 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11557 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11558 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11559 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11560 if (opts->x_flag_omit_frame_pointer == 0)
11561 opts->x_flag_omit_frame_pointer = 2;
11562
11563 /* If not optimizing for size, set the default
11564 alignment to what the target wants. */
11565 if (!opts->x_optimize_size)
11566 {
11567 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11568 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11569 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11570 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11571 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11572 opts->x_str_align_functions = aarch64_tune_params.function_align;
11573 }
11574
11575 /* We default to no pc-relative literal loads. */
11576
11577 aarch64_pcrelative_literal_loads = false;
11578
11579 /* If -mpc-relative-literal-loads is set on the command line, this
11580 implies that the user asked for PC relative literal loads. */
11581 if (opts->x_pcrelative_literal_loads == 1)
11582 aarch64_pcrelative_literal_loads = true;
11583
11584 /* In the tiny memory model it makes no sense to disallow PC relative
11585 literal pool loads. */
11586 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11587 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11588 aarch64_pcrelative_literal_loads = true;
11589
11590 /* When enabling the lower precision Newton series for the square root, also
11591 enable it for the reciprocal square root, since the latter is an
11592 intermediary step for the former. */
11593 if (flag_mlow_precision_sqrt)
11594 flag_mrecip_low_precision_sqrt = true;
11595 }
11596
11597 /* 'Unpack' the internal tuning structs and update the options
11598 in OPTS. The caller must have set up selected_tune and selected_arch
11599 as all the other target-specific codegen decisions are
11600 derived from them. */
11601
11602 void
11603 aarch64_override_options_internal (struct gcc_options *opts)
11604 {
11605 aarch64_tune_flags = selected_tune->flags;
11606 aarch64_tune = selected_tune->sched_core;
11607 /* Make a copy of the tuning parameters attached to the core, which
11608 we may later overwrite. */
11609 aarch64_tune_params = *(selected_tune->tune);
11610 aarch64_architecture_version = selected_arch->architecture_version;
11611
11612 if (opts->x_aarch64_override_tune_string)
11613 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11614 &aarch64_tune_params);
11615
11616 /* This target defaults to strict volatile bitfields. */
11617 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11618 opts->x_flag_strict_volatile_bitfields = 1;
11619
11620 if (aarch64_stack_protector_guard == SSP_GLOBAL
11621 && opts->x_aarch64_stack_protector_guard_offset_str)
11622 {
11623 error ("incompatible options %<-mstack-protector-guard=global%> and "
11624 "%<-mstack-protector-guard-offset=%s%>",
11625 aarch64_stack_protector_guard_offset_str);
11626 }
11627
11628 if (aarch64_stack_protector_guard == SSP_SYSREG
11629 && !(opts->x_aarch64_stack_protector_guard_offset_str
11630 && opts->x_aarch64_stack_protector_guard_reg_str))
11631 {
11632 error ("both %<-mstack-protector-guard-offset%> and "
11633 "%<-mstack-protector-guard-reg%> must be used "
11634 "with %<-mstack-protector-guard=sysreg%>");
11635 }
11636
11637 if (opts->x_aarch64_stack_protector_guard_reg_str)
11638 {
11639 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11640 error ("specify a system register with a small string length.");
11641 }
11642
11643 if (opts->x_aarch64_stack_protector_guard_offset_str)
11644 {
11645 char *end;
11646 const char *str = aarch64_stack_protector_guard_offset_str;
11647 errno = 0;
11648 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11649 if (!*str || *end || errno)
11650 error ("%qs is not a valid offset in %qs", str,
11651 "-mstack-protector-guard-offset=");
11652 aarch64_stack_protector_guard_offset = offs;
11653 }
11654
11655 initialize_aarch64_code_model (opts);
11656 initialize_aarch64_tls_size (opts);
11657
11658 int queue_depth = 0;
11659 switch (aarch64_tune_params.autoprefetcher_model)
11660 {
11661 case tune_params::AUTOPREFETCHER_OFF:
11662 queue_depth = -1;
11663 break;
11664 case tune_params::AUTOPREFETCHER_WEAK:
11665 queue_depth = 0;
11666 break;
11667 case tune_params::AUTOPREFETCHER_STRONG:
11668 queue_depth = max_insn_queue_index + 1;
11669 break;
11670 default:
11671 gcc_unreachable ();
11672 }
11673
11674 /* We don't mind passing in global_options_set here as we don't use
11675 the *options_set structs anyway. */
11676 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11677 queue_depth,
11678 opts->x_param_values,
11679 global_options_set.x_param_values);
11680
11681 /* Set up parameters to be used in the prefetching algorithm. Do not
11682 override the defaults unless we are tuning for a core we have
11683 researched values for. */
11684 if (aarch64_tune_params.prefetch->num_slots > 0)
11685 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11686 aarch64_tune_params.prefetch->num_slots,
11687 opts->x_param_values,
11688 global_options_set.x_param_values);
11689 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11690 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11691 aarch64_tune_params.prefetch->l1_cache_size,
11692 opts->x_param_values,
11693 global_options_set.x_param_values);
11694 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11695 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11696 aarch64_tune_params.prefetch->l1_cache_line_size,
11697 opts->x_param_values,
11698 global_options_set.x_param_values);
11699 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11700 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11701 aarch64_tune_params.prefetch->l2_cache_size,
11702 opts->x_param_values,
11703 global_options_set.x_param_values);
11704 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11705 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11706 0,
11707 opts->x_param_values,
11708 global_options_set.x_param_values);
11709 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11710 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11711 aarch64_tune_params.prefetch->minimum_stride,
11712 opts->x_param_values,
11713 global_options_set.x_param_values);
11714
11715 /* Use the alternative scheduling-pressure algorithm by default. */
11716 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11717 opts->x_param_values,
11718 global_options_set.x_param_values);
11719
11720 /* If the user hasn't changed it via configure then set the default to 64 KB
11721 for the backend. */
11722 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11723 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11724 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11725 opts->x_param_values,
11726 global_options_set.x_param_values);
11727
11728 /* Validate the guard size. */
11729 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11730
11731 /* Enforce that the probing interval is the same as the guard size so the
11732 mid-end does the right thing. */
11733 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11734 guard_size,
11735 opts->x_param_values,
11736 global_options_set.x_param_values);
11737
11738 /* The maybe_set calls won't update the value if the user has explicitly set
11739 one, which means we need to validate that the probing interval and guard size
11740 are equal. */
11741 int probe_interval
11742 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11743 if (guard_size != probe_interval)
11744 error ("stack clash guard size %<%d%> must be equal to probing interval "
11745 "%<%d%>", guard_size, probe_interval);
11746
11747 /* Enable software prefetching at the specified optimization level for
11748 CPUs that have prefetch. Lower the optimization level threshold by 1
11749 when profiling is enabled. */
11750 if (opts->x_flag_prefetch_loop_arrays < 0
11751 && !opts->x_optimize_size
11752 && aarch64_tune_params.prefetch->default_opt_level >= 0
11753 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11754 opts->x_flag_prefetch_loop_arrays = 1;
11755
11756 if (opts->x_aarch64_arch_string == NULL)
11757 opts->x_aarch64_arch_string = selected_arch->name;
11758 if (opts->x_aarch64_cpu_string == NULL)
11759 opts->x_aarch64_cpu_string = selected_cpu->name;
11760 if (opts->x_aarch64_tune_string == NULL)
11761 opts->x_aarch64_tune_string = selected_tune->name;
11762
11763 aarch64_override_options_after_change_1 (opts);
11764 }
11765
11766 /* Print a hint with a suggestion for a core or architecture name that
11767 most closely resembles what the user passed in STR. ARCH is true if
11768 the user is asking for an architecture name. ARCH is false if the user
11769 is asking for a core name. */
11770
11771 static void
11772 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11773 {
11774 auto_vec<const char *> candidates;
11775 const struct processor *entry = arch ? all_architectures : all_cores;
11776 for (; entry->name != NULL; entry++)
11777 candidates.safe_push (entry->name);
11778
11779 #ifdef HAVE_LOCAL_CPU_DETECT
11780 /* Add also "native" as possible value. */
11781 if (arch)
11782 candidates.safe_push ("native");
11783 #endif
11784
11785 char *s;
11786 const char *hint = candidates_list_and_hint (str, s, candidates);
11787 if (hint)
11788 inform (input_location, "valid arguments are: %s;"
11789 " did you mean %qs?", s, hint);
11790 else
11791 inform (input_location, "valid arguments are: %s", s);
11792
11793 XDELETEVEC (s);
11794 }
11795
11796 /* Print a hint with a suggestion for a core name that most closely resembles
11797 what the user passed in STR. */
11798
11799 inline static void
11800 aarch64_print_hint_for_core (const char *str)
11801 {
11802 aarch64_print_hint_for_core_or_arch (str, false);
11803 }
11804
11805 /* Print a hint with a suggestion for an architecture name that most closely
11806 resembles what the user passed in STR. */
11807
11808 inline static void
11809 aarch64_print_hint_for_arch (const char *str)
11810 {
11811 aarch64_print_hint_for_core_or_arch (str, true);
11812 }
11813
11814
11815 /* Print a hint with a suggestion for an extension name
11816 that most closely resembles what the user passed in STR. */
11817
11818 void
11819 aarch64_print_hint_for_extensions (const std::string &str)
11820 {
11821 auto_vec<const char *> candidates;
11822 aarch64_get_all_extension_candidates (&candidates);
11823 char *s;
11824 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11825 if (hint)
11826 inform (input_location, "valid arguments are: %s;"
11827 " did you mean %qs?", s, hint);
11828 else
11829 inform (input_location, "valid arguments are: %s", s);
11830
11831 XDELETEVEC (s);
11832 }
11833
11834 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11835 specified in STR and throw errors if appropriate. Put the results, if
11836 they are valid, in RES and ISA_FLAGS. Return whether the option is
11837 valid. */
11838
11839 static bool
11840 aarch64_validate_mcpu (const char *str, const struct processor **res,
11841 uint64_t *isa_flags)
11842 {
11843 std::string invalid_extension;
11844 enum aarch64_parse_opt_result parse_res
11845 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11846
11847 if (parse_res == AARCH64_PARSE_OK)
11848 return true;
11849
11850 switch (parse_res)
11851 {
11852 case AARCH64_PARSE_MISSING_ARG:
11853 error ("missing cpu name in %<-mcpu=%s%>", str);
11854 break;
11855 case AARCH64_PARSE_INVALID_ARG:
11856 error ("unknown value %qs for %<-mcpu%>", str);
11857 aarch64_print_hint_for_core (str);
11858 break;
11859 case AARCH64_PARSE_INVALID_FEATURE:
11860 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11861 invalid_extension.c_str (), str);
11862 aarch64_print_hint_for_extensions (invalid_extension);
11863 break;
11864 default:
11865 gcc_unreachable ();
11866 }
11867
11868 return false;
11869 }
11870
11871 /* Parses CONST_STR for branch protection features specified in
11872 aarch64_branch_protect_types, and sets any global variables required. Returns
11873 the parsing result and assigns LAST_STR to the last processed token from
11874 CONST_STR so that it can be used for error reporting. */
11875
11876 static enum aarch64_parse_opt_result
11877 aarch64_parse_branch_protection (const char *const_str,
11878 char **last_str)
11879 {
11880 char *str_root = xstrdup (const_str);
11881 char* token_save = NULL;
11882 char *str = strtok_r (str_root, "+", &token_save);
11883 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11884 if (!str)
11885 res = AARCH64_PARSE_MISSING_ARG;
11886 else
11887 {
11888 char *next_str = strtok_r (NULL, "+", &token_save);
11889 /* Reset the branch protection features to their defaults. */
11890 aarch64_handle_no_branch_protection (NULL, NULL);
11891
11892 while (str && res == AARCH64_PARSE_OK)
11893 {
11894 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11895 bool found = false;
11896 /* Search for this type. */
11897 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11898 {
11899 if (strcmp (str, type->name) == 0)
11900 {
11901 found = true;
11902 res = type->handler (str, next_str);
11903 str = next_str;
11904 next_str = strtok_r (NULL, "+", &token_save);
11905 }
11906 else
11907 type++;
11908 }
11909 if (found && res == AARCH64_PARSE_OK)
11910 {
11911 bool found_subtype = true;
11912 /* Loop through each token until we find one that isn't a
11913 subtype. */
11914 while (found_subtype)
11915 {
11916 found_subtype = false;
11917 const aarch64_branch_protect_type *subtype = type->subtypes;
11918 /* Search for the subtype. */
11919 while (str && subtype && subtype->name && !found_subtype
11920 && res == AARCH64_PARSE_OK)
11921 {
11922 if (strcmp (str, subtype->name) == 0)
11923 {
11924 found_subtype = true;
11925 res = subtype->handler (str, next_str);
11926 str = next_str;
11927 next_str = strtok_r (NULL, "+", &token_save);
11928 }
11929 else
11930 subtype++;
11931 }
11932 }
11933 }
11934 else if (!found)
11935 res = AARCH64_PARSE_INVALID_ARG;
11936 }
11937 }
11938 /* Copy the last processed token into the argument to pass it back.
11939 Used by option and attribute validation to print the offending token. */
11940 if (last_str)
11941 {
11942 if (str) strcpy (*last_str, str);
11943 else *last_str = NULL;
11944 }
11945 if (res == AARCH64_PARSE_OK)
11946 {
11947 /* If needed, alloc the accepted string then copy in const_str.
11948 Used by override_option_after_change_1. */
11949 if (!accepted_branch_protection_string)
11950 accepted_branch_protection_string = (char *) xmalloc (
11951 BRANCH_PROTECT_STR_MAX
11952 + 1);
11953 strncpy (accepted_branch_protection_string, const_str,
11954 BRANCH_PROTECT_STR_MAX + 1);
11955 /* Forcibly null-terminate. */
11956 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11957 }
11958 return res;
11959 }
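/* Illustrative, standalone sketch (not part of this file) of the
   strtok_r-based tokenisation above: a '+'-separated -mbranch-protection
   value is consumed token by token, and each token is looked up first as a
   type and then as a subtype of the previous type.  The example value is
   hypothetical.  */
#if 0
#include <stdio.h>
#include <string.h>

int
main (void)
{
  char str[] = "pac-ret+leaf+bti";
  char *save = NULL;
  for (char *tok = strtok_r (str, "+", &save); tok;
       tok = strtok_r (NULL, "+", &save))
    printf ("token: %s\n", tok);
  return 0;
}
#endif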
11960
11961 static bool
11962 aarch64_validate_mbranch_protection (const char *const_str)
11963 {
11964 char *str = (char *) xmalloc (strlen (const_str) + 1);
11965 enum aarch64_parse_opt_result res =
11966 aarch64_parse_branch_protection (const_str, &str);
11967 if (res == AARCH64_PARSE_INVALID_ARG)
11968 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
11969 else if (res == AARCH64_PARSE_MISSING_ARG)
11970 error ("missing argument for %<-mbranch-protection=%>");
11971 free (str);
11972 return res == AARCH64_PARSE_OK;
11973 }
11974
11975 /* Validate a command-line -march option. Parse the arch and extensions
11976 (if any) specified in STR and throw errors if appropriate. Put the
11977 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11978 option is valid. */
11979
11980 static bool
11981 aarch64_validate_march (const char *str, const struct processor **res,
11982 uint64_t *isa_flags)
11983 {
11984 std::string invalid_extension;
11985 enum aarch64_parse_opt_result parse_res
11986 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11987
11988 if (parse_res == AARCH64_PARSE_OK)
11989 return true;
11990
11991 switch (parse_res)
11992 {
11993 case AARCH64_PARSE_MISSING_ARG:
11994 error ("missing arch name in %<-march=%s%>", str);
11995 break;
11996 case AARCH64_PARSE_INVALID_ARG:
11997 error ("unknown value %qs for %<-march%>", str);
11998 aarch64_print_hint_for_arch (str);
11999 break;
12000 case AARCH64_PARSE_INVALID_FEATURE:
12001 error ("invalid feature modifier %qs in %<-march=%s%>",
12002 invalid_extension.c_str (), str);
12003 aarch64_print_hint_for_extensions (invalid_extension);
12004 break;
12005 default:
12006 gcc_unreachable ();
12007 }
12008
12009 return false;
12010 }
12011
12012 /* Validate a command-line -mtune option. Parse the cpu
12013 specified in STR and throw errors if appropriate. Put the
12014 result, if it is valid, in RES. Return whether the option is
12015 valid. */
12016
12017 static bool
12018 aarch64_validate_mtune (const char *str, const struct processor **res)
12019 {
12020 enum aarch64_parse_opt_result parse_res
12021 = aarch64_parse_tune (str, res);
12022
12023 if (parse_res == AARCH64_PARSE_OK)
12024 return true;
12025
12026 switch (parse_res)
12027 {
12028 case AARCH64_PARSE_MISSING_ARG:
12029 error ("missing cpu name in %<-mtune=%s%>", str);
12030 break;
12031 case AARCH64_PARSE_INVALID_ARG:
12032 error ("unknown value %qs for %<-mtune%>", str);
12033 aarch64_print_hint_for_core (str);
12034 break;
12035 default:
12036 gcc_unreachable ();
12037 }
12038 return false;
12039 }
12040
12041 /* Return the CPU corresponding to the enum CPU.
12042 If it doesn't specify a cpu, return the default. */
12043
12044 static const struct processor *
12045 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12046 {
12047 if (cpu != aarch64_none)
12048 return &all_cores[cpu];
12049
12050 /* The & 0x3f is to extract the bottom 6 bits that encode the
12051 default cpu as selected by the --with-cpu GCC configure option
12052 in config.gcc.
12053 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12054 flags mechanism should be reworked to make it more sane. */
12055 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12056 }
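/* Illustrative, standalone sketch (not part of this file) of the
   TARGET_CPU_DEFAULT encoding relied on above and at the
   "TARGET_CPU_DEFAULT >> 6" site in aarch64_override_options: the
   configure-time default CPU index sits in the low 6 bits and the default
   ISA flags in the bits above it.  The values below are made up.  */
#if 0
#include <stdio.h>
#include <stdint.h>

int
main (void)
{
  uint64_t cpu_index = 5;	/* Hypothetical index into all_cores.  */
  uint64_t isa_flags = 0x30;	/* Hypothetical default ISA flag bits.  */
  uint64_t packed = (isa_flags << 6) | (cpu_index & 0x3f);

  printf ("cpu index: %u\n", (unsigned) (packed & 0x3f));
  printf ("isa flags: %#x\n", (unsigned) (packed >> 6));
  return 0;
}
#endif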
12057
12058 /* Return the architecture corresponding to the enum ARCH.
12059 If it doesn't specify a valid architecture, return the default. */
12060
12061 static const struct processor *
12062 aarch64_get_arch (enum aarch64_arch arch)
12063 {
12064 if (arch != aarch64_no_arch)
12065 return &all_architectures[arch];
12066
12067 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12068
12069 return &all_architectures[cpu->arch];
12070 }
12071
12072 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12073
12074 static poly_uint16
12075 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12076 {
12077 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12078 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12079 deciding which .md file patterns to use and when deciding whether
12080 something is a legitimate address or constant. */
12081 if (value == SVE_SCALABLE || value == SVE_128)
12082 return poly_uint16 (2, 2);
12083 else
12084 return (int) value / 64;
12085 }
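/* Illustrative, standalone sketch (not part of this file) of the mapping
   above: a fixed -msve-vector-bits value becomes a granule count of
   bits / 64 (e.g. 256 -> VG 4, 2048 -> VG 32), while SVE_SCALABLE and
   SVE_128 both become the runtime-sized poly_uint16 (2, 2), i.e. 2 + 2 * x
   where x counts 128-bit blocks beyond the minimum vector length.  */
#if 0
#include <stdio.h>

int
main (void)
{
  static const int widths[] = { 256, 512, 1024, 2048 };
  for (unsigned i = 0; i < sizeof widths / sizeof widths[0]; i++)
    printf ("-msve-vector-bits=%d -> VG %d\n", widths[i], widths[i] / 64);
  return 0;
}
#endif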
12086
12087 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12088 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12089 tuning structs. In particular it must set selected_tune and
12090 aarch64_isa_flags that define the available ISA features and tuning
12091 decisions. It must also set selected_arch as this will be used to
12092 output the .arch asm tags for each function. */
12093
12094 static void
12095 aarch64_override_options (void)
12096 {
12097 uint64_t cpu_isa = 0;
12098 uint64_t arch_isa = 0;
12099 aarch64_isa_flags = 0;
12100
12101 bool valid_cpu = true;
12102 bool valid_tune = true;
12103 bool valid_arch = true;
12104
12105 selected_cpu = NULL;
12106 selected_arch = NULL;
12107 selected_tune = NULL;
12108
12109 if (aarch64_branch_protection_string)
12110 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12111
12112 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12113 If either of -march or -mtune is given, they override their
12114 respective component of -mcpu. */
12115 if (aarch64_cpu_string)
12116 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12117 &cpu_isa);
12118
12119 if (aarch64_arch_string)
12120 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12121 &arch_isa);
12122
12123 if (aarch64_tune_string)
12124 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12125
12126 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12127 SUBTARGET_OVERRIDE_OPTIONS;
12128 #endif
12129
12130 /* If the user did not specify a processor, choose the default
12131 one for them. This will be the CPU set during configuration using
12132 --with-cpu, otherwise it is "generic". */
12133 if (!selected_cpu)
12134 {
12135 if (selected_arch)
12136 {
12137 selected_cpu = &all_cores[selected_arch->ident];
12138 aarch64_isa_flags = arch_isa;
12139 explicit_arch = selected_arch->arch;
12140 }
12141 else
12142 {
12143 /* Get default configure-time CPU. */
12144 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12145 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12146 }
12147
12148 if (selected_tune)
12149 explicit_tune_core = selected_tune->ident;
12150 }
12151 /* If both -mcpu and -march are specified, check that they are architecturally
12152 compatible, warn if they're not and prefer the -march ISA flags. */
12153 else if (selected_arch)
12154 {
12155 if (selected_arch->arch != selected_cpu->arch)
12156 {
12157 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12158 all_architectures[selected_cpu->arch].name,
12159 selected_arch->name);
12160 }
12161 aarch64_isa_flags = arch_isa;
12162 explicit_arch = selected_arch->arch;
12163 explicit_tune_core = selected_tune ? selected_tune->ident
12164 : selected_cpu->ident;
12165 }
12166 else
12167 {
12168 /* -mcpu but no -march. */
12169 aarch64_isa_flags = cpu_isa;
12170 explicit_tune_core = selected_tune ? selected_tune->ident
12171 : selected_cpu->ident;
12172 gcc_assert (selected_cpu);
12173 selected_arch = &all_architectures[selected_cpu->arch];
12174 explicit_arch = selected_arch->arch;
12175 }
12176
12177 /* Set the arch as well, as we will need it when outputting
12178 the .arch directive in assembly. */
12179 if (!selected_arch)
12180 {
12181 gcc_assert (selected_cpu);
12182 selected_arch = &all_architectures[selected_cpu->arch];
12183 }
12184
12185 if (!selected_tune)
12186 selected_tune = selected_cpu;
12187
12188 if (aarch64_enable_bti == 2)
12189 {
12190 #ifdef TARGET_ENABLE_BTI
12191 aarch64_enable_bti = 1;
12192 #else
12193 aarch64_enable_bti = 0;
12194 #endif
12195 }
12196
12197 /* Return address signing is currently not supported for ILP32 targets. For
12198 LP64 targets use the configured option in the absence of a command-line
12199 option for -mbranch-protection. */
12200 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12201 {
12202 #ifdef TARGET_ENABLE_PAC_RET
12203 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12204 #else
12205 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12206 #endif
12207 }
12208
12209 #ifndef HAVE_AS_MABI_OPTION
12210 /* The compiler may have been configured with 2.23.* binutils, which does
12211 not have support for ILP32. */
12212 if (TARGET_ILP32)
12213 error ("assembler does not support %<-mabi=ilp32%>");
12214 #endif
12215
12216 /* Convert -msve-vector-bits to a VG count. */
12217 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12218
12219 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12220 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12221
12222 /* Make sure we properly set up the explicit options. */
12223 if ((aarch64_cpu_string && valid_cpu)
12224 || (aarch64_tune_string && valid_tune))
12225 gcc_assert (explicit_tune_core != aarch64_none);
12226
12227 if ((aarch64_cpu_string && valid_cpu)
12228 || (aarch64_arch_string && valid_arch))
12229 gcc_assert (explicit_arch != aarch64_no_arch);
12230
12231 /* The pass to insert speculation tracking runs before
12232 shrink-wrapping and the latter does not know how to update the
12233 tracking status. So disable it in this case. */
12234 if (aarch64_track_speculation)
12235 flag_shrink_wrap = 0;
12236
12237 aarch64_override_options_internal (&global_options);
12238
12239 /* Save these options as the default ones in case we push and pop them later
12240 while processing functions with potential target attributes. */
12241 target_option_default_node = target_option_current_node
12242 = build_target_option_node (&global_options);
12243 }
12244
12245 /* Implement targetm.override_options_after_change. */
12246
12247 static void
12248 aarch64_override_options_after_change (void)
12249 {
12250 aarch64_override_options_after_change_1 (&global_options);
12251 }
12252
12253 static struct machine_function *
12254 aarch64_init_machine_status (void)
12255 {
12256 struct machine_function *machine;
12257 machine = ggc_cleared_alloc<machine_function> ();
12258 return machine;
12259 }
12260
12261 void
12262 aarch64_init_expanders (void)
12263 {
12264 init_machine_status = aarch64_init_machine_status;
12265 }
12266
12267 /* Set up the effective code model, taking -fpic/-fPIC into account. */
12268 static void
12269 initialize_aarch64_code_model (struct gcc_options *opts)
12270 {
12271 if (opts->x_flag_pic)
12272 {
12273 switch (opts->x_aarch64_cmodel_var)
12274 {
12275 case AARCH64_CMODEL_TINY:
12276 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12277 break;
12278 case AARCH64_CMODEL_SMALL:
12279 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12280 aarch64_cmodel = (flag_pic == 2
12281 ? AARCH64_CMODEL_SMALL_PIC
12282 : AARCH64_CMODEL_SMALL_SPIC);
12283 #else
12284 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12285 #endif
12286 break;
12287 case AARCH64_CMODEL_LARGE:
12288 sorry ("code model %qs with %<-f%s%>", "large",
12289 opts->x_flag_pic > 1 ? "PIC" : "pic");
12290 break;
12291 default:
12292 gcc_unreachable ();
12293 }
12294 }
12295 else
12296 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12297 }
12298
12299 /* Implement TARGET_OPTION_SAVE. */
12300
12301 static void
12302 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12303 {
12304 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12305 ptr->x_aarch64_branch_protection_string
12306 = opts->x_aarch64_branch_protection_string;
12307 }
12308
12309 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12310 using the information saved in PTR. */
12311
12312 static void
12313 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12314 {
12315 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12316 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12317 opts->x_explicit_arch = ptr->x_explicit_arch;
12318 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12319 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12320 opts->x_aarch64_branch_protection_string
12321 = ptr->x_aarch64_branch_protection_string;
12322 if (opts->x_aarch64_branch_protection_string)
12323 {
12324 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12325 NULL);
12326 }
12327
12328 aarch64_override_options_internal (opts);
12329 }
12330
12331 /* Implement TARGET_OPTION_PRINT. */
12332
12333 static void
12334 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12335 {
12336 const struct processor *cpu
12337 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12338 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
12339 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12340 std::string extension
12341 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12342
12343 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12344 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12345 arch->name, extension.c_str ());
12346 }
12347
12348 static GTY(()) tree aarch64_previous_fndecl;
12349
12350 void
12351 aarch64_reset_previous_fndecl (void)
12352 {
12353 aarch64_previous_fndecl = NULL;
12354 }
12355
12356 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12357 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12358 make sure optab availability predicates are recomputed when necessary. */
12359
12360 void
12361 aarch64_save_restore_target_globals (tree new_tree)
12362 {
12363 if (TREE_TARGET_GLOBALS (new_tree))
12364 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12365 else if (new_tree == target_option_default_node)
12366 restore_target_globals (&default_target_globals);
12367 else
12368 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12369 }
12370
12371 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12372 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12373 of the function, if such exists. This function may be called multiple
12374 times on a single function so use aarch64_previous_fndecl to avoid
12375 setting up identical state. */
12376
12377 static void
12378 aarch64_set_current_function (tree fndecl)
12379 {
12380 if (!fndecl || fndecl == aarch64_previous_fndecl)
12381 return;
12382
12383 tree old_tree = (aarch64_previous_fndecl
12384 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12385 : NULL_TREE);
12386
12387 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12388
12389 /* If current function has no attributes but the previous one did,
12390 use the default node. */
12391 if (!new_tree && old_tree)
12392 new_tree = target_option_default_node;
12393
12394 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12395 the default have been handled by aarch64_save_restore_target_globals from
12396 aarch64_pragma_target_parse. */
12397 if (old_tree == new_tree)
12398 return;
12399
12400 aarch64_previous_fndecl = fndecl;
12401
12402 /* First set the target options. */
12403 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12404
12405 aarch64_save_restore_target_globals (new_tree);
12406 }
12407
12408 /* Enum describing the various ways we can handle attributes.
12409 In many cases we can reuse the generic option handling machinery. */
12410
12411 enum aarch64_attr_opt_type
12412 {
12413 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12414 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12415 aarch64_attr_enum, /* Attribute sets an enum variable. */
12416 aarch64_attr_custom /* Attribute requires a custom handling function. */
12417 };
12418
12419 /* All the information needed to handle a target attribute.
12420 NAME is the name of the attribute.
12421 ATTR_TYPE specifies the type of behavior of the attribute as described
12422 in the definition of enum aarch64_attr_opt_type.
12423 ALLOW_NEG is true if the attribute supports a "no-" form.
12424 HANDLER is the function that takes the attribute string as an argument.
12425 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12426 OPT_NUM is the enum specifying the option that the attribute modifies.
12427 This is needed for attributes that mirror the behavior of a command-line
12428 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12429 aarch64_attr_enum. */
12430
12431 struct aarch64_attribute_info
12432 {
12433 const char *name;
12434 enum aarch64_attr_opt_type attr_type;
12435 bool allow_neg;
12436 bool (*handler) (const char *);
12437 enum opt_code opt_num;
12438 };
12439
12440 /* Handle the ARCH_STR argument to the arch= target attribute. */
12441
12442 static bool
12443 aarch64_handle_attr_arch (const char *str)
12444 {
12445 const struct processor *tmp_arch = NULL;
12446 std::string invalid_extension;
12447 enum aarch64_parse_opt_result parse_res
12448 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12449
12450 if (parse_res == AARCH64_PARSE_OK)
12451 {
12452 gcc_assert (tmp_arch);
12453 selected_arch = tmp_arch;
12454 explicit_arch = selected_arch->arch;
12455 return true;
12456 }
12457
12458 switch (parse_res)
12459 {
12460 case AARCH64_PARSE_MISSING_ARG:
12461 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12462 break;
12463 case AARCH64_PARSE_INVALID_ARG:
12464 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12465 aarch64_print_hint_for_arch (str);
12466 break;
12467 case AARCH64_PARSE_INVALID_FEATURE:
12468 error ("invalid feature modifier %s of value (\"%s\") in "
12469 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12470 aarch64_print_hint_for_extensions (invalid_extension);
12471 break;
12472 default:
12473 gcc_unreachable ();
12474 }
12475
12476 return false;
12477 }
12478
12479 /* Handle the argument CPU_STR to the cpu= target attribute. */
12480
12481 static bool
12482 aarch64_handle_attr_cpu (const char *str)
12483 {
12484 const struct processor *tmp_cpu = NULL;
12485 std::string invalid_extension;
12486 enum aarch64_parse_opt_result parse_res
12487 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12488
12489 if (parse_res == AARCH64_PARSE_OK)
12490 {
12491 gcc_assert (tmp_cpu);
12492 selected_tune = tmp_cpu;
12493 explicit_tune_core = selected_tune->ident;
12494
12495 selected_arch = &all_architectures[tmp_cpu->arch];
12496 explicit_arch = selected_arch->arch;
12497 return true;
12498 }
12499
12500 switch (parse_res)
12501 {
12502 case AARCH64_PARSE_MISSING_ARG:
12503 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12504 break;
12505 case AARCH64_PARSE_INVALID_ARG:
12506 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12507 aarch64_print_hint_for_core (str);
12508 break;
12509 case AARCH64_PARSE_INVALID_FEATURE:
12510 error ("invalid feature modifier %s of value (\"%s\") in "
12511 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12512 aarch64_print_hint_for_extensions (invalid_extension);
12513 break;
12514 default:
12515 gcc_unreachable ();
12516 }
12517
12518 return false;
12519 }
12520
12521 /* Handle the argument STR to the branch-protection= attribute. */
12522
12523 static bool
12524 aarch64_handle_attr_branch_protection (const char* str)
12525 {
12526 char *err_str = (char *) xmalloc (strlen (str) + 1);
12527 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12528 &err_str);
12529 bool success = false;
12530 switch (res)
12531 {
12532 case AARCH64_PARSE_MISSING_ARG:
12533 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12534 " attribute");
12535 break;
12536 case AARCH64_PARSE_INVALID_ARG:
12537 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12538 "=\")%> pragma or attribute", err_str);
12539 break;
12540 case AARCH64_PARSE_OK:
12541 success = true;
12542 /* Fall through. */
12543 case AARCH64_PARSE_INVALID_FEATURE:
12544 break;
12545 default:
12546 gcc_unreachable ();
12547 }
12548 free (err_str);
12549 return success;
12550 }
12551
12552 /* Handle the argument STR to the tune= target attribute. */
12553
12554 static bool
12555 aarch64_handle_attr_tune (const char *str)
12556 {
12557 const struct processor *tmp_tune = NULL;
12558 enum aarch64_parse_opt_result parse_res
12559 = aarch64_parse_tune (str, &tmp_tune);
12560
12561 if (parse_res == AARCH64_PARSE_OK)
12562 {
12563 gcc_assert (tmp_tune);
12564 selected_tune = tmp_tune;
12565 explicit_tune_core = selected_tune->ident;
12566 return true;
12567 }
12568
12569 switch (parse_res)
12570 {
12571 case AARCH64_PARSE_INVALID_ARG:
12572 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12573 aarch64_print_hint_for_core (str);
12574 break;
12575 default:
12576 gcc_unreachable ();
12577 }
12578
12579 return false;
12580 }
12581
12582 /* Parse an architecture extension target attribute string specified in STR.
12583 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12584 if successful. Update aarch64_isa_flags to reflect the ISA features
12585 modified. */
12586
12587 static bool
12588 aarch64_handle_attr_isa_flags (char *str)
12589 {
12590 enum aarch64_parse_opt_result parse_res;
12591 uint64_t isa_flags = aarch64_isa_flags;
12592
12593 /* We allow "+nothing" in the beginning to clear out all architectural
12594 features if the user wants to handpick specific features. */
12595 if (strncmp ("+nothing", str, 8) == 0)
12596 {
12597 isa_flags = 0;
12598 str += 8;
12599 }
12600
12601 std::string invalid_extension;
12602 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12603
12604 if (parse_res == AARCH64_PARSE_OK)
12605 {
12606 aarch64_isa_flags = isa_flags;
12607 return true;
12608 }
12609
12610 switch (parse_res)
12611 {
12612 case AARCH64_PARSE_MISSING_ARG:
12613 error ("missing value in %<target()%> pragma or attribute");
12614 break;
12615
12616 case AARCH64_PARSE_INVALID_FEATURE:
12617 error ("invalid feature modifier %s of value (\"%s\") in "
12618 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12619 break;
12620
12621 default:
12622 gcc_unreachable ();
12623 }
12624
12625 return false;
12626 }
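/* Editorial sketch, not part of the original source: the "+nothing" prefix
   handled above lets the user hand-pick features from a clean slate, e.g.

     #pragma GCC target ("+nothing+simd")

   first clears the accumulated ISA flags and then re-enables only the SIMD
   extension (plus whatever it implies).  The feature name is illustrative.  */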
12627
12628 /* The target attributes that we support. On top of these we also support just
12629 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12630 handled explicitly in aarch64_process_one_target_attr. */
12631
12632 static const struct aarch64_attribute_info aarch64_attributes[] =
12633 {
12634 { "general-regs-only", aarch64_attr_mask, false, NULL,
12635 OPT_mgeneral_regs_only },
12636 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12637 OPT_mfix_cortex_a53_835769 },
12638 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12639 OPT_mfix_cortex_a53_843419 },
12640 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12641 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12642 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12643 OPT_momit_leaf_frame_pointer },
12644 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12645 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12646 OPT_march_ },
12647 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12648 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12649 OPT_mtune_ },
12650 { "branch-protection", aarch64_attr_custom, false,
12651 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12652 { "sign-return-address", aarch64_attr_enum, false, NULL,
12653 OPT_msign_return_address_ },
12654 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12655 };
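/* Editorial note, not part of the original source: reading the table above,
   the "fix-cortex-a53-835769" entry is an aarch64_attr_bool with allow_neg
   set, so both target ("fix-cortex-a53-835769") and
   target ("no-fix-cortex-a53-835769") are accepted and behave like the
   corresponding -mfix-cortex-a53-835769 command-line option; "cmodel", by
   contrast, is an aarch64_attr_enum with allow_neg clear, so it requires an
   argument and has no "no-" form.  */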
12656
12657 /* Parse ARG_STR which contains the definition of one target attribute.
12658 Show appropriate errors if any or return true if the attribute is valid. */
12659
12660 static bool
12661 aarch64_process_one_target_attr (char *arg_str)
12662 {
12663 bool invert = false;
12664
12665 size_t len = strlen (arg_str);
12666
12667 if (len == 0)
12668 {
12669 error ("malformed %<target()%> pragma or attribute");
12670 return false;
12671 }
12672
12673 char *str_to_check = (char *) alloca (len + 1);
12674 strcpy (str_to_check, arg_str);
12675
12676 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12677 It is easier to detect and handle it explicitly here rather than going
12678 through the machinery for the rest of the target attributes in this
12679 function. */
12680 if (*str_to_check == '+')
12681 return aarch64_handle_attr_isa_flags (str_to_check);
12682
12683 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12684 {
12685 invert = true;
12686 str_to_check += 3;
12687 }
12688 char *arg = strchr (str_to_check, '=');
12689
12690 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12691 and point ARG to "foo". */
12692 if (arg)
12693 {
12694 *arg = '\0';
12695 arg++;
12696 }
12697 const struct aarch64_attribute_info *p_attr;
12698 bool found = false;
12699 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12700 {
12701 /* If the names don't match up, or the user has given an argument
12702 to an attribute that doesn't accept one, or didn't give an argument
12703 to an attribute that expects one, fail to match. */
12704 if (strcmp (str_to_check, p_attr->name) != 0)
12705 continue;
12706
12707 found = true;
12708 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12709 || p_attr->attr_type == aarch64_attr_enum;
12710
12711 if (attr_need_arg_p ^ (arg != NULL))
12712 {
12713 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12714 return false;
12715 }
12716
12717 /* If the name matches but the attribute does not allow "no-" versions
12718 then we can't match. */
12719 if (invert && !p_attr->allow_neg)
12720 {
12721 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12722 return false;
12723 }
12724
12725 switch (p_attr->attr_type)
12726 {
12727 /* Has a custom handler registered.
12728 For example, cpu=, arch=, tune=. */
12729 case aarch64_attr_custom:
12730 gcc_assert (p_attr->handler);
12731 if (!p_attr->handler (arg))
12732 return false;
12733 break;
12734
12735 /* Either set or unset a boolean option. */
12736 case aarch64_attr_bool:
12737 {
12738 struct cl_decoded_option decoded;
12739
12740 generate_option (p_attr->opt_num, NULL, !invert,
12741 CL_TARGET, &decoded);
12742 aarch64_handle_option (&global_options, &global_options_set,
12743 &decoded, input_location);
12744 break;
12745 }
12746 /* Set or unset a bit in the target_flags. aarch64_handle_option
12747 should know what mask to apply given the option number. */
12748 case aarch64_attr_mask:
12749 {
12750 struct cl_decoded_option decoded;
12751 /* We only need to specify the option number.
12752 aarch64_handle_option will know which mask to apply. */
12753 decoded.opt_index = p_attr->opt_num;
12754 decoded.value = !invert;
12755 aarch64_handle_option (&global_options, &global_options_set,
12756 &decoded, input_location);
12757 break;
12758 }
12759 /* Use the option setting machinery to set an option to an enum. */
12760 case aarch64_attr_enum:
12761 {
12762 gcc_assert (arg);
12763 bool valid;
12764 int value;
12765 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12766 &value, CL_TARGET);
12767 if (valid)
12768 {
12769 set_option (&global_options, NULL, p_attr->opt_num, value,
12770 NULL, DK_UNSPECIFIED, input_location,
12771 global_dc);
12772 }
12773 else
12774 {
12775 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12776 }
12777 break;
12778 }
12779 default:
12780 gcc_unreachable ();
12781 }
12782 }
12783
12784 /* If we reached here we either have found an attribute and validated
12785 it or didn't match any. If we matched an attribute but its arguments
12786 were malformed we will have returned false already. */
12787 return found;
12788 }
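/* Editorial sketch, not part of the original source: examples of single
   attribute strings the routine above accepts (all names illustrative):

     "cpu=cortex-a57"              -> aarch64_attr_custom, handled by
                                      aarch64_handle_attr_cpu
     "no-omit-leaf-frame-pointer"  -> aarch64_attr_bool, negated form
     "tls-dialect=desc"            -> aarch64_attr_enum, argument required
     "+crc+crypto"                 -> ISA-flag shortcut, dispatched to
                                      aarch64_handle_attr_isa_flags before
                                      the table is consulted.  */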
12789
12790 /* Count how many times the character C appears in
12791 NULL-terminated string STR. */
12792
12793 static unsigned int
12794 num_occurences_in_str (char c, char *str)
12795 {
12796 unsigned int res = 0;
12797 while (*str != '\0')
12798 {
12799 if (*str == c)
12800 res++;
12801
12802 str++;
12803 }
12804
12805 return res;
12806 }
12807
12808 /* Parse the tree in ARGS that contains the target attribute information
12809 and update the global target options space. */
12810
12811 bool
12812 aarch64_process_target_attr (tree args)
12813 {
12814 if (TREE_CODE (args) == TREE_LIST)
12815 {
12816 do
12817 {
12818 tree head = TREE_VALUE (args);
12819 if (head)
12820 {
12821 if (!aarch64_process_target_attr (head))
12822 return false;
12823 }
12824 args = TREE_CHAIN (args);
12825 } while (args);
12826
12827 return true;
12828 }
12829
12830 if (TREE_CODE (args) != STRING_CST)
12831 {
12832 error ("attribute %<target%> argument not a string");
12833 return false;
12834 }
12835
12836 size_t len = strlen (TREE_STRING_POINTER (args));
12837 char *str_to_check = (char *) alloca (len + 1);
12838 strcpy (str_to_check, TREE_STRING_POINTER (args));
12839
12840 if (len == 0)
12841 {
12842 error ("malformed %<target()%> pragma or attribute");
12843 return false;
12844 }
12845
12846   /* Used to catch empty strings between commas, i.e.
12847 attribute ((target ("attr1,,attr2"))). */
12848 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12849
12850 /* Handle multiple target attributes separated by ','. */
12851 char *token = strtok_r (str_to_check, ",", &str_to_check);
12852
12853 unsigned int num_attrs = 0;
12854 while (token)
12855 {
12856 num_attrs++;
12857 if (!aarch64_process_one_target_attr (token))
12858 {
12859 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12860 return false;
12861 }
12862
12863 token = strtok_r (NULL, ",", &str_to_check);
12864 }
12865
12866 if (num_attrs != num_commas + 1)
12867 {
12868 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12869 return false;
12870 }
12871
12872 return true;
12873 }
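/* Editorial note, not part of the original source: the num_commas check
   above is what catches an empty entry such as
   target ("arch=armv8-a,,no-strict-align"): strtok_r silently skips the
   empty token, so without the check the malformed string would parse as if
   the stray comma were not there.  The attribute names are illustrative.  */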
12874
12875 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12876 process attribute ((target ("..."))). */
12877
12878 static bool
12879 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12880 {
12881 struct cl_target_option cur_target;
12882 bool ret;
12883 tree old_optimize;
12884 tree new_target, new_optimize;
12885 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12886
12887 /* If what we're processing is the current pragma string then the
12888 target option node is already stored in target_option_current_node
12889 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12890 having to re-parse the string. This is especially useful to keep
12891 arm_neon.h compile times down since that header contains a lot
12892 of intrinsics enclosed in pragmas. */
12893 if (!existing_target && args == current_target_pragma)
12894 {
12895 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12896 return true;
12897 }
12898 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12899
12900 old_optimize = build_optimization_node (&global_options);
12901 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12902
12903 /* If the function changed the optimization levels as well as setting
12904 target options, start with the optimizations specified. */
12905 if (func_optimize && func_optimize != old_optimize)
12906 cl_optimization_restore (&global_options,
12907 TREE_OPTIMIZATION (func_optimize));
12908
12909 /* Save the current target options to restore at the end. */
12910 cl_target_option_save (&cur_target, &global_options);
12911
12912 /* If fndecl already has some target attributes applied to it, unpack
12913 them so that we add this attribute on top of them, rather than
12914 overwriting them. */
12915 if (existing_target)
12916 {
12917 struct cl_target_option *existing_options
12918 = TREE_TARGET_OPTION (existing_target);
12919
12920 if (existing_options)
12921 cl_target_option_restore (&global_options, existing_options);
12922 }
12923 else
12924 cl_target_option_restore (&global_options,
12925 TREE_TARGET_OPTION (target_option_current_node));
12926
12927 ret = aarch64_process_target_attr (args);
12928
12929 /* Set up any additional state. */
12930 if (ret)
12931 {
12932 aarch64_override_options_internal (&global_options);
12933 /* Initialize SIMD builtins if we haven't already.
12934 Set current_target_pragma to NULL for the duration so that
12935 the builtin initialization code doesn't try to tag the functions
12936 being built with the attributes specified by any current pragma, thus
12937 going into an infinite recursion. */
12938 if (TARGET_SIMD)
12939 {
12940 tree saved_current_target_pragma = current_target_pragma;
12941 current_target_pragma = NULL;
12942 aarch64_init_simd_builtins ();
12943 current_target_pragma = saved_current_target_pragma;
12944 }
12945 new_target = build_target_option_node (&global_options);
12946 }
12947 else
12948 new_target = NULL;
12949
12950 new_optimize = build_optimization_node (&global_options);
12951
12952 if (fndecl && ret)
12953 {
12954 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12955
12956 if (old_optimize != new_optimize)
12957 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12958 }
12959
12960 cl_target_option_restore (&global_options, &cur_target);
12961
12962 if (old_optimize != new_optimize)
12963 cl_optimization_restore (&global_options,
12964 TREE_OPTIMIZATION (old_optimize));
12965 return ret;
12966 }
12967
12968 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
12969 the values of a tri-bool option (yes, no, don't care) and the default
12970 value is DEF, determine whether to reject inlining.  */
12971
12972 static bool
12973 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12974 int dont_care, int def)
12975 {
12976 /* If the callee doesn't care, always allow inlining. */
12977 if (callee == dont_care)
12978 return true;
12979
12980 /* If the caller doesn't care, always allow inlining. */
12981 if (caller == dont_care)
12982 return true;
12983
12984 /* Otherwise, allow inlining if either the callee and caller values
12985 agree, or if the callee is using the default value. */
12986 return (callee == caller || callee == def);
12987 }
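/* Editorial sketch, not part of the original source: for the
   -momit-leaf-frame-pointer check below, DONT_CARE is 2 and DEF is 1, so a
   callee with no explicit setting (2) never blocks inlining, a callee using
   the default (1) is always acceptable, and only an explicit mismatch
   between caller and callee rejects the inline.  */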
12988
12989 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12990 to inline CALLEE into CALLER based on target-specific info.
12991 Make sure that the caller and callee have compatible architectural
12992 features. Then go through the other possible target attributes
12993 and see if they can block inlining. Try not to reject always_inline
12994 callees unless they are incompatible architecturally. */
12995
12996 static bool
12997 aarch64_can_inline_p (tree caller, tree callee)
12998 {
12999 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13000 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13001
13002 struct cl_target_option *caller_opts
13003 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13004 : target_option_default_node);
13005
13006 struct cl_target_option *callee_opts
13007 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13008 : target_option_default_node);
13009
13010 /* Callee's ISA flags should be a subset of the caller's. */
13011 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13012 != callee_opts->x_aarch64_isa_flags)
13013 return false;
13014
13015   /* Allow non-strict-align functions to be inlined into strict-align
13016      ones.  */
13017 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13018 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13019 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13020 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13021 return false;
13022
13023 bool always_inline = lookup_attribute ("always_inline",
13024 DECL_ATTRIBUTES (callee));
13025
13026 /* If the architectural features match up and the callee is always_inline
13027 then the other attributes don't matter. */
13028 if (always_inline)
13029 return true;
13030
13031 if (caller_opts->x_aarch64_cmodel_var
13032 != callee_opts->x_aarch64_cmodel_var)
13033 return false;
13034
13035 if (caller_opts->x_aarch64_tls_dialect
13036 != callee_opts->x_aarch64_tls_dialect)
13037 return false;
13038
13039   /* Honour explicit requests to work around errata.  */
13040 if (!aarch64_tribools_ok_for_inlining_p (
13041 caller_opts->x_aarch64_fix_a53_err835769,
13042 callee_opts->x_aarch64_fix_a53_err835769,
13043 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13044 return false;
13045
13046 if (!aarch64_tribools_ok_for_inlining_p (
13047 caller_opts->x_aarch64_fix_a53_err843419,
13048 callee_opts->x_aarch64_fix_a53_err843419,
13049 2, TARGET_FIX_ERR_A53_843419))
13050 return false;
13051
13052 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13053      caller and callee and they don't match up, reject inlining.  */
13054 if (!aarch64_tribools_ok_for_inlining_p (
13055 caller_opts->x_flag_omit_leaf_frame_pointer,
13056 callee_opts->x_flag_omit_leaf_frame_pointer,
13057 2, 1))
13058 return false;
13059
13060 /* If the callee has specific tuning overrides, respect them. */
13061 if (callee_opts->x_aarch64_override_tune_string != NULL
13062 && caller_opts->x_aarch64_override_tune_string == NULL)
13063 return false;
13064
13065 /* If the user specified tuning override strings for the
13066 caller and callee and they don't match up, reject inlining.
13067 We just do a string compare here, we don't analyze the meaning
13068 of the string, as it would be too costly for little gain. */
13069 if (callee_opts->x_aarch64_override_tune_string
13070 && caller_opts->x_aarch64_override_tune_string
13071 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13072 caller_opts->x_aarch64_override_tune_string) != 0))
13073 return false;
13074
13075 return true;
13076 }
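/* Editorial note, not part of the original source: the ISA-subset test at
   the top of aarch64_can_inline_p means, for example, that a callee built
   with target ("+crc") can be inlined into a caller built with
   target ("+crc+crypto"), but not vice versa, since the caller must provide
   every feature the callee was compiled to assume.  Feature names are
   illustrative.  */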
13077
13078 /* Return true if SYMBOL_REF X binds locally. */
13079
13080 static bool
13081 aarch64_symbol_binds_local_p (const_rtx x)
13082 {
13083 return (SYMBOL_REF_DECL (x)
13084 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13085 : SYMBOL_REF_LOCAL_P (x));
13086 }
13087
13088 /* Return true if SYMBOL_REF X is thread-local.  */
13089 static bool
13090 aarch64_tls_symbol_p (rtx x)
13091 {
13092 if (! TARGET_HAVE_TLS)
13093 return false;
13094
13095 if (GET_CODE (x) != SYMBOL_REF)
13096 return false;
13097
13098 return SYMBOL_REF_TLS_MODEL (x) != 0;
13099 }
13100
13101 /* Classify a TLS symbol into one of the TLS kinds. */
13102 enum aarch64_symbol_type
13103 aarch64_classify_tls_symbol (rtx x)
13104 {
13105 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13106
13107 switch (tls_kind)
13108 {
13109 case TLS_MODEL_GLOBAL_DYNAMIC:
13110 case TLS_MODEL_LOCAL_DYNAMIC:
13111 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13112
13113 case TLS_MODEL_INITIAL_EXEC:
13114 switch (aarch64_cmodel)
13115 {
13116 case AARCH64_CMODEL_TINY:
13117 case AARCH64_CMODEL_TINY_PIC:
13118 return SYMBOL_TINY_TLSIE;
13119 default:
13120 return SYMBOL_SMALL_TLSIE;
13121 }
13122
13123 case TLS_MODEL_LOCAL_EXEC:
13124 if (aarch64_tls_size == 12)
13125 return SYMBOL_TLSLE12;
13126 else if (aarch64_tls_size == 24)
13127 return SYMBOL_TLSLE24;
13128 else if (aarch64_tls_size == 32)
13129 return SYMBOL_TLSLE32;
13130 else if (aarch64_tls_size == 48)
13131 return SYMBOL_TLSLE48;
13132 else
13133 gcc_unreachable ();
13134
13135 case TLS_MODEL_EMULATED:
13136 case TLS_MODEL_NONE:
13137 return SYMBOL_FORCE_TO_MEM;
13138
13139 default:
13140 gcc_unreachable ();
13141 }
13142 }
13143
13144 /* Return the correct method for accessing X + OFFSET, where X is either
13145 a SYMBOL_REF or LABEL_REF. */
13146
13147 enum aarch64_symbol_type
13148 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13149 {
13150 if (GET_CODE (x) == LABEL_REF)
13151 {
13152 switch (aarch64_cmodel)
13153 {
13154 case AARCH64_CMODEL_LARGE:
13155 return SYMBOL_FORCE_TO_MEM;
13156
13157 case AARCH64_CMODEL_TINY_PIC:
13158 case AARCH64_CMODEL_TINY:
13159 return SYMBOL_TINY_ABSOLUTE;
13160
13161 case AARCH64_CMODEL_SMALL_SPIC:
13162 case AARCH64_CMODEL_SMALL_PIC:
13163 case AARCH64_CMODEL_SMALL:
13164 return SYMBOL_SMALL_ABSOLUTE;
13165
13166 default:
13167 gcc_unreachable ();
13168 }
13169 }
13170
13171 if (GET_CODE (x) == SYMBOL_REF)
13172 {
13173 if (aarch64_tls_symbol_p (x))
13174 return aarch64_classify_tls_symbol (x);
13175
13176 switch (aarch64_cmodel)
13177 {
13178 case AARCH64_CMODEL_TINY:
13179 /* When we retrieve symbol + offset address, we have to make sure
13180 the offset does not cause overflow of the final address. But
13181 we have no way of knowing the address of symbol at compile time
13182 so we can't accurately say if the distance between the PC and
13183 	     symbol + offset is outside the addressable range of +/-1M in the
13184 TINY code model. So we rely on images not being greater than
13185 1M and cap the offset at 1M and anything beyond 1M will have to
13186 be loaded using an alternative mechanism. Furthermore if the
13187 symbol is a weak reference to something that isn't known to
13188 resolve to a symbol in this module, then force to memory. */
13189 if ((SYMBOL_REF_WEAK (x)
13190 && !aarch64_symbol_binds_local_p (x))
13191 || !IN_RANGE (offset, -1048575, 1048575))
13192 return SYMBOL_FORCE_TO_MEM;
13193 return SYMBOL_TINY_ABSOLUTE;
13194
13195 case AARCH64_CMODEL_SMALL:
13196 /* Same reasoning as the tiny code model, but the offset cap here is
13197 4G. */
13198 if ((SYMBOL_REF_WEAK (x)
13199 && !aarch64_symbol_binds_local_p (x))
13200 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13201 HOST_WIDE_INT_C (4294967264)))
13202 return SYMBOL_FORCE_TO_MEM;
13203 return SYMBOL_SMALL_ABSOLUTE;
13204
13205 case AARCH64_CMODEL_TINY_PIC:
13206 if (!aarch64_symbol_binds_local_p (x))
13207 return SYMBOL_TINY_GOT;
13208 return SYMBOL_TINY_ABSOLUTE;
13209
13210 case AARCH64_CMODEL_SMALL_SPIC:
13211 case AARCH64_CMODEL_SMALL_PIC:
13212 if (!aarch64_symbol_binds_local_p (x))
13213 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13214 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13215 return SYMBOL_SMALL_ABSOLUTE;
13216
13217 case AARCH64_CMODEL_LARGE:
13218 /* This is alright even in PIC code as the constant
13219 pool reference is always PC relative and within
13220 the same translation unit. */
13221 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13222 return SYMBOL_SMALL_ABSOLUTE;
13223 else
13224 return SYMBOL_FORCE_TO_MEM;
13225
13226 default:
13227 gcc_unreachable ();
13228 }
13229 }
13230
13231 /* By default push everything into the constant pool. */
13232 return SYMBOL_FORCE_TO_MEM;
13233 }
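/* Editorial note, not part of the original source: the offset caps above
   roughly mirror the reach of the addressing sequences used by each code
   model: +/-1M for the ADR-based tiny model and about 4G for the
   ADRP+ADD-based small model.  Symbols whose offsets fall outside those
   windows, and weak symbols that may resolve outside the module, are
   accessed through the constant pool or GOT instead.  */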
13234
13235 bool
13236 aarch64_constant_address_p (rtx x)
13237 {
13238 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13239 }
13240
13241 bool
13242 aarch64_legitimate_pic_operand_p (rtx x)
13243 {
13244 if (GET_CODE (x) == SYMBOL_REF
13245 || (GET_CODE (x) == CONST
13246 && GET_CODE (XEXP (x, 0)) == PLUS
13247 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13248 return false;
13249
13250 return true;
13251 }
13252
13253 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13254 that should be rematerialized rather than spilled. */
13255
13256 static bool
13257 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13258 {
13259 /* Support CSE and rematerialization of common constants. */
13260 if (CONST_INT_P (x)
13261 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13262 || GET_CODE (x) == CONST_VECTOR)
13263 return true;
13264
13265 /* Do not allow vector struct mode constants for Advanced SIMD.
13266 We could support 0 and -1 easily, but they need support in
13267 aarch64-simd.md. */
13268 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13269 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13270 return false;
13271
13272 /* Only accept variable-length vector constants if they can be
13273 handled directly.
13274
13275 ??? It would be possible to handle rematerialization of other
13276 constants via secondary reloads. */
13277 if (vec_flags & VEC_ANY_SVE)
13278 return aarch64_simd_valid_immediate (x, NULL);
13279
13280 if (GET_CODE (x) == HIGH)
13281 x = XEXP (x, 0);
13282
13283 /* Accept polynomial constants that can be calculated by using the
13284 destination of a move as the sole temporary. Constants that
13285 require a second temporary cannot be rematerialized (they can't be
13286 forced to memory and also aren't legitimate constants). */
13287 poly_int64 offset;
13288 if (poly_int_rtx_p (x, &offset))
13289 return aarch64_offset_temporaries (false, offset) <= 1;
13290
13291 /* If an offset is being added to something else, we need to allow the
13292 base to be moved into the destination register, meaning that there
13293 are no free temporaries for the offset. */
13294 x = strip_offset (x, &offset);
13295 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13296 return false;
13297
13298 /* Do not allow const (plus (anchor_symbol, const_int)). */
13299 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13300 return false;
13301
13302 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13303 so spilling them is better than rematerialization. */
13304 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13305 return true;
13306
13307 /* Label references are always constant. */
13308 if (GET_CODE (x) == LABEL_REF)
13309 return true;
13310
13311 return false;
13312 }
13313
13314 rtx
13315 aarch64_load_tp (rtx target)
13316 {
13317 if (!target
13318 || GET_MODE (target) != Pmode
13319 || !register_operand (target, Pmode))
13320 target = gen_reg_rtx (Pmode);
13321
13322 /* Can return in any reg. */
13323 emit_insn (gen_aarch64_load_tp_hard (target));
13324 return target;
13325 }
13326
13327 /* On AAPCS systems, this is the "struct __va_list". */
13328 static GTY(()) tree va_list_type;
13329
13330 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13331 Return the type to use as __builtin_va_list.
13332
13333 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13334
13335 struct __va_list
13336 {
13337 void *__stack;
13338 void *__gr_top;
13339 void *__vr_top;
13340 int __gr_offs;
13341 int __vr_offs;
13342 }; */
13343
13344 static tree
13345 aarch64_build_builtin_va_list (void)
13346 {
13347 tree va_list_name;
13348 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13349
13350 /* Create the type. */
13351 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13352 /* Give it the required name. */
13353 va_list_name = build_decl (BUILTINS_LOCATION,
13354 TYPE_DECL,
13355 get_identifier ("__va_list"),
13356 va_list_type);
13357 DECL_ARTIFICIAL (va_list_name) = 1;
13358 TYPE_NAME (va_list_type) = va_list_name;
13359 TYPE_STUB_DECL (va_list_type) = va_list_name;
13360
13361 /* Create the fields. */
13362 f_stack = build_decl (BUILTINS_LOCATION,
13363 FIELD_DECL, get_identifier ("__stack"),
13364 ptr_type_node);
13365 f_grtop = build_decl (BUILTINS_LOCATION,
13366 FIELD_DECL, get_identifier ("__gr_top"),
13367 ptr_type_node);
13368 f_vrtop = build_decl (BUILTINS_LOCATION,
13369 FIELD_DECL, get_identifier ("__vr_top"),
13370 ptr_type_node);
13371 f_groff = build_decl (BUILTINS_LOCATION,
13372 FIELD_DECL, get_identifier ("__gr_offs"),
13373 integer_type_node);
13374 f_vroff = build_decl (BUILTINS_LOCATION,
13375 FIELD_DECL, get_identifier ("__vr_offs"),
13376 integer_type_node);
13377
13378 /* Tell tree-stdarg pass about our internal offset fields.
13379      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13380      purposes, to identify whether the code is updating the va_list internal
13381      offset fields in an irregular way.  */
13382 va_list_gpr_counter_field = f_groff;
13383 va_list_fpr_counter_field = f_vroff;
13384
13385 DECL_ARTIFICIAL (f_stack) = 1;
13386 DECL_ARTIFICIAL (f_grtop) = 1;
13387 DECL_ARTIFICIAL (f_vrtop) = 1;
13388 DECL_ARTIFICIAL (f_groff) = 1;
13389 DECL_ARTIFICIAL (f_vroff) = 1;
13390
13391 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13392 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13393 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13394 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13395 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13396
13397 TYPE_FIELDS (va_list_type) = f_stack;
13398 DECL_CHAIN (f_stack) = f_grtop;
13399 DECL_CHAIN (f_grtop) = f_vrtop;
13400 DECL_CHAIN (f_vrtop) = f_groff;
13401 DECL_CHAIN (f_groff) = f_vroff;
13402
13403 /* Compute its layout. */
13404 layout_type (va_list_type);
13405
13406 return va_list_type;
13407 }
13408
13409 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13410 static void
13411 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13412 {
13413 const CUMULATIVE_ARGS *cum;
13414 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13415 tree stack, grtop, vrtop, groff, vroff;
13416 tree t;
13417 int gr_save_area_size = cfun->va_list_gpr_size;
13418 int vr_save_area_size = cfun->va_list_fpr_size;
13419 int vr_offset;
13420
13421 cum = &crtl->args.info;
13422 if (cfun->va_list_gpr_size)
13423 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13424 cfun->va_list_gpr_size);
13425 if (cfun->va_list_fpr_size)
13426 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13427 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13428
13429 if (!TARGET_FLOAT)
13430 {
13431 gcc_assert (cum->aapcs_nvrn == 0);
13432 vr_save_area_size = 0;
13433 }
13434
13435 f_stack = TYPE_FIELDS (va_list_type_node);
13436 f_grtop = DECL_CHAIN (f_stack);
13437 f_vrtop = DECL_CHAIN (f_grtop);
13438 f_groff = DECL_CHAIN (f_vrtop);
13439 f_vroff = DECL_CHAIN (f_groff);
13440
13441 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13442 NULL_TREE);
13443 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13444 NULL_TREE);
13445 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13446 NULL_TREE);
13447 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13448 NULL_TREE);
13449 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13450 NULL_TREE);
13451
13452 /* Emit code to initialize STACK, which points to the next varargs stack
13453 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13454 by named arguments. STACK is 8-byte aligned. */
13455 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13456 if (cum->aapcs_stack_size > 0)
13457 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13458 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13459 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13460
13461 /* Emit code to initialize GRTOP, the top of the GR save area.
13462 virtual_incoming_args_rtx should have been 16 byte aligned. */
13463 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13464 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13465 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13466
13467 /* Emit code to initialize VRTOP, the top of the VR save area.
13468 This address is gr_save_area_bytes below GRTOP, rounded
13469 down to the next 16-byte boundary. */
13470 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13471 vr_offset = ROUND_UP (gr_save_area_size,
13472 STACK_BOUNDARY / BITS_PER_UNIT);
13473
13474 if (vr_offset)
13475 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13476 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13477 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13478
13479 /* Emit code to initialize GROFF, the offset from GRTOP of the
13480 next GPR argument. */
13481 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13482 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13483 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13484
13485   /* Likewise emit code to initialize VROFF, the offset from VRTOP
13486 of the next VR argument. */
13487 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13488 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13489 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13490 }
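/* Editorial sketch, not part of the original source: the va_start expansion
   above initializes the __va_list record roughly as follows (C-like
   pseudocode, names illustrative):

     ap.__stack   = incoming_args + aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = incoming_args;
     ap.__vr_top  = incoming_args - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;  */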
13491
13492 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13493
13494 static tree
13495 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13496 gimple_seq *post_p ATTRIBUTE_UNUSED)
13497 {
13498 tree addr;
13499 bool indirect_p;
13500 bool is_ha; /* is HFA or HVA. */
13501 bool dw_align; /* double-word align. */
13502 machine_mode ag_mode = VOIDmode;
13503 int nregs;
13504 machine_mode mode;
13505
13506 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13507 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13508 HOST_WIDE_INT size, rsize, adjust, align;
13509 tree t, u, cond1, cond2;
13510
13511 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13512 if (indirect_p)
13513 type = build_pointer_type (type);
13514
13515 mode = TYPE_MODE (type);
13516
13517 f_stack = TYPE_FIELDS (va_list_type_node);
13518 f_grtop = DECL_CHAIN (f_stack);
13519 f_vrtop = DECL_CHAIN (f_grtop);
13520 f_groff = DECL_CHAIN (f_vrtop);
13521 f_vroff = DECL_CHAIN (f_groff);
13522
13523 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13524 f_stack, NULL_TREE);
13525 size = int_size_in_bytes (type);
13526
13527 bool abi_break;
13528 align
13529 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13530
13531 dw_align = false;
13532 adjust = 0;
13533 if (aarch64_vfp_is_call_or_return_candidate (mode,
13534 type,
13535 &ag_mode,
13536 &nregs,
13537 &is_ha))
13538 {
13539 /* No frontends can create types with variable-sized modes, so we
13540 shouldn't be asked to pass or return them. */
13541 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13542
13543 /* TYPE passed in fp/simd registers. */
13544 if (!TARGET_FLOAT)
13545 aarch64_err_no_fpadvsimd (mode);
13546
13547 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13548 unshare_expr (valist), f_vrtop, NULL_TREE);
13549 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13550 unshare_expr (valist), f_vroff, NULL_TREE);
13551
13552 rsize = nregs * UNITS_PER_VREG;
13553
13554 if (is_ha)
13555 {
13556 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13557 adjust = UNITS_PER_VREG - ag_size;
13558 }
13559 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13560 && size < UNITS_PER_VREG)
13561 {
13562 adjust = UNITS_PER_VREG - size;
13563 }
13564 }
13565 else
13566 {
13567 /* TYPE passed in general registers. */
13568 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13569 unshare_expr (valist), f_grtop, NULL_TREE);
13570 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13571 unshare_expr (valist), f_groff, NULL_TREE);
13572 rsize = ROUND_UP (size, UNITS_PER_WORD);
13573 nregs = rsize / UNITS_PER_WORD;
13574
13575 if (align > 8)
13576 {
13577 if (abi_break && warn_psabi)
13578 inform (input_location, "parameter passing for argument of type "
13579 "%qT changed in GCC 9.1", type);
13580 dw_align = true;
13581 }
13582
13583 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13584 && size < UNITS_PER_WORD)
13585 {
13586 adjust = UNITS_PER_WORD - size;
13587 }
13588 }
13589
13590 /* Get a local temporary for the field value. */
13591 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13592
13593 /* Emit code to branch if off >= 0. */
13594 t = build2 (GE_EXPR, boolean_type_node, off,
13595 build_int_cst (TREE_TYPE (off), 0));
13596 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13597
13598 if (dw_align)
13599 {
13600 /* Emit: offs = (offs + 15) & -16. */
13601 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13602 build_int_cst (TREE_TYPE (off), 15));
13603 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13604 build_int_cst (TREE_TYPE (off), -16));
13605 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13606 }
13607 else
13608 roundup = NULL;
13609
13610 /* Update ap.__[g|v]r_offs */
13611 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13612 build_int_cst (TREE_TYPE (off), rsize));
13613 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13614
13615 /* String up. */
13616 if (roundup)
13617 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13618
13619 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13620 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13621 build_int_cst (TREE_TYPE (f_off), 0));
13622 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13623
13624 /* String up: make sure the assignment happens before the use. */
13625 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13626 COND_EXPR_ELSE (cond1) = t;
13627
13628 /* Prepare the trees handling the argument that is passed on the stack;
13629      the top-level node will be stored in ON_STACK.  */
13630 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13631 if (align > 8)
13632 {
13633 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13634 t = fold_build_pointer_plus_hwi (arg, 15);
13635 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13636 build_int_cst (TREE_TYPE (t), -16));
13637 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13638 }
13639 else
13640 roundup = NULL;
13641 /* Advance ap.__stack */
13642 t = fold_build_pointer_plus_hwi (arg, size + 7);
13643 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13644 build_int_cst (TREE_TYPE (t), -8));
13645 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13646 /* String up roundup and advance. */
13647 if (roundup)
13648 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13649 /* String up with arg */
13650 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13651 /* Big-endianness related address adjustment. */
13652 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13653 && size < UNITS_PER_WORD)
13654 {
13655 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13656 size_int (UNITS_PER_WORD - size));
13657 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13658 }
13659
13660 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13661 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13662
13663 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13664 t = off;
13665 if (adjust)
13666 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13667 build_int_cst (TREE_TYPE (off), adjust));
13668
13669 t = fold_convert (sizetype, t);
13670 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13671
13672 if (is_ha)
13673 {
13674 /* type ha; // treat as "struct {ftype field[n];}"
13675 ... [computing offs]
13676 	 for (i = 0; i < nregs; ++i, offs += 16)
13677 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13678 return ha; */
13679 int i;
13680 tree tmp_ha, field_t, field_ptr_t;
13681
13682 /* Declare a local variable. */
13683 tmp_ha = create_tmp_var_raw (type, "ha");
13684 gimple_add_tmp_var (tmp_ha);
13685
13686 /* Establish the base type. */
13687 switch (ag_mode)
13688 {
13689 case E_SFmode:
13690 field_t = float_type_node;
13691 field_ptr_t = float_ptr_type_node;
13692 break;
13693 case E_DFmode:
13694 field_t = double_type_node;
13695 field_ptr_t = double_ptr_type_node;
13696 break;
13697 case E_TFmode:
13698 field_t = long_double_type_node;
13699 field_ptr_t = long_double_ptr_type_node;
13700 break;
13701 case E_HFmode:
13702 field_t = aarch64_fp16_type_node;
13703 field_ptr_t = aarch64_fp16_ptr_type_node;
13704 break;
13705 case E_V2SImode:
13706 case E_V4SImode:
13707 {
13708 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13709 field_t = build_vector_type_for_mode (innertype, ag_mode);
13710 field_ptr_t = build_pointer_type (field_t);
13711 }
13712 break;
13713 default:
13714 gcc_assert (0);
13715 }
13716
13717       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area)  */
13718 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13719 addr = t;
13720 t = fold_convert (field_ptr_t, addr);
13721 t = build2 (MODIFY_EXPR, field_t,
13722 build1 (INDIRECT_REF, field_t, tmp_ha),
13723 build1 (INDIRECT_REF, field_t, t));
13724
13725 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13726 for (i = 1; i < nregs; ++i)
13727 {
13728 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13729 u = fold_convert (field_ptr_t, addr);
13730 u = build2 (MODIFY_EXPR, field_t,
13731 build2 (MEM_REF, field_t, tmp_ha,
13732 build_int_cst (field_ptr_t,
13733 (i *
13734 int_size_in_bytes (field_t)))),
13735 build1 (INDIRECT_REF, field_t, u));
13736 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13737 }
13738
13739 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13740 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13741 }
13742
13743 COND_EXPR_ELSE (cond2) = t;
13744 addr = fold_convert (build_pointer_type (type), cond1);
13745 addr = build_va_arg_indirect_ref (addr);
13746
13747 if (indirect_p)
13748 addr = build_va_arg_indirect_ref (addr);
13749
13750 return addr;
13751 }
13752
13753 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13754
13755 static void
13756 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13757 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13758 int no_rtl)
13759 {
13760 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13761 CUMULATIVE_ARGS local_cum;
13762 int gr_saved = cfun->va_list_gpr_size;
13763 int vr_saved = cfun->va_list_fpr_size;
13764
13765 /* The caller has advanced CUM up to, but not beyond, the last named
13766 argument. Advance a local copy of CUM past the last "real" named
13767 argument, to find out how many registers are left over. */
13768 local_cum = *cum;
13769   aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
13770
13771   /* Find out how many registers we need to save.
13772      Honor the tree-stdarg analysis results.  */
13773 if (cfun->va_list_gpr_size)
13774 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13775 cfun->va_list_gpr_size / UNITS_PER_WORD);
13776 if (cfun->va_list_fpr_size)
13777 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13778 cfun->va_list_fpr_size / UNITS_PER_VREG);
13779
13780 if (!TARGET_FLOAT)
13781 {
13782 gcc_assert (local_cum.aapcs_nvrn == 0);
13783 vr_saved = 0;
13784 }
13785
13786 if (!no_rtl)
13787 {
13788 if (gr_saved > 0)
13789 {
13790 rtx ptr, mem;
13791
13792 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13793 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13794 - gr_saved * UNITS_PER_WORD);
13795 mem = gen_frame_mem (BLKmode, ptr);
13796 set_mem_alias_set (mem, get_varargs_alias_set ());
13797
13798 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13799 mem, gr_saved);
13800 }
13801 if (vr_saved > 0)
13802 {
13803 /* We can't use move_block_from_reg, because it will use
13804 the wrong mode, storing D regs only. */
13805 machine_mode mode = TImode;
13806 int off, i, vr_start;
13807
13808 /* Set OFF to the offset from virtual_incoming_args_rtx of
13809 the first vector register. The VR save area lies below
13810 the GR one, and is aligned to 16 bytes. */
13811 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13812 STACK_BOUNDARY / BITS_PER_UNIT);
13813 off -= vr_saved * UNITS_PER_VREG;
13814
13815 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13816 for (i = 0; i < vr_saved; ++i)
13817 {
13818 rtx ptr, mem;
13819
13820 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13821 mem = gen_frame_mem (mode, ptr);
13822 set_mem_alias_set (mem, get_varargs_alias_set ());
13823 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13824 off += UNITS_PER_VREG;
13825 }
13826 }
13827 }
13828
13829 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13830 any complication of having crtl->args.pretend_args_size changed. */
13831 cfun->machine->frame.saved_varargs_size
13832 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13833 STACK_BOUNDARY / BITS_PER_UNIT)
13834 + vr_saved * UNITS_PER_VREG);
13835 }
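/* Editorial note, not part of the original source: the layout produced
   above places the GR save area immediately below
   virtual_incoming_args_rtx and the VR save area below that, 16-byte
   aligned, which matches the __gr_top / __vr_top values set up by
   aarch64_expand_builtin_va_start.  */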
13836
13837 static void
13838 aarch64_conditional_register_usage (void)
13839 {
13840 int i;
13841 if (!TARGET_FLOAT)
13842 {
13843 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13844 {
13845 fixed_regs[i] = 1;
13846 call_used_regs[i] = 1;
13847 }
13848 }
13849 if (!TARGET_SVE)
13850 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13851 {
13852 fixed_regs[i] = 1;
13853 call_used_regs[i] = 1;
13854 }
13855
13856 /* When tracking speculation, we need a couple of call-clobbered registers
13857 to track the speculation state. It would be nice to just use
13858 IP0 and IP1, but currently there are numerous places that just
13859 assume these registers are free for other uses (eg pointer
13860 authentication). */
13861 if (aarch64_track_speculation)
13862 {
13863 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13864 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13865 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13866 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13867 }
13868 }
13869
13870 /* Walk down the type tree of TYPE counting consecutive base elements.
13871 If *MODEP is VOIDmode, then set it to the first valid floating point
13872 type. If a non-floating point type is found, or if a floating point
13873 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13874 otherwise return the count in the sub-tree. */
13875 static int
13876 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13877 {
13878 machine_mode mode;
13879 HOST_WIDE_INT size;
13880
13881 switch (TREE_CODE (type))
13882 {
13883 case REAL_TYPE:
13884 mode = TYPE_MODE (type);
13885 if (mode != DFmode && mode != SFmode
13886 && mode != TFmode && mode != HFmode)
13887 return -1;
13888
13889 if (*modep == VOIDmode)
13890 *modep = mode;
13891
13892 if (*modep == mode)
13893 return 1;
13894
13895 break;
13896
13897 case COMPLEX_TYPE:
13898 mode = TYPE_MODE (TREE_TYPE (type));
13899 if (mode != DFmode && mode != SFmode
13900 && mode != TFmode && mode != HFmode)
13901 return -1;
13902
13903 if (*modep == VOIDmode)
13904 *modep = mode;
13905
13906 if (*modep == mode)
13907 return 2;
13908
13909 break;
13910
13911 case VECTOR_TYPE:
13912 /* Use V2SImode and V4SImode as representatives of all 64-bit
13913 and 128-bit vector types. */
13914 size = int_size_in_bytes (type);
13915 switch (size)
13916 {
13917 case 8:
13918 mode = V2SImode;
13919 break;
13920 case 16:
13921 mode = V4SImode;
13922 break;
13923 default:
13924 return -1;
13925 }
13926
13927 if (*modep == VOIDmode)
13928 *modep = mode;
13929
13930 /* Vector modes are considered to be opaque: two vectors are
13931 equivalent for the purposes of being homogeneous aggregates
13932 if they are the same size. */
13933 if (*modep == mode)
13934 return 1;
13935
13936 break;
13937
13938 case ARRAY_TYPE:
13939 {
13940 int count;
13941 tree index = TYPE_DOMAIN (type);
13942
13943 /* Can't handle incomplete types nor sizes that are not
13944 fixed. */
13945 if (!COMPLETE_TYPE_P (type)
13946 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13947 return -1;
13948
13949 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13950 if (count == -1
13951 || !index
13952 || !TYPE_MAX_VALUE (index)
13953 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13954 || !TYPE_MIN_VALUE (index)
13955 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13956 || count < 0)
13957 return -1;
13958
13959 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13960 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13961
13962 /* There must be no padding. */
13963 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13964 count * GET_MODE_BITSIZE (*modep)))
13965 return -1;
13966
13967 return count;
13968 }
13969
13970 case RECORD_TYPE:
13971 {
13972 int count = 0;
13973 int sub_count;
13974 tree field;
13975
13976 /* Can't handle incomplete types nor sizes that are not
13977 fixed. */
13978 if (!COMPLETE_TYPE_P (type)
13979 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13980 return -1;
13981
13982 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13983 {
13984 if (TREE_CODE (field) != FIELD_DECL)
13985 continue;
13986
13987 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13988 if (sub_count < 0)
13989 return -1;
13990 count += sub_count;
13991 }
13992
13993 /* There must be no padding. */
13994 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13995 count * GET_MODE_BITSIZE (*modep)))
13996 return -1;
13997
13998 return count;
13999 }
14000
14001 case UNION_TYPE:
14002 case QUAL_UNION_TYPE:
14003 {
14004 /* These aren't very interesting except in a degenerate case. */
14005 int count = 0;
14006 int sub_count;
14007 tree field;
14008
14009 /* Can't handle incomplete types nor sizes that are not
14010 fixed. */
14011 if (!COMPLETE_TYPE_P (type)
14012 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14013 return -1;
14014
14015 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14016 {
14017 if (TREE_CODE (field) != FIELD_DECL)
14018 continue;
14019
14020 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14021 if (sub_count < 0)
14022 return -1;
14023 count = count > sub_count ? count : sub_count;
14024 }
14025
14026 /* There must be no padding. */
14027 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14028 count * GET_MODE_BITSIZE (*modep)))
14029 return -1;
14030
14031 return count;
14032 }
14033
14034 default:
14035 break;
14036 }
14037
14038 return -1;
14039 }
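/* Editorial sketch, not part of the original source: for example, for

     struct rgb { float r, g, b; };

   the walk above finds three consecutive SFmode elements with no padding
   and returns 3, so the struct is a homogeneous floating-point aggregate;
   struct { float f; double d; } returns -1 because the element modes
   differ.  The type names are illustrative.  */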
14040
14041 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14042 type as described in AAPCS64 \S 4.1.2.
14043
14044 See the comment above aarch64_composite_type_p for the notes on MODE. */
14045
14046 static bool
14047 aarch64_short_vector_p (const_tree type,
14048 machine_mode mode)
14049 {
14050 poly_int64 size = -1;
14051
14052 if (type && TREE_CODE (type) == VECTOR_TYPE)
14053 size = int_size_in_bytes (type);
14054 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14055 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14056 size = GET_MODE_SIZE (mode);
14057
14058 return known_eq (size, 8) || known_eq (size, 16);
14059 }
14060
14061 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14062 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14063 array types. The C99 floating-point complex types are also considered
14064 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14065 types, which are GCC extensions and out of the scope of AAPCS64, are
14066 treated as composite types here as well.
14067
14068 Note that MODE itself is not sufficient in determining whether a type
14069 is such a composite type or not. This is because
14070 stor-layout.c:compute_record_mode may have already changed the MODE
14071 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14072 structure with only one field may have its MODE set to the mode of the
14073 field. Also an integer mode whose size matches the size of the
14074 RECORD_TYPE type may be used to substitute the original mode
14075 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14076 solely relied on. */
14077
14078 static bool
14079 aarch64_composite_type_p (const_tree type,
14080 machine_mode mode)
14081 {
14082 if (aarch64_short_vector_p (type, mode))
14083 return false;
14084
14085 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14086 return true;
14087
14088 if (mode == BLKmode
14089 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14090 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14091 return true;
14092
14093 return false;
14094 }
14095
14096 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14097 shall be passed or returned in simd/fp register(s) (providing these
14098 parameter passing registers are available).
14099
14100 Upon successful return, *COUNT returns the number of needed registers,
14101    *BASE_MODE returns the mode of the individual register and when IS_HA
14102 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14103 floating-point aggregate or a homogeneous short-vector aggregate. */
14104
14105 static bool
14106 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14107 const_tree type,
14108 machine_mode *base_mode,
14109 int *count,
14110 bool *is_ha)
14111 {
14112 machine_mode new_mode = VOIDmode;
14113 bool composite_p = aarch64_composite_type_p (type, mode);
14114
14115 if (is_ha != NULL) *is_ha = false;
14116
14117 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14118 || aarch64_short_vector_p (type, mode))
14119 {
14120 *count = 1;
14121 new_mode = mode;
14122 }
14123 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14124 {
14125 if (is_ha != NULL) *is_ha = true;
14126 *count = 2;
14127 new_mode = GET_MODE_INNER (mode);
14128 }
14129 else if (type && composite_p)
14130 {
14131 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14132
14133 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14134 {
14135 if (is_ha != NULL) *is_ha = true;
14136 *count = ag_count;
14137 }
14138 else
14139 return false;
14140 }
14141 else
14142 return false;
14143
14144 *base_mode = new_mode;
14145 return true;
14146 }
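/* Editorial note, not part of the original source: for instance, a
   _Complex double argument comes back with *count == 2 and
   *base_mode == DFmode, and the struct rgb sketch above with *count == 3
   and *base_mode == SFmode, so both are candidates for being passed in
   consecutive FP/SIMD registers when enough of them are available.  */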
14147
14148 /* Implement TARGET_STRUCT_VALUE_RTX. */
14149
14150 static rtx
14151 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14152 int incoming ATTRIBUTE_UNUSED)
14153 {
14154 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14155 }
14156
14157 /* Implements target hook vector_mode_supported_p. */
14158 static bool
14159 aarch64_vector_mode_supported_p (machine_mode mode)
14160 {
14161 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14162 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14163 }
14164
14165 /* Return the appropriate SIMD container mode
14166 for MODE within a vector of WIDTH bits. */
14167 static machine_mode
14168 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14169 {
14170 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14171 switch (mode)
14172 {
14173 case E_DFmode:
14174 return VNx2DFmode;
14175 case E_SFmode:
14176 return VNx4SFmode;
14177 case E_HFmode:
14178 return VNx8HFmode;
14179 case E_DImode:
14180 return VNx2DImode;
14181 case E_SImode:
14182 return VNx4SImode;
14183 case E_HImode:
14184 return VNx8HImode;
14185 case E_QImode:
14186 return VNx16QImode;
14187 default:
14188 return word_mode;
14189 }
14190
14191 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14192 if (TARGET_SIMD)
14193 {
14194 if (known_eq (width, 128))
14195 switch (mode)
14196 {
14197 case E_DFmode:
14198 return V2DFmode;
14199 case E_SFmode:
14200 return V4SFmode;
14201 case E_HFmode:
14202 return V8HFmode;
14203 case E_SImode:
14204 return V4SImode;
14205 case E_HImode:
14206 return V8HImode;
14207 case E_QImode:
14208 return V16QImode;
14209 case E_DImode:
14210 return V2DImode;
14211 default:
14212 break;
14213 }
14214 else
14215 switch (mode)
14216 {
14217 case E_SFmode:
14218 return V2SFmode;
14219 case E_HFmode:
14220 return V4HFmode;
14221 case E_SImode:
14222 return V2SImode;
14223 case E_HImode:
14224 return V4HImode;
14225 case E_QImode:
14226 return V8QImode;
14227 default:
14228 break;
14229 }
14230 }
14231 return word_mode;
14232 }
14233
14234 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14235 static machine_mode
14236 aarch64_preferred_simd_mode (scalar_mode mode)
14237 {
14238 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14239 return aarch64_simd_container_mode (mode, bits);
14240 }
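/* Editorial note, not part of the original source: for example, with only
   Advanced SIMD enabled the preferred container for SFmode is V4SFmode
   (a 128-bit vector of four floats), while with SVE it becomes the
   variable-length VNx4SFmode.  */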
14241
14242 /* Return a list of possible vector sizes for the vectorizer
14243 to iterate over. */
14244 static void
14245 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14246 {
14247 if (TARGET_SVE)
14248 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14249 sizes->safe_push (16);
14250 sizes->safe_push (8);
14251 }
14252
14253 /* Implement TARGET_MANGLE_TYPE. */
14254
14255 static const char *
14256 aarch64_mangle_type (const_tree type)
14257 {
14258 /* The AArch64 ABI documents say that "__va_list" has to be
14259 mangled as if it is in the "std" namespace. */
14260 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14261 return "St9__va_list";
14262
14263 /* Half-precision float. */
14264 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14265 return "Dh";
14266
14267 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14268 builtin types. */
14269 if (TYPE_NAME (type) != NULL)
14270 return aarch64_mangle_builtin_type (type);
14271
14272 /* Use the default mangling. */
14273 return NULL;
14274 }
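/* Editorial sketch, not part of the original source: under the rules above,
   a C++ function void f (__va_list) mangles its parameter as St9__va_list,
   and void g (__fp16) mangles it as Dh, giving roughly _Z1gDh; other types
   fall back to the default mangling.  The function names are illustrative.  */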
14275
14276 /* Find the first rtx_insn before INSN that will generate an assembly
14277 instruction. */
14278
14279 static rtx_insn *
14280 aarch64_prev_real_insn (rtx_insn *insn)
14281 {
14282 if (!insn)
14283 return NULL;
14284
14285 do
14286 {
14287 insn = prev_real_insn (insn);
14288 }
14289 while (insn && recog_memoized (insn) < 0);
14290
14291 return insn;
14292 }
14293
14294 static bool
14295 is_madd_op (enum attr_type t1)
14296 {
14297 unsigned int i;
14298 /* A number of these may be AArch32 only. */
14299 enum attr_type mlatypes[] = {
14300 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14301 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14302 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14303 };
14304
14305 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14306 {
14307 if (t1 == mlatypes[i])
14308 return true;
14309 }
14310
14311 return false;
14312 }
14313
14314 /* Check if there is a register dependency between a load and the insn
14315 for which we hold recog_data. */
14316
14317 static bool
14318 dep_between_memop_and_curr (rtx memop)
14319 {
14320 rtx load_reg;
14321 int opno;
14322
14323 gcc_assert (GET_CODE (memop) == SET);
14324
14325 if (!REG_P (SET_DEST (memop)))
14326 return false;
14327
14328 load_reg = SET_DEST (memop);
14329 for (opno = 1; opno < recog_data.n_operands; opno++)
14330 {
14331 rtx operand = recog_data.operand[opno];
14332 if (REG_P (operand)
14333 && reg_overlap_mentioned_p (load_reg, operand))
14334 return true;
14335
14336 }
14337 return false;
14338 }
14339
14340
14341 /* When working around the Cortex-A53 erratum 835769,
14342 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14343 instruction and has a preceding memory instruction such that a NOP
14344 should be inserted between them. */
14345
14346 bool
14347 aarch64_madd_needs_nop (rtx_insn* insn)
14348 {
14349 enum attr_type attr_type;
14350 rtx_insn *prev;
14351 rtx body;
14352
14353 if (!TARGET_FIX_ERR_A53_835769)
14354 return false;
14355
14356 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14357 return false;
14358
14359 attr_type = get_attr_type (insn);
14360 if (!is_madd_op (attr_type))
14361 return false;
14362
14363 prev = aarch64_prev_real_insn (insn);
14364 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14365 Restore recog state to INSN to avoid state corruption. */
14366 extract_constrain_insn_cached (insn);
14367
14368 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14369 return false;
14370
14371 body = single_set (prev);
14372
14373 /* If the previous insn is a memory op and there is no dependency between
14374 it and the DImode madd, emit a NOP between them. If body is NULL then we
14375 have a complex memory operation, probably a load/store pair.
14376 Be conservative for now and emit a NOP. */
14377 if (GET_MODE (recog_data.operand[0]) == DImode
14378 && (!body || !dep_between_memop_and_curr (body)))
14379 return true;
14380
14381 return false;
14382
14383 }
14384
14385
14386 /* Implement FINAL_PRESCAN_INSN. */
14387
14388 void
14389 aarch64_final_prescan_insn (rtx_insn *insn)
14390 {
14391 if (aarch64_madd_needs_nop (insn))
14392 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14393 }
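/* Illustrative effect of the workaround above (assuming a scheduled sequence
   in which a 64-bit multiply-accumulate directly follows a memory access):

       ldr  x1, [x2]
       madd x0, x3, x4, x5

   is emitted as

       ldr  x1, [x2]
       nop // between mem op and mult-accumulate
       madd x0, x3, x4, x5  */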
14394
14395
14396 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14397 instruction. */
14398
14399 bool
14400 aarch64_sve_index_immediate_p (rtx base_or_step)
14401 {
14402 return (CONST_INT_P (base_or_step)
14403 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14404 }
14405
14406 /* Return true if X is a valid immediate for the SVE ADD and SUB
14407 instructions. Negate X first if NEGATE_P is true. */
14408
14409 bool
14410 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14411 {
14412 rtx elt;
14413
14414 if (!const_vec_duplicate_p (x, &elt)
14415 || !CONST_INT_P (elt))
14416 return false;
14417
14418 HOST_WIDE_INT val = INTVAL (elt);
14419 if (negate_p)
14420 val = -val;
14421 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14422
14423 if (val & 0xff)
14424 return IN_RANGE (val, 0, 0xff);
14425 return IN_RANGE (val, 0, 0xff00);
14426 }
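/* For example, a vector of repeated 16s or 4096s is accepted (an unsigned
   8-bit immediate, optionally shifted left by 8), whereas a vector of
   repeated 0x101s is rejected because its low byte is nonzero and the value
   exceeds 0xff.  */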
14427
14428 /* Return true if X is a valid immediate operand for an SVE logical
14429 instruction such as AND. */
14430
14431 bool
14432 aarch64_sve_bitmask_immediate_p (rtx x)
14433 {
14434 rtx elt;
14435
14436 return (const_vec_duplicate_p (x, &elt)
14437 && CONST_INT_P (elt)
14438 && aarch64_bitmask_imm (INTVAL (elt),
14439 GET_MODE_INNER (GET_MODE (x))));
14440 }
14441
14442 /* Return true if X is a valid immediate for the SVE DUP and CPY
14443 instructions. */
14444
14445 bool
14446 aarch64_sve_dup_immediate_p (rtx x)
14447 {
14448 rtx elt;
14449
14450 if (!const_vec_duplicate_p (x, &elt)
14451 || !CONST_INT_P (elt))
14452 return false;
14453
14454 HOST_WIDE_INT val = INTVAL (elt);
14455 if (val & 0xff)
14456 return IN_RANGE (val, -0x80, 0x7f);
14457 return IN_RANGE (val, -0x8000, 0x7f00);
14458 }
14459
14460 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14461 SIGNED_P says whether the operand is signed rather than unsigned. */
14462
14463 bool
14464 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14465 {
14466 rtx elt;
14467
14468 return (const_vec_duplicate_p (x, &elt)
14469 && CONST_INT_P (elt)
14470 && (signed_p
14471 ? IN_RANGE (INTVAL (elt), -16, 15)
14472 : IN_RANGE (INTVAL (elt), 0, 127)));
14473 }
14474
14475 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14476 instruction. Negate X first if NEGATE_P is true. */
14477
14478 bool
14479 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14480 {
14481 rtx elt;
14482 REAL_VALUE_TYPE r;
14483
14484 if (!const_vec_duplicate_p (x, &elt)
14485 || GET_CODE (elt) != CONST_DOUBLE)
14486 return false;
14487
14488 r = *CONST_DOUBLE_REAL_VALUE (elt);
14489
14490 if (negate_p)
14491 r = real_value_negate (&r);
14492
14493 if (real_equal (&r, &dconst1))
14494 return true;
14495 if (real_equal (&r, &dconsthalf))
14496 return true;
14497 return false;
14498 }
14499
14500 /* Return true if X is a valid immediate operand for an SVE FMUL
14501 instruction. */
14502
14503 bool
14504 aarch64_sve_float_mul_immediate_p (rtx x)
14505 {
14506 rtx elt;
14507
14508 /* GCC will never generate a multiply with an immediate of 2, so there is no
14509 point testing for it (even though it is a valid constant). */
14510 return (const_vec_duplicate_p (x, &elt)
14511 && GET_CODE (elt) == CONST_DOUBLE
14512 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14513 }
14514
14515 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14516 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14517 is nonnull, use it to describe valid immediates. */
14518 static bool
14519 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14520 simd_immediate_info *info,
14521 enum simd_immediate_check which,
14522 simd_immediate_info::insn_type insn)
14523 {
14524 /* Try a 4-byte immediate with LSL. */
14525 for (unsigned int shift = 0; shift < 32; shift += 8)
14526 if ((val32 & (0xff << shift)) == val32)
14527 {
14528 if (info)
14529 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14530 simd_immediate_info::LSL, shift);
14531 return true;
14532 }
14533
14534 /* Try a 2-byte immediate with LSL. */
14535 unsigned int imm16 = val32 & 0xffff;
14536 if (imm16 == (val32 >> 16))
14537 for (unsigned int shift = 0; shift < 16; shift += 8)
14538 if ((imm16 & (0xff << shift)) == imm16)
14539 {
14540 if (info)
14541 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14542 simd_immediate_info::LSL, shift);
14543 return true;
14544 }
14545
14546 /* Try a 4-byte immediate with MSL, except for cases that MVN
14547 can handle. */
14548 if (which == AARCH64_CHECK_MOV)
14549 for (unsigned int shift = 8; shift < 24; shift += 8)
14550 {
14551 unsigned int low = (1 << shift) - 1;
14552 if (((val32 & (0xff << shift)) | low) == val32)
14553 {
14554 if (info)
14555 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14556 simd_immediate_info::MSL, shift);
14557 return true;
14558 }
14559 }
14560
14561 return false;
14562 }
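/* Worked examples for the checks above: 0x00ab0000 is matched by the first
   loop as (0xab, LSL #16); 0x00120012, a repeating 16-bit pattern, is matched
   in HImode as (0x12, LSL #0); and 0x0012ffff is matched as (0x12, MSL #16),
   since MSL shifts ones in from the right.  */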
14563
14564 /* Return true if replicating VAL64 is a valid immediate for the
14565 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14566 use it to describe valid immediates. */
14567 static bool
14568 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14569 simd_immediate_info *info,
14570 enum simd_immediate_check which)
14571 {
14572 unsigned int val32 = val64 & 0xffffffff;
14573 unsigned int val16 = val64 & 0xffff;
14574 unsigned int val8 = val64 & 0xff;
14575
14576 if (val32 == (val64 >> 32))
14577 {
14578 if ((which & AARCH64_CHECK_ORR) != 0
14579 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14580 simd_immediate_info::MOV))
14581 return true;
14582
14583 if ((which & AARCH64_CHECK_BIC) != 0
14584 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14585 simd_immediate_info::MVN))
14586 return true;
14587
14588 /* Try using a replicated byte. */
14589 if (which == AARCH64_CHECK_MOV
14590 && val16 == (val32 >> 16)
14591 && val8 == (val16 >> 8))
14592 {
14593 if (info)
14594 *info = simd_immediate_info (QImode, val8);
14595 return true;
14596 }
14597 }
14598
14599 /* Try using a bit-to-bytemask. */
14600 if (which == AARCH64_CHECK_MOV)
14601 {
14602 unsigned int i;
14603 for (i = 0; i < 64; i += 8)
14604 {
14605 unsigned char byte = (val64 >> i) & 0xff;
14606 if (byte != 0 && byte != 0xff)
14607 break;
14608 }
14609 if (i == 64)
14610 {
14611 if (info)
14612 *info = simd_immediate_info (DImode, val64);
14613 return true;
14614 }
14615 }
14616 return false;
14617 }
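/* For example, 0x00ff00ffff0000ff passes the byte loop above (every byte is
   0x00 or 0xff) and so can be loaded with a single 64-bit MOVI.  */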
14618
14619 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14620 instruction. If INFO is nonnull, use it to describe valid immediates. */
14621
14622 static bool
14623 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14624 simd_immediate_info *info)
14625 {
14626 scalar_int_mode mode = DImode;
14627 unsigned int val32 = val64 & 0xffffffff;
14628 if (val32 == (val64 >> 32))
14629 {
14630 mode = SImode;
14631 unsigned int val16 = val32 & 0xffff;
14632 if (val16 == (val32 >> 16))
14633 {
14634 mode = HImode;
14635 unsigned int val8 = val16 & 0xff;
14636 if (val8 == (val16 >> 8))
14637 mode = QImode;
14638 }
14639 }
14640 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14641 if (IN_RANGE (val, -0x80, 0x7f))
14642 {
14643 /* DUP with no shift. */
14644 if (info)
14645 *info = simd_immediate_info (mode, val);
14646 return true;
14647 }
14648 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14649 {
14650 /* DUP with LSL #8. */
14651 if (info)
14652 *info = simd_immediate_info (mode, val);
14653 return true;
14654 }
14655 if (aarch64_bitmask_imm (val64, mode))
14656 {
14657 /* DUPM. */
14658 if (info)
14659 *info = simd_immediate_info (mode, val);
14660 return true;
14661 }
14662 return false;
14663 }
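/* For example, a vector of QImode 1s gives VAL64 == 0x0101010101010101, which
   collapses to (QImode, 1) and is matched by the unshifted DUP case, while a
   replicated HImode 0x0300 falls through to the DUP-with-LSL-#8 case.  */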
14664
14665 /* Return true if OP is a valid SIMD immediate for the operation
14666 described by WHICH. If INFO is nonnull, use it to describe valid
14667 immediates. */
14668 bool
14669 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14670 enum simd_immediate_check which)
14671 {
14672 machine_mode mode = GET_MODE (op);
14673 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14674 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14675 return false;
14676
14677 scalar_mode elt_mode = GET_MODE_INNER (mode);
14678 rtx base, step;
14679 unsigned int n_elts;
14680 if (GET_CODE (op) == CONST_VECTOR
14681 && CONST_VECTOR_DUPLICATE_P (op))
14682 n_elts = CONST_VECTOR_NPATTERNS (op);
14683 else if ((vec_flags & VEC_SVE_DATA)
14684 && const_vec_series_p (op, &base, &step))
14685 {
14686 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14687 if (!aarch64_sve_index_immediate_p (base)
14688 || !aarch64_sve_index_immediate_p (step))
14689 return false;
14690
14691 if (info)
14692 *info = simd_immediate_info (elt_mode, base, step);
14693 return true;
14694 }
14695 else if (GET_CODE (op) == CONST_VECTOR
14696 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14697 /* N_ELTS set above. */;
14698 else
14699 return false;
14700
14701 /* Handle PFALSE and PTRUE. */
14702 if (vec_flags & VEC_SVE_PRED)
14703 return (op == CONST0_RTX (mode)
14704 || op == CONSTM1_RTX (mode));
14705
14706 scalar_float_mode elt_float_mode;
14707 if (n_elts == 1
14708 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14709 {
14710 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14711 if (aarch64_float_const_zero_rtx_p (elt)
14712 || aarch64_float_const_representable_p (elt))
14713 {
14714 if (info)
14715 *info = simd_immediate_info (elt_float_mode, elt);
14716 return true;
14717 }
14718 }
14719
14720 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14721 if (elt_size > 8)
14722 return false;
14723
14724 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14725
14726 /* Expand the vector constant out into a byte vector, with the least
14727 significant byte of the register first. */
14728 auto_vec<unsigned char, 16> bytes;
14729 bytes.reserve (n_elts * elt_size);
14730 for (unsigned int i = 0; i < n_elts; i++)
14731 {
14732 /* The vector is provided in gcc endian-neutral fashion.
14733 For aarch64_be Advanced SIMD, it must be laid out in the vector
14734 register in reverse order. */
14735 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14736 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14737
14738 if (elt_mode != elt_int_mode)
14739 elt = gen_lowpart (elt_int_mode, elt);
14740
14741 if (!CONST_INT_P (elt))
14742 return false;
14743
14744 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14745 for (unsigned int byte = 0; byte < elt_size; byte++)
14746 {
14747 bytes.quick_push (elt_val & 0xff);
14748 elt_val >>= BITS_PER_UNIT;
14749 }
14750 }
14751
14752 /* The immediate must repeat every eight bytes. */
14753 unsigned int nbytes = bytes.length ();
14754 for (unsigned i = 8; i < nbytes; ++i)
14755 if (bytes[i] != bytes[i - 8])
14756 return false;
14757
14758 /* Get the repeating 8-byte value as an integer. No endian correction
14759 is needed here because bytes is already in lsb-first order. */
14760 unsigned HOST_WIDE_INT val64 = 0;
14761 for (unsigned int i = 0; i < 8; i++)
14762 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14763 << (i * BITS_PER_UNIT));
14764
14765 if (vec_flags & VEC_SVE_DATA)
14766 return aarch64_sve_valid_immediate (val64, info);
14767 else
14768 return aarch64_advsimd_valid_immediate (val64, info, which);
14769 }
14770
14771 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14772 has a step in the range of INDEX. Return the index expression if so,
14773 otherwise return null. */
14774 rtx
14775 aarch64_check_zero_based_sve_index_immediate (rtx x)
14776 {
14777 rtx base, step;
14778 if (const_vec_series_p (x, &base, &step)
14779 && base == const0_rtx
14780 && aarch64_sve_index_immediate_p (step))
14781 return step;
14782 return NULL_RTX;
14783 }
14784
14785 /* Check whether immediate shift constants are within range. */
14786 bool
14787 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14788 {
14789 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14790 if (left)
14791 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14792 else
14793 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14794 }
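/* For example, for V4SImode the accepted shift amounts are 0-31 for left
   shifts and 1-32 for right shifts.  */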
14795
14796 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14797 operation of width WIDTH at bit position POS. */
14798
14799 rtx
14800 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14801 {
14802 gcc_assert (CONST_INT_P (width));
14803 gcc_assert (CONST_INT_P (pos));
14804
14805 unsigned HOST_WIDE_INT mask
14806 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14807 return GEN_INT (mask << UINTVAL (pos));
14808 }
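/* For example, WIDTH == 8 and POS == 16 give the mask 0x00ff0000.  */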
14809
14810 bool
14811 aarch64_mov_operand_p (rtx x, machine_mode mode)
14812 {
14813 if (GET_CODE (x) == HIGH
14814 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14815 return true;
14816
14817 if (CONST_INT_P (x))
14818 return true;
14819
14820 if (VECTOR_MODE_P (GET_MODE (x)))
14821 return aarch64_simd_valid_immediate (x, NULL);
14822
14823 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14824 return true;
14825
14826 if (aarch64_sve_cnt_immediate_p (x))
14827 return true;
14828
14829 return aarch64_classify_symbolic_expression (x)
14830 == SYMBOL_TINY_ABSOLUTE;
14831 }
14832
14833 /* Return a const_int vector of VAL. */
14834 rtx
14835 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14836 {
14837 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14838 return gen_const_vec_duplicate (mode, c);
14839 }
14840
14841 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14842
14843 bool
14844 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14845 {
14846 machine_mode vmode;
14847
14848 vmode = aarch64_simd_container_mode (mode, 64);
14849 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14850 return aarch64_simd_valid_immediate (op_v, NULL);
14851 }
14852
14853 /* Construct and return a PARALLEL RTX vector with elements numbering the
14854 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14855 the vector - from the perspective of the architecture. This does not
14856 line up with GCC's perspective on lane numbers, so we end up with
14857 different masks depending on our target endian-ness. The diagram
14858 below may help. We must draw the distinction when building masks
14859 which select one half of the vector. An instruction selecting
14860 architectural low-lanes for a big-endian target must be described using
14861 a mask selecting GCC high-lanes.
14862
14863 Big-Endian Little-Endian
14864
14865 GCC 0 1 2 3 3 2 1 0
14866 | x | x | x | x | | x | x | x | x |
14867 Architecture 3 2 1 0 3 2 1 0
14868
14869 Low Mask: { 2, 3 } { 0, 1 }
14870 High Mask: { 0, 1 } { 2, 3 }
14871
14872 MODE is the mode of the vector and NUNITS is the number of units in it. */
14873
14874 rtx
14875 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14876 {
14877 rtvec v = rtvec_alloc (nunits / 2);
14878 int high_base = nunits / 2;
14879 int low_base = 0;
14880 int base;
14881 rtx t1;
14882 int i;
14883
14884 if (BYTES_BIG_ENDIAN)
14885 base = high ? low_base : high_base;
14886 else
14887 base = high ? high_base : low_base;
14888
14889 for (i = 0; i < nunits / 2; i++)
14890 RTVEC_ELT (v, i) = GEN_INT (base + i);
14891
14892 t1 = gen_rtx_PARALLEL (mode, v);
14893 return t1;
14894 }
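/* For example, for V4SImode with HIGH true this returns (parallel [2 3]) on
   little-endian and (parallel [0 1]) on big-endian, matching the diagram
   above.  */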
14895
14896 /* Check OP for validity as a PARALLEL RTX vector with elements
14897 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14898 from the perspective of the architecture. See the diagram above
14899 aarch64_simd_vect_par_cnst_half for more details. */
14900
14901 bool
14902 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14903 bool high)
14904 {
14905 int nelts;
14906 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14907 return false;
14908
14909 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14910 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14911 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14912 int i = 0;
14913
14914 if (count_op != count_ideal)
14915 return false;
14916
14917 for (i = 0; i < count_ideal; i++)
14918 {
14919 rtx elt_op = XVECEXP (op, 0, i);
14920 rtx elt_ideal = XVECEXP (ideal, 0, i);
14921
14922 if (!CONST_INT_P (elt_op)
14923 || INTVAL (elt_ideal) != INTVAL (elt_op))
14924 return false;
14925 }
14926 return true;
14927 }
14928
14929 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14930 HIGH (exclusive). */
14931 void
14932 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14933 const_tree exp)
14934 {
14935 HOST_WIDE_INT lane;
14936 gcc_assert (CONST_INT_P (operand));
14937 lane = INTVAL (operand);
14938
14939 if (lane < low || lane >= high)
14940 {
14941 if (exp)
14942 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14943 else
14944 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14945 }
14946 }
14947
14948 /* Perform endian correction on lane number N, which indexes a vector
14949 of mode MODE, and return the result as an SImode rtx. */
14950
14951 rtx
14952 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14953 {
14954 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14955 }
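/* For example, GCC lane 0 of a V4SImode vector corresponds to architectural
   lane 3 on big-endian and to lane 0 on little-endian (see the diagram above
   aarch64_simd_vect_par_cnst_half).  */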
14956
14957 /* Return TRUE if OP is a valid vector addressing mode. */
14958
14959 bool
14960 aarch64_simd_mem_operand_p (rtx op)
14961 {
14962 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14963 || REG_P (XEXP (op, 0)));
14964 }
14965
14966 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14967
14968 bool
14969 aarch64_sve_ld1r_operand_p (rtx op)
14970 {
14971 struct aarch64_address_info addr;
14972 scalar_mode mode;
14973
14974 return (MEM_P (op)
14975 && is_a <scalar_mode> (GET_MODE (op), &mode)
14976 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14977 && addr.type == ADDRESS_REG_IMM
14978 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14979 }
14980
14981 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14982 The conditions for STR are the same. */
14983 bool
14984 aarch64_sve_ldr_operand_p (rtx op)
14985 {
14986 struct aarch64_address_info addr;
14987
14988 return (MEM_P (op)
14989 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14990 false, ADDR_QUERY_ANY)
14991 && addr.type == ADDRESS_REG_IMM);
14992 }
14993
14994 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14995 We need to be able to access the individual pieces, so the range
14996 is different from LD[234] and ST[234]. */
14997 bool
14998 aarch64_sve_struct_memory_operand_p (rtx op)
14999 {
15000 if (!MEM_P (op))
15001 return false;
15002
15003 machine_mode mode = GET_MODE (op);
15004 struct aarch64_address_info addr;
15005 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15006 ADDR_QUERY_ANY)
15007 || addr.type != ADDRESS_REG_IMM)
15008 return false;
15009
15010 poly_int64 first = addr.const_offset;
15011 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15012 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15013 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15014 }
15015
15016 /* Emit a register copy from operand to operand, taking care not to
15017 early-clobber source registers in the process.
15018
15019 COUNT is the number of components into which the copy needs to be
15020 decomposed. */
15021 void
15022 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15023 unsigned int count)
15024 {
15025 unsigned int i;
15026 int rdest = REGNO (operands[0]);
15027 int rsrc = REGNO (operands[1]);
15028
15029 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15030 || rdest < rsrc)
15031 for (i = 0; i < count; i++)
15032 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15033 gen_rtx_REG (mode, rsrc + i));
15034 else
15035 for (i = 0; i < count; i++)
15036 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15037 gen_rtx_REG (mode, rsrc + count - i - 1));
15038 }
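/* For example, copying a two-register value from V1-V2 to V2-V3 overlaps, so
   the second loop above copies V2->V3 before V1->V2; copying downwards
   (e.g. V2-V3 to V1-V2) uses the first loop and copies V2->V1 first.  */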
15039
15040 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15041 one of VSTRUCT modes: OI, CI, or XI. */
15042 int
15043 aarch64_simd_attr_length_rglist (machine_mode mode)
15044 {
15045 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15046 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15047 }
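/* For example, an OImode register list (two 128-bit vectors) gives
   2 * 4 == 8 bytes, i.e. two instructions.  */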
15048
15049 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15050 alignment of a vector to 128 bits. SVE predicates have an alignment of
15051 16 bits. */
15052 static HOST_WIDE_INT
15053 aarch64_simd_vector_alignment (const_tree type)
15054 {
15055 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15056 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15057 be set for non-predicate vectors of booleans. Modes are the most
15058 direct way we have of identifying real SVE predicate types. */
15059 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15060 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15061 }
15062
15063 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15064 static poly_uint64
15065 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15066 {
15067 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15068 {
15069 /* If the length of the vector is fixed, try to align to that length,
15070 otherwise don't try to align at all. */
15071 HOST_WIDE_INT result;
15072 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15073 result = TYPE_ALIGN (TREE_TYPE (type));
15074 return result;
15075 }
15076 return TYPE_ALIGN (type);
15077 }
15078
15079 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15080 static bool
15081 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15082 {
15083 if (is_packed)
15084 return false;
15085
15086 /* For fixed-length vectors, check that the vectorizer will aim for
15087 full-vector alignment. This isn't true for generic GCC vectors
15088 that are wider than the ABI maximum of 128 bits. */
15089 poly_uint64 preferred_alignment =
15090 aarch64_vectorize_preferred_vector_alignment (type);
15091 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15092 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15093 preferred_alignment))
15094 return false;
15095
15096 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15097 return true;
15098 }
15099
15100 /* Return true if the vector misalignment factor is supported by the
15101 target. */
15102 static bool
15103 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15104 const_tree type, int misalignment,
15105 bool is_packed)
15106 {
15107 if (TARGET_SIMD && STRICT_ALIGNMENT)
15108 {
15109 /* Return false if the movmisalign pattern is not supported for this mode. */
15110 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15111 return false;
15112
15113 /* Misalignment factor is unknown at compile time. */
15114 if (misalignment == -1)
15115 return false;
15116 }
15117 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15118 is_packed);
15119 }
15120
15121 /* If VALS is a vector constant that can be loaded into a register
15122 using DUP, generate instructions to do so and return an RTX to
15123 assign to the register. Otherwise return NULL_RTX. */
15124 static rtx
15125 aarch64_simd_dup_constant (rtx vals)
15126 {
15127 machine_mode mode = GET_MODE (vals);
15128 machine_mode inner_mode = GET_MODE_INNER (mode);
15129 rtx x;
15130
15131 if (!const_vec_duplicate_p (vals, &x))
15132 return NULL_RTX;
15133
15134 /* We can load this constant by using DUP and a constant in a
15135 single ARM register. This will be cheaper than a vector
15136 load. */
15137 x = copy_to_mode_reg (inner_mode, x);
15138 return gen_vec_duplicate (mode, x);
15139 }
15140
15141
15142 /* Generate code to load VALS, which is a PARALLEL containing only
15143 constants (for vec_init) or CONST_VECTOR, efficiently into a
15144 register. Returns an RTX to copy into the register, or NULL_RTX
15145 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15146 static rtx
15147 aarch64_simd_make_constant (rtx vals)
15148 {
15149 machine_mode mode = GET_MODE (vals);
15150 rtx const_dup;
15151 rtx const_vec = NULL_RTX;
15152 int n_const = 0;
15153 int i;
15154
15155 if (GET_CODE (vals) == CONST_VECTOR)
15156 const_vec = vals;
15157 else if (GET_CODE (vals) == PARALLEL)
15158 {
15159 /* A CONST_VECTOR must contain only CONST_INTs and
15160 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15161 Only store valid constants in a CONST_VECTOR. */
15162 int n_elts = XVECLEN (vals, 0);
15163 for (i = 0; i < n_elts; ++i)
15164 {
15165 rtx x = XVECEXP (vals, 0, i);
15166 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15167 n_const++;
15168 }
15169 if (n_const == n_elts)
15170 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15171 }
15172 else
15173 gcc_unreachable ();
15174
15175 if (const_vec != NULL_RTX
15176 && aarch64_simd_valid_immediate (const_vec, NULL))
15177 /* Load using MOVI/MVNI. */
15178 return const_vec;
15179 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15180 /* Loaded using DUP. */
15181 return const_dup;
15182 else if (const_vec != NULL_RTX)
15183 /* Load from constant pool. We cannot take advantage of single-cycle
15184 LD1 because we need a PC-relative addressing mode. */
15185 return const_vec;
15186 else
15187 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15188 We cannot construct an initializer. */
15189 return NULL_RTX;
15190 }
15191
15192 /* Expand a vector initialisation sequence, such that TARGET is
15193 initialised to contain VALS. */
15194
15195 void
15196 aarch64_expand_vector_init (rtx target, rtx vals)
15197 {
15198 machine_mode mode = GET_MODE (target);
15199 scalar_mode inner_mode = GET_MODE_INNER (mode);
15200 /* The number of vector elements. */
15201 int n_elts = XVECLEN (vals, 0);
15202 /* The number of vector elements which are not constant. */
15203 int n_var = 0;
15204 rtx any_const = NULL_RTX;
15205 /* The first element of vals. */
15206 rtx v0 = XVECEXP (vals, 0, 0);
15207 bool all_same = true;
15208
15209 /* This is a special vec_init<M><N> where N is not an element mode but a
15210 vector mode with half the elements of M. We expect to find two entries
15211 of mode N in VALS and we must put their concatenation into TARGET. */
15212 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
15213 {
15214 gcc_assert (known_eq (GET_MODE_SIZE (mode),
15215 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
15216 rtx lo = XVECEXP (vals, 0, 0);
15217 rtx hi = XVECEXP (vals, 0, 1);
15218 machine_mode narrow_mode = GET_MODE (lo);
15219 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
15220 gcc_assert (narrow_mode == GET_MODE (hi));
15221
15222 /* When we want to concatenate a half-width vector with zeroes we can
15223 use the aarch64_combinez[_be] patterns. Just make sure that the
15224 zeroes are in the right half. */
15225 if (BYTES_BIG_ENDIAN
15226 && aarch64_simd_imm_zero (lo, narrow_mode)
15227 && general_operand (hi, narrow_mode))
15228 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
15229 else if (!BYTES_BIG_ENDIAN
15230 && aarch64_simd_imm_zero (hi, narrow_mode)
15231 && general_operand (lo, narrow_mode))
15232 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
15233 else
15234 {
15235 /* Else create the two half-width registers and combine them. */
15236 if (!REG_P (lo))
15237 lo = force_reg (GET_MODE (lo), lo);
15238 if (!REG_P (hi))
15239 hi = force_reg (GET_MODE (hi), hi);
15240
15241 if (BYTES_BIG_ENDIAN)
15242 std::swap (lo, hi);
15243 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
15244 }
15245 return;
15246 }
15247
15248 /* Count the number of variable elements to initialise. */
15249 for (int i = 0; i < n_elts; ++i)
15250 {
15251 rtx x = XVECEXP (vals, 0, i);
15252 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15253 ++n_var;
15254 else
15255 any_const = x;
15256
15257 all_same &= rtx_equal_p (x, v0);
15258 }
15259
15260 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15261 how best to handle this. */
15262 if (n_var == 0)
15263 {
15264 rtx constant = aarch64_simd_make_constant (vals);
15265 if (constant != NULL_RTX)
15266 {
15267 emit_move_insn (target, constant);
15268 return;
15269 }
15270 }
15271
15272 /* Splat a single non-constant element if we can. */
15273 if (all_same)
15274 {
15275 rtx x = copy_to_mode_reg (inner_mode, v0);
15276 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15277 return;
15278 }
15279
15280 enum insn_code icode = optab_handler (vec_set_optab, mode);
15281 gcc_assert (icode != CODE_FOR_nothing);
15282
15283 /* If there are only variable elements, try to optimize
15284 the insertion using dup for the most common element
15285 followed by insertions. */
15286
15287 /* The algorithm will fill matches[*][0] with the earliest matching element,
15288 and matches[X][1] with the count of duplicate elements (if X is the
15289 earliest element which has duplicates). */
15290
15291 if (n_var == n_elts && n_elts <= 16)
15292 {
15293 int matches[16][2] = {0};
15294 for (int i = 0; i < n_elts; i++)
15295 {
15296 for (int j = 0; j <= i; j++)
15297 {
15298 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15299 {
15300 matches[i][0] = j;
15301 matches[j][1]++;
15302 break;
15303 }
15304 }
15305 }
15306 int maxelement = 0;
15307 int maxv = 0;
15308 for (int i = 0; i < n_elts; i++)
15309 if (matches[i][1] > maxv)
15310 {
15311 maxelement = i;
15312 maxv = matches[i][1];
15313 }
15314
15315 /* Create a duplicate of the most common element, unless all elements
15316 are equally useless to us, in which case just immediately set the
15317 vector register using the first element. */
15318
15319 if (maxv == 1)
15320 {
15321 /* For vectors of two 64-bit elements, we can do even better. */
15322 if (n_elts == 2
15323 && (inner_mode == E_DImode
15324 || inner_mode == E_DFmode))
15325
15326 {
15327 rtx x0 = XVECEXP (vals, 0, 0);
15328 rtx x1 = XVECEXP (vals, 0, 1);
15329 /* Combine can pick up this case, but handling it directly
15330 here leaves clearer RTL.
15331
15332 This is load_pair_lanes<mode>, and also gives us a clean-up
15333 for store_pair_lanes<mode>. */
15334 if (memory_operand (x0, inner_mode)
15335 && memory_operand (x1, inner_mode)
15336 && !STRICT_ALIGNMENT
15337 && rtx_equal_p (XEXP (x1, 0),
15338 plus_constant (Pmode,
15339 XEXP (x0, 0),
15340 GET_MODE_SIZE (inner_mode))))
15341 {
15342 rtx t;
15343 if (inner_mode == DFmode)
15344 t = gen_load_pair_lanesdf (target, x0, x1);
15345 else
15346 t = gen_load_pair_lanesdi (target, x0, x1);
15347 emit_insn (t);
15348 return;
15349 }
15350 }
15351 /* The subreg-move sequence below will move into lane zero of the
15352 vector register. For big-endian we want that position to hold
15353 the last element of VALS. */
15354 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15355 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15356 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15357 }
15358 else
15359 {
15360 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15361 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15362 }
15363
15364 /* Insert the rest. */
15365 for (int i = 0; i < n_elts; i++)
15366 {
15367 rtx x = XVECEXP (vals, 0, i);
15368 if (matches[i][0] == maxelement)
15369 continue;
15370 x = copy_to_mode_reg (inner_mode, x);
15371 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15372 }
15373 return;
15374 }
15375
15376 /* Initialise a vector which is part-variable. We want to first try
15377 to build those lanes which are constant in the most efficient way we
15378 can. */
15379 if (n_var != n_elts)
15380 {
15381 rtx copy = copy_rtx (vals);
15382
15383 /* Load constant part of vector. We really don't care what goes into the
15384 parts we will overwrite, but we're more likely to be able to load the
15385 constant efficiently if it has fewer, larger, repeating parts
15386 (see aarch64_simd_valid_immediate). */
15387 for (int i = 0; i < n_elts; i++)
15388 {
15389 rtx x = XVECEXP (vals, 0, i);
15390 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15391 continue;
15392 rtx subst = any_const;
15393 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15394 {
15395 /* Look in the copied vector, as more elements are const. */
15396 rtx test = XVECEXP (copy, 0, i ^ bit);
15397 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15398 {
15399 subst = test;
15400 break;
15401 }
15402 }
15403 XVECEXP (copy, 0, i) = subst;
15404 }
15405 aarch64_expand_vector_init (target, copy);
15406 }
15407
15408 /* Insert the variable lanes directly. */
15409 for (int i = 0; i < n_elts; i++)
15410 {
15411 rtx x = XVECEXP (vals, 0, i);
15412 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15413 continue;
15414 x = copy_to_mode_reg (inner_mode, x);
15415 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15416 }
15417 }
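/* For example, initialising a V4SImode vector from {x, y, x, x}, where all
   four elements are variable, duplicates x across the register and then
   inserts y into lane 1, rather than performing four separate lane
   inserts.  */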
15418
15419 /* Emit RTL corresponding to:
15420 insr TARGET, ELEM. */
15421
15422 static void
15423 emit_insr (rtx target, rtx elem)
15424 {
15425 machine_mode mode = GET_MODE (target);
15426 scalar_mode elem_mode = GET_MODE_INNER (mode);
15427 elem = force_reg (elem_mode, elem);
15428
15429 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
15430 gcc_assert (icode != CODE_FOR_nothing);
15431 emit_insn (GEN_FCN (icode) (target, target, elem));
15432 }
15433
15434 /* Subroutine of aarch64_sve_expand_vector_init for handling
15435 trailing constants.
15436 This function works as follows:
15437 (a) Create a new vector consisting of trailing constants.
15438 (b) Initialize TARGET with the constant vector using emit_move_insn.
15439 (c) Insert remaining elements in TARGET using insr.
15440 NELTS is the total number of elements in the original vector, while
15441 NELTS_REQD is the number of elements that are actually
15442 significant.
15443
15444 ??? The heuristic used is to do the above only if the number of constants
15445 is at least half the total number of elements. May need fine tuning. */
15446
15447 static bool
15448 aarch64_sve_expand_vector_init_handle_trailing_constants
15449 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
15450 {
15451 machine_mode mode = GET_MODE (target);
15452 scalar_mode elem_mode = GET_MODE_INNER (mode);
15453 int n_trailing_constants = 0;
15454
15455 for (int i = nelts_reqd - 1;
15456 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
15457 i--)
15458 n_trailing_constants++;
15459
15460 if (n_trailing_constants >= nelts_reqd / 2)
15461 {
15462 rtx_vector_builder v (mode, 1, nelts);
15463 for (int i = 0; i < nelts; i++)
15464 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
15465 rtx const_vec = v.build ();
15466 emit_move_insn (target, const_vec);
15467
15468 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
15469 emit_insr (target, builder.elt (i));
15470
15471 return true;
15472 }
15473
15474 return false;
15475 }
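/* For example, with {x, y, 1, 2} and NELTS_REQD == 4 there are two trailing
   constants, which meets the NELTS_REQD / 2 threshold: TARGET is first set to
   a constant vector whose leading elements are {1, 2, ...} and then y and x
   are shifted in from the front with INSR, leaving {x, y, 1, 2, ...}.  */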
15476
15477 /* Subroutine of aarch64_sve_expand_vector_init.
15478 Works as follows:
15479 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
15480 (b) Skip trailing elements from BUILDER, which are the same as
15481 element NELTS_REQD - 1.
15482 (c) Insert earlier elements in reverse order in TARGET using insr. */
15483
15484 static void
15485 aarch64_sve_expand_vector_init_insert_elems (rtx target,
15486 const rtx_vector_builder &builder,
15487 int nelts_reqd)
15488 {
15489 machine_mode mode = GET_MODE (target);
15490 scalar_mode elem_mode = GET_MODE_INNER (mode);
15491
15492 struct expand_operand ops[2];
15493 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
15494 gcc_assert (icode != CODE_FOR_nothing);
15495
15496 create_output_operand (&ops[0], target, mode);
15497 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
15498 expand_insn (icode, 2, ops);
15499
15500 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15501 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
15502 emit_insr (target, builder.elt (i));
15503 }
15504
15505 /* Subroutine of aarch64_sve_expand_vector_init to handle case
15506 when all trailing elements of builder are same.
15507 This works as follows:
15508 (a) Use expand_insn interface to broadcast last vector element in TARGET.
15509 (b) Insert remaining elements in TARGET using insr.
15510
15511 ??? The heuristic used is to do the above if the number of identical trailing
15512 elements is at least 3/4 of the total number of elements, loosely based on
15513 heuristic from mostly_zeros_p. May need fine-tuning. */
15514
15515 static bool
15516 aarch64_sve_expand_vector_init_handle_trailing_same_elem
15517 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
15518 {
15519 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15520 if (ndups >= (3 * nelts_reqd) / 4)
15521 {
15522 aarch64_sve_expand_vector_init_insert_elems (target, builder,
15523 nelts_reqd - ndups + 1);
15524 return true;
15525 }
15526
15527 return false;
15528 }
15529
15530 /* Initialize register TARGET from BUILDER. NELTS is the constant number
15531 of elements in BUILDER.
15532
15533 The function tries to initialize TARGET from BUILDER if it fits one
15534 of the special cases outlined below.
15535
15536 Failing that, the function divides BUILDER into two sub-vectors:
15537 v_even = even elements of BUILDER;
15538 v_odd = odd elements of BUILDER;
15539
15540 and recursively calls itself with v_even and v_odd.
15541
15542 if (recursive call succeeded for v_even or v_odd)
15543 TARGET = zip (v_even, v_odd)
15544
15545 The function returns true if it managed to build TARGET from BUILDER
15546 with one of the special cases, false otherwise.
15547
15548 Example: {a, 1, b, 2, c, 3, d, 4}
15549
15550 The vector gets divided into:
15551 v_even = {a, b, c, d}
15552 v_odd = {1, 2, 3, 4}
15553
15554 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
15555 initializes tmp2 from the constant vector v_odd using emit_move_insn.
15556
15557 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
15558 4 elements, so we construct tmp1 from v_even using insr:
15559 tmp1 = dup(d)
15560 insr tmp1, c
15561 insr tmp1, b
15562 insr tmp1, a
15563
15564 And finally:
15565 TARGET = zip (tmp1, tmp2)
15566 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
15567
15568 static bool
15569 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
15570 int nelts, int nelts_reqd)
15571 {
15572 machine_mode mode = GET_MODE (target);
15573
15574 /* Case 1: Vector contains trailing constants. */
15575
15576 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15577 (target, builder, nelts, nelts_reqd))
15578 return true;
15579
15580 /* Case 2: Vector contains leading constants. */
15581
15582 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
15583 for (int i = 0; i < nelts_reqd; i++)
15584 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
15585 rev_builder.finalize ();
15586
15587 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15588 (target, rev_builder, nelts, nelts_reqd))
15589 {
15590 emit_insn (gen_aarch64_sve_rev (mode, target, target));
15591 return true;
15592 }
15593
15594 /* Case 3: Vector contains trailing same element. */
15595
15596 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
15597 (target, builder, nelts_reqd))
15598 return true;
15599
15600 /* Case 4: Vector contains leading same element. */
15601
15602 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
15603 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
15604 {
15605 emit_insn (gen_aarch64_sve_rev (mode, target, target));
15606 return true;
15607 }
15608
15609 /* Avoid recursing below 4 elements.
15610 ??? The threshold 4 may need fine-tuning. */
15611
15612 if (nelts_reqd <= 4)
15613 return false;
15614
15615 rtx_vector_builder v_even (mode, 1, nelts);
15616 rtx_vector_builder v_odd (mode, 1, nelts);
15617
15618 for (int i = 0; i < nelts * 2; i += 2)
15619 {
15620 v_even.quick_push (builder.elt (i));
15621 v_odd.quick_push (builder.elt (i + 1));
15622 }
15623
15624 v_even.finalize ();
15625 v_odd.finalize ();
15626
15627 rtx tmp1 = gen_reg_rtx (mode);
15628 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
15629 nelts, nelts_reqd / 2);
15630
15631 rtx tmp2 = gen_reg_rtx (mode);
15632 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
15633 nelts, nelts_reqd / 2);
15634
15635 if (!did_even_p && !did_odd_p)
15636 return false;
15637
15638 /* Initialize v_even and v_odd using INSR if they didn't match any of the
15639 special cases, and then zip v_even and v_odd. */
15640
15641 if (!did_even_p)
15642 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
15643
15644 if (!did_odd_p)
15645 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
15646
15647 rtvec v = gen_rtvec (2, tmp1, tmp2);
15648 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
15649 return true;
15650 }
15651
15652 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
15653
15654 void
15655 aarch64_sve_expand_vector_init (rtx target, rtx vals)
15656 {
15657 machine_mode mode = GET_MODE (target);
15658 int nelts = XVECLEN (vals, 0);
15659
15660 rtx_vector_builder v (mode, 1, nelts);
15661 for (int i = 0; i < nelts; i++)
15662 v.quick_push (XVECEXP (vals, 0, i));
15663 v.finalize ();
15664
15665 /* If neither sub-vector of v could be initialized specially,
15666 then use INSR to insert all elements from v into TARGET.
15667 ??? This might not be optimal for vectors with large
15668 initializers like 16-element or above.
15669 For nelts < 4, it probably isn't useful to handle specially. */
15670
15671 if (nelts < 4
15672 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
15673 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
15674 }
15675
15676 static unsigned HOST_WIDE_INT
15677 aarch64_shift_truncation_mask (machine_mode mode)
15678 {
15679 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15680 return 0;
15681 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15682 }
15683
15684 /* Select a format to encode pointers in exception handling data. */
15685 int
15686 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15687 {
15688 int type;
15689 switch (aarch64_cmodel)
15690 {
15691 case AARCH64_CMODEL_TINY:
15692 case AARCH64_CMODEL_TINY_PIC:
15693 case AARCH64_CMODEL_SMALL:
15694 case AARCH64_CMODEL_SMALL_PIC:
15695 case AARCH64_CMODEL_SMALL_SPIC:
15696 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15697 for everything. */
15698 type = DW_EH_PE_sdata4;
15699 break;
15700 default:
15701 /* No assumptions here. 8-byte relocs required. */
15702 type = DW_EH_PE_sdata8;
15703 break;
15704 }
15705 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15706 }
15707
15708 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
15709
15710 static void
15711 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
15712 {
15713 if (aarch64_simd_decl_p (decl))
15714 {
15715 fprintf (stream, "\t.variant_pcs\t");
15716 assemble_name (stream, name);
15717 fprintf (stream, "\n");
15718 }
15719 }
15720
15721 /* The last .arch and .tune assembly strings that we printed. */
15722 static std::string aarch64_last_printed_arch_string;
15723 static std::string aarch64_last_printed_tune_string;
15724
15725 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15726 by the function fndecl. */
15727
15728 void
15729 aarch64_declare_function_name (FILE *stream, const char* name,
15730 tree fndecl)
15731 {
15732 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15733
15734 struct cl_target_option *targ_options;
15735 if (target_parts)
15736 targ_options = TREE_TARGET_OPTION (target_parts);
15737 else
15738 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15739 gcc_assert (targ_options);
15740
15741 const struct processor *this_arch
15742 = aarch64_get_arch (targ_options->x_explicit_arch);
15743
15744 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
15745 std::string extension
15746 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15747 this_arch->flags);
15748 /* Only update the assembler .arch string if it is distinct from the last
15749 such string we printed. */
15750 std::string to_print = this_arch->name + extension;
15751 if (to_print != aarch64_last_printed_arch_string)
15752 {
15753 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15754 aarch64_last_printed_arch_string = to_print;
15755 }
15756
15757 /* Print the cpu name we're tuning for in the comments; it might be
15758 useful to readers of the generated asm. Do it only when it changes
15759 from function to function and verbose assembly is requested. */
15760 const struct processor *this_tune
15761 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15762
15763 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15764 {
15765 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15766 this_tune->name);
15767 aarch64_last_printed_tune_string = this_tune->name;
15768 }
15769
15770 aarch64_asm_output_variant_pcs (stream, fndecl, name);
15771
15772 /* Don't forget the type directive for ELF. */
15773 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15774 ASM_OUTPUT_LABEL (stream, name);
15775 }
15776
15777 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
15778
15779 void
15780 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
15781 {
15782 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
15783 const char *value = IDENTIFIER_POINTER (target);
15784 aarch64_asm_output_variant_pcs (stream, decl, name);
15785 ASM_OUTPUT_DEF (stream, name, value);
15786 }
15787
15788 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
15789 function symbol references. */
15790
15791 void
15792 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
15793 {
15794 default_elf_asm_output_external (stream, decl, name);
15795 aarch64_asm_output_variant_pcs (stream, decl, name);
15796 }
15797
15798 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
15799 Used to output the .cfi_b_key_frame directive when signing the current
15800 function with the B key. */
15801
15802 void
15803 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
15804 {
15805 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
15806 && aarch64_ra_sign_key == AARCH64_KEY_B)
15807 asm_fprintf (f, "\t.cfi_b_key_frame\n");
15808 }
15809
15810 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15811
15812 static void
15813 aarch64_start_file (void)
15814 {
15815 struct cl_target_option *default_options
15816 = TREE_TARGET_OPTION (target_option_default_node);
15817
15818 const struct processor *default_arch
15819 = aarch64_get_arch (default_options->x_explicit_arch);
15820 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
15821 std::string extension
15822 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15823 default_arch->flags);
15824
15825 aarch64_last_printed_arch_string = default_arch->name + extension;
15826 aarch64_last_printed_tune_string = "";
15827 asm_fprintf (asm_out_file, "\t.arch %s\n",
15828 aarch64_last_printed_arch_string.c_str ());
15829
15830 default_file_start ();
15831 }
15832
15833 /* Emit load exclusive. */
15834
15835 static void
15836 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15837 rtx mem, rtx model_rtx)
15838 {
15839 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15840 }
15841
15842 /* Emit store exclusive. */
15843
15844 static void
15845 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15846 rtx mem, rtx rval, rtx model_rtx)
15847 {
15848 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
15849 }
15850
15851 /* Emit the jump instruction INSN and mark it as unlikely to be taken. */
15852
15853 static void
15854 aarch64_emit_unlikely_jump (rtx insn)
15855 {
15856 rtx_insn *jump = emit_jump_insn (insn);
15857 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15858 }
15859
15860 /* Expand a compare and swap pattern. */
15861
15862 void
15863 aarch64_expand_compare_and_swap (rtx operands[])
15864 {
15865 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15866 machine_mode mode, r_mode;
15867
15868 bval = operands[0];
15869 rval = operands[1];
15870 mem = operands[2];
15871 oldval = operands[3];
15872 newval = operands[4];
15873 is_weak = operands[5];
15874 mod_s = operands[6];
15875 mod_f = operands[7];
15876 mode = GET_MODE (mem);
15877
15878 /* Normally the succ memory model must be stronger than fail, but in the
15879 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15880 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15881 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15882 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15883 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15884
15885 r_mode = mode;
15886 if (mode == QImode || mode == HImode)
15887 {
15888 r_mode = SImode;
15889 rval = gen_reg_rtx (r_mode);
15890 }
15891
15892 if (TARGET_LSE)
15893 {
15894 /* The CAS insn requires oldval and rval overlap, but we need to
15895 have a copy of oldval saved across the operation to tell if
15896 the operation is successful. */
15897 if (reg_overlap_mentioned_p (rval, oldval))
15898 rval = copy_to_mode_reg (r_mode, oldval);
15899 else
15900 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15901
15902 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15903 newval, mod_s));
15904 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15905 }
15906 else
15907 {
15908 /* The oldval predicate varies by mode. Test it and force to reg. */
15909 insn_code code = code_for_aarch64_compare_and_swap (mode);
15910 if (!insn_data[code].operand[2].predicate (oldval, mode))
15911 oldval = force_reg (mode, oldval);
15912
15913 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15914 is_weak, mod_s, mod_f));
15915 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15916 }
15917
15918 if (r_mode != mode)
15919 rval = gen_lowpart (mode, rval);
15920 emit_move_insn (operands[1], rval);
15921
15922 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15923 emit_insn (gen_rtx_SET (bval, x));
15924 }
15925
15926 /* Emit a barrier appropriate for memory model MODEL at the end of a
15927 sequence implementing an atomic operation. */
15928
15929 static void
15930 aarch64_emit_post_barrier (enum memmodel model)
15931 {
15932 const enum memmodel base_model = memmodel_base (model);
15933
15934 if (is_mm_sync (model)
15935 && (base_model == MEMMODEL_ACQUIRE
15936 || base_model == MEMMODEL_ACQ_REL
15937 || base_model == MEMMODEL_SEQ_CST))
15938 {
15939 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15940 }
15941 }
15942
15943 /* Split a compare and swap pattern. */
15944
15945 void
15946 aarch64_split_compare_and_swap (rtx operands[])
15947 {
15948 rtx rval, mem, oldval, newval, scratch;
15949 machine_mode mode;
15950 bool is_weak;
15951 rtx_code_label *label1, *label2;
15952 rtx x, cond;
15953 enum memmodel model;
15954 rtx model_rtx;
15955
15956 rval = operands[0];
15957 mem = operands[1];
15958 oldval = operands[2];
15959 newval = operands[3];
15960 is_weak = (operands[4] != const0_rtx);
15961 model_rtx = operands[5];
15962 scratch = operands[7];
15963 mode = GET_MODE (mem);
15964 model = memmodel_from_int (INTVAL (model_rtx));
15965
15966 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15967 loop:
15968 .label1:
15969 LD[A]XR rval, [mem]
15970 CBNZ rval, .label2
15971 ST[L]XR scratch, newval, [mem]
15972 CBNZ scratch, .label1
15973 .label2:
15974 CMP rval, 0. */
15975 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15976
15977 label1 = NULL;
15978 if (!is_weak)
15979 {
15980 label1 = gen_label_rtx ();
15981 emit_label (label1);
15982 }
15983 label2 = gen_label_rtx ();
15984
15985 /* The initial load can be relaxed for a __sync operation since a final
15986 barrier will be emitted to stop code hoisting. */
15987 if (is_mm_sync (model))
15988 aarch64_emit_load_exclusive (mode, rval, mem,
15989 GEN_INT (MEMMODEL_RELAXED));
15990 else
15991 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15992
15993 if (strong_zero_p)
15994 {
15995 if (aarch64_track_speculation)
15996 {
15997 /* Emit an explicit compare instruction, so that we can correctly
15998 track the condition codes. */
15999 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16000 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16001 }
16002 else
16003 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16004
16005 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16006 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16007 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16008 }
16009 else
16010 {
16011 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16012 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16013 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16014 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16015 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16016 }
16017
16018 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16019
16020 if (!is_weak)
16021 {
16022 if (aarch64_track_speculation)
16023 {
16024 /* Emit an explicit compare instruction, so that we can correctly
16025 track the condition codes. */
16026 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16027 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16028 }
16029 else
16030 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16031
16032 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16033 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16034 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16035 }
16036 else
16037 {
16038 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16039 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16040 emit_insn (gen_rtx_SET (cond, x));
16041 }
16042
16043 emit_label (label2);
16044 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
16045 to set the condition flags. If this is not used it will be removed by
16046 later passes. */
16047 if (strong_zero_p)
16048 {
16049 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16050 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
16051 emit_insn (gen_rtx_SET (cond, x));
16052 }
16053 /* Emit any final barrier needed for a __sync operation. */
16054 if (is_mm_sync (model))
16055 aarch64_emit_post_barrier (model);
16056 }
16057
16058 /* Split an atomic operation. */
16059
16060 void
16061 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16062 rtx value, rtx model_rtx, rtx cond)
16063 {
16064 machine_mode mode = GET_MODE (mem);
16065 machine_mode wmode = (mode == DImode ? DImode : SImode);
16066 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16067 const bool is_sync = is_mm_sync (model);
16068 rtx_code_label *label;
16069 rtx x;
16070
16071 /* Split the atomic operation into a sequence. */
16072 label = gen_label_rtx ();
16073 emit_label (label);
16074
16075 if (new_out)
16076 new_out = gen_lowpart (wmode, new_out);
16077 if (old_out)
16078 old_out = gen_lowpart (wmode, old_out);
16079 else
16080 old_out = new_out;
16081 value = simplify_gen_subreg (wmode, value, mode, 0);
16082
16083 /* The initial load can be relaxed for a __sync operation since a final
16084 barrier will be emitted to stop code hoisting. */
16085 if (is_sync)
16086 aarch64_emit_load_exclusive (mode, old_out, mem,
16087 GEN_INT (MEMMODEL_RELAXED));
16088 else
16089 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16090
16091 switch (code)
16092 {
16093 case SET:
16094 new_out = value;
16095 break;
16096
16097 case NOT:
16098 x = gen_rtx_AND (wmode, old_out, value);
16099 emit_insn (gen_rtx_SET (new_out, x));
16100 x = gen_rtx_NOT (wmode, new_out);
16101 emit_insn (gen_rtx_SET (new_out, x));
16102 break;
16103
16104 case MINUS:
16105 if (CONST_INT_P (value))
16106 {
16107 value = GEN_INT (-INTVAL (value));
16108 code = PLUS;
16109 }
16110 /* Fall through. */
16111
16112 default:
16113 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16114 emit_insn (gen_rtx_SET (new_out, x));
16115 break;
16116 }
16117
16118 aarch64_emit_store_exclusive (mode, cond, mem,
16119 gen_lowpart (mode, new_out), model_rtx);
16120
16121 if (aarch64_track_speculation)
16122 {
16123 /* Emit an explicit compare instruction, so that we can correctly
16124 track the condition codes. */
16125 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16126 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16127 }
16128 else
16129 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16130
16131 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16132 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16133 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16134
16135 /* Emit any final barrier needed for a __sync operation. */
16136 if (is_sync)
16137 aarch64_emit_post_barrier (model);
16138 }
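/* As a rough sketch, splitting an atomic fetch-and-add on an SImode location
   with a relaxed memory model yields an LL/SC loop of roughly this shape
   (register names are illustrative only):

	.label:
	ldxr	w0, [x2]	// load-exclusive the old value
	add	w1, w0, w3	// perform the operation in wmode
	stxr	w4, w1, [x2]	// store-exclusive the new value
	cbnz	w4, .label	// retry if the exclusive store failed  */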
16139
16140 static void
16141 aarch64_init_libfuncs (void)
16142 {
16143 /* Half-precision float operations. The compiler handles all operations
16144 with NULL libfuncs by converting to SFmode. */
16145
16146 /* Conversions. */
16147 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16148 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16149
16150 /* Arithmetic. */
16151 set_optab_libfunc (add_optab, HFmode, NULL);
16152 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16153 set_optab_libfunc (smul_optab, HFmode, NULL);
16154 set_optab_libfunc (neg_optab, HFmode, NULL);
16155 set_optab_libfunc (sub_optab, HFmode, NULL);
16156
16157 /* Comparisons. */
16158 set_optab_libfunc (eq_optab, HFmode, NULL);
16159 set_optab_libfunc (ne_optab, HFmode, NULL);
16160 set_optab_libfunc (lt_optab, HFmode, NULL);
16161 set_optab_libfunc (le_optab, HFmode, NULL);
16162 set_optab_libfunc (ge_optab, HFmode, NULL);
16163 set_optab_libfunc (gt_optab, HFmode, NULL);
16164 set_optab_libfunc (unord_optab, HFmode, NULL);
16165 }
16166
16167 /* Target hook for c_mode_for_suffix. */
16168 static machine_mode
16169 aarch64_c_mode_for_suffix (char suffix)
16170 {
16171 if (suffix == 'q')
16172 return TFmode;
16173
16174 return VOIDmode;
16175 }
16176
16177 /* We can only represent floating point constants which will fit in
16178 "quarter-precision" values. These values are characterised by
16179 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
16180 by:
16181
16182 (-1)^s * (n/16) * 2^r
16183
16184 Where:
16185 's' is the sign bit.
16186 'n' is an integer in the range 16 <= n <= 31.
16187 'r' is an integer in the range -3 <= r <= 4. */
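/* For example, 0.5 = (-1)^0 * (16/16) * 2^-1 and 31.0 = (-1)^0 * (31/16) * 2^4
   are both representable, whereas 0.1 has no exact binary encoding and 0.0
   is explicitly rejected (it is handled elsewhere).  */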
16188
16189 /* Return true iff X can be represented as a quarter-precision
16190 floating point immediate operand. Note, we cannot represent 0.0. */
16191 bool
16192 aarch64_float_const_representable_p (rtx x)
16193 {
16194 /* This represents our current view of how many bits
16195 make up the mantissa. */
16196 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16197 int exponent;
16198 unsigned HOST_WIDE_INT mantissa, mask;
16199 REAL_VALUE_TYPE r, m;
16200 bool fail;
16201
16202 if (!CONST_DOUBLE_P (x))
16203 return false;
16204
16205 if (GET_MODE (x) == VOIDmode
16206 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
16207 return false;
16208
16209 r = *CONST_DOUBLE_REAL_VALUE (x);
16210
16211 /* We cannot represent infinities, NaNs or +/-zero. We won't
16212 know if we have +zero until we analyse the mantissa, but we
16213 can reject the other invalid values. */
16214 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
16215 || REAL_VALUE_MINUS_ZERO (r))
16216 return false;
16217
16218 /* Extract exponent. */
16219 r = real_value_abs (&r);
16220 exponent = REAL_EXP (&r);
16221
16222 /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
16223 highest (sign) bit, with a fixed binary point at bit point_pos.
16224 The low element of W holds the low part of the mantissa, the high element the high part.
16225 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16226 bits for the mantissa, this can fail (low bits will be lost). */
16227 real_ldexp (&m, &r, point_pos - exponent);
16228 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
16229
16230 /* If the low part of the mantissa has bits set we cannot represent
16231 the value. */
16232 if (w.ulow () != 0)
16233 return false;
16234 /* We have rejected the lower HOST_WIDE_INT, so update our
16235 understanding of how many bits lie in the mantissa and
16236 look only at the high HOST_WIDE_INT. */
16237 mantissa = w.elt (1);
16238 point_pos -= HOST_BITS_PER_WIDE_INT;
16239
16240 /* We can only represent values with a mantissa of the form 1.xxxx. */
16241 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
16242 if ((mantissa & mask) != 0)
16243 return false;
16244
16245 /* Having filtered unrepresentable values, we may now remove all
16246 but the highest 5 bits. */
16247 mantissa >>= point_pos - 5;
16248
16249 /* We cannot represent the value 0.0, so reject it. This is handled
16250 elsewhere. */
16251 if (mantissa == 0)
16252 return false;
16253
16254 /* Then, as bit 4 is always set, we can mask it off, leaving
16255 the mantissa in the range [0, 15]. */
16256 mantissa &= ~(1 << 4);
16257 gcc_assert (mantissa <= 15);
16258
16259 /* GCC internally does not use IEEE754-like encoding (where normalized
16260 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
16261 Our mantissa values are shifted 4 places to the left relative to
16262 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
16263 by 5 places to correct for GCC's representation. */
16264 exponent = 5 - exponent;
16265
16266 return (exponent >= 0 && exponent <= 7);
16267 }
16268
16269 /* Return the string with the instruction for the AdvSIMD MOVI, MVNI, ORR or BIC
16270 immediate for a CONST_VECTOR of width WIDTH bits. WHICH selects whether to
16271 output a MOVI/MVNI, ORR or BIC immediate. */
16272 char*
16273 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
16274 enum simd_immediate_check which)
16275 {
16276 bool is_valid;
16277 static char templ[40];
16278 const char *mnemonic;
16279 const char *shift_op;
16280 unsigned int lane_count = 0;
16281 char element_char;
16282
16283 struct simd_immediate_info info;
16284
16285 /* This will return true to show const_vector is legal for use as either
16286 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
16287 It will also update INFO to show how the immediate should be generated.
16288 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
16289 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
16290 gcc_assert (is_valid);
16291
16292 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16293 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
16294
16295 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16296 {
16297 gcc_assert (info.insn == simd_immediate_info::MOV
16298 && info.u.mov.shift == 0);
16299 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16300 move immediate path. */
16301 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
16302 info.u.mov.value = GEN_INT (0);
16303 else
16304 {
16305 const unsigned int buf_size = 20;
16306 char float_buf[buf_size] = {'\0'};
16307 real_to_decimal_for_mode (float_buf,
16308 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
16309 buf_size, buf_size, 1, info.elt_mode);
16310
16311 if (lane_count == 1)
16312 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
16313 else
16314 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
16315 lane_count, element_char, float_buf);
16316 return templ;
16317 }
16318 }
16319
16320 gcc_assert (CONST_INT_P (info.u.mov.value));
16321
16322 if (which == AARCH64_CHECK_MOV)
16323 {
16324 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
16325 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
16326 ? "msl" : "lsl");
16327 if (lane_count == 1)
16328 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
16329 mnemonic, UINTVAL (info.u.mov.value));
16330 else if (info.u.mov.shift)
16331 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16332 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
16333 element_char, UINTVAL (info.u.mov.value), shift_op,
16334 info.u.mov.shift);
16335 else
16336 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16337 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
16338 element_char, UINTVAL (info.u.mov.value));
16339 }
16340 else
16341 {
16342 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
16343 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
16344 if (info.u.mov.shift)
16345 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16346 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
16347 element_char, UINTVAL (info.u.mov.value), "lsl",
16348 info.u.mov.shift);
16349 else
16350 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16351 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
16352 element_char, UINTVAL (info.u.mov.value));
16353 }
16354 return templ;
16355 }
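/* As a rough example of the templates built above (exact spelling comes from
   the snprintf formats): a V4HI vector with every element equal to 0x1200
   would be emitted as something like "movi v0.4h, 0x12, lsl 8", with the
   inverted pattern using "mvni" instead.  The ORR/BIC variants print the
   immediate in decimal with a '#' prefix.  */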
16356
16357 char*
16358 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
16359 {
16360
16361 /* If a floating point number was passed and we desire to use it in an
16362 integer mode, do the conversion to integer. */
16363 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
16364 {
16365 unsigned HOST_WIDE_INT ival;
16366 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
16367 gcc_unreachable ();
16368 immediate = gen_int_mode (ival, mode);
16369 }
16370
16371 machine_mode vmode;
16372 /* Use a 64-bit container mode for everything except DImode/DFmode, where we use
16373 a 128-bit vector mode. */
16374 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
16375
16376 vmode = aarch64_simd_container_mode (mode, width);
16377 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
16378 return aarch64_output_simd_mov_immediate (v_op, width);
16379 }
16380
16381 /* Return the output string to use for moving immediate CONST_VECTOR
16382 into an SVE register. */
16383
16384 char *
16385 aarch64_output_sve_mov_immediate (rtx const_vector)
16386 {
16387 static char templ[40];
16388 struct simd_immediate_info info;
16389 char element_char;
16390
16391 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
16392 gcc_assert (is_valid);
16393
16394 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16395
16396 if (info.insn == simd_immediate_info::INDEX)
16397 {
16398 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
16399 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
16400 element_char, INTVAL (info.u.index.base),
16401 INTVAL (info.u.index.step));
16402 return templ;
16403 }
16404
16405 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16406 {
16407 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
16408 info.u.mov.value = GEN_INT (0);
16409 else
16410 {
16411 const int buf_size = 20;
16412 char float_buf[buf_size] = {};
16413 real_to_decimal_for_mode (float_buf,
16414 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
16415 buf_size, buf_size, 1, info.elt_mode);
16416
16417 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
16418 element_char, float_buf);
16419 return templ;
16420 }
16421 }
16422
16423 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
16424 element_char, INTVAL (info.u.mov.value));
16425 return templ;
16426 }
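/* For illustration: a stepped integer constant such as { 0, 1, 2, ... } in a
   .s vector prints as something like "index z0.s, #0, #1", an integer splat
   of 5 as "mov z0.s, #5", and a non-zero FP splat goes through the "fmov"
   branch with the value rendered in decimal by real_to_decimal_for_mode.  */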
16427
16428 /* Return the asm format for a PTRUE instruction whose destination has
16429 mode MODE. SUFFIX is the element size suffix. */
16430
16431 char *
16432 aarch64_output_ptrue (machine_mode mode, char suffix)
16433 {
16434 unsigned int nunits;
16435 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
16436 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
16437 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
16438 else
16439 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
16440 return buf;
16441 }
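/* For example, with a fixed vector length of 128 bits (-msve-vector-bits=128)
   a VNx16BI destination gives "ptrue p0.b, vl16", whereas with a variable
   vector length the element count is unknown at compile time and we fall
   back to "ptrue p0.b, all".  */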
16442
16443 /* Split operands into moves from op[1] + op[2] into op[0]. */
16444
16445 void
16446 aarch64_split_combinev16qi (rtx operands[3])
16447 {
16448 unsigned int dest = REGNO (operands[0]);
16449 unsigned int src1 = REGNO (operands[1]);
16450 unsigned int src2 = REGNO (operands[2]);
16451 machine_mode halfmode = GET_MODE (operands[1]);
16452 unsigned int halfregs = REG_NREGS (operands[1]);
16453 rtx destlo, desthi;
16454
16455 gcc_assert (halfmode == V16QImode);
16456
16457 if (src1 == dest && src2 == dest + halfregs)
16458 {
16459 /* No-op move. Can't split to nothing; emit something. */
16460 emit_note (NOTE_INSN_DELETED);
16461 return;
16462 }
16463
16464 /* Preserve register attributes for variable tracking. */
16465 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
16466 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
16467 GET_MODE_SIZE (halfmode));
16468
16469 /* Special case of reversed high/low parts. */
16470 if (reg_overlap_mentioned_p (operands[2], destlo)
16471 && reg_overlap_mentioned_p (operands[1], desthi))
16472 {
16473 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16474 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
16475 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16476 }
16477 else if (!reg_overlap_mentioned_p (operands[2], destlo))
16478 {
16479 /* Try to avoid unnecessary moves if part of the result
16480 is in the right place already. */
16481 if (src1 != dest)
16482 emit_move_insn (destlo, operands[1]);
16483 if (src2 != dest + halfregs)
16484 emit_move_insn (desthi, operands[2]);
16485 }
16486 else
16487 {
16488 if (src2 != dest + halfregs)
16489 emit_move_insn (desthi, operands[2]);
16490 if (src1 != dest)
16491 emit_move_insn (destlo, operands[1]);
16492 }
16493 }
16494
16495 /* vec_perm support. */
16496
16497 struct expand_vec_perm_d
16498 {
16499 rtx target, op0, op1;
16500 vec_perm_indices perm;
16501 machine_mode vmode;
16502 unsigned int vec_flags;
16503 bool one_vector_p;
16504 bool testing_p;
16505 };
16506
16507 /* Generate a variable permutation. */
16508
16509 static void
16510 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
16511 {
16512 machine_mode vmode = GET_MODE (target);
16513 bool one_vector_p = rtx_equal_p (op0, op1);
16514
16515 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16516 gcc_checking_assert (GET_MODE (op0) == vmode);
16517 gcc_checking_assert (GET_MODE (op1) == vmode);
16518 gcc_checking_assert (GET_MODE (sel) == vmode);
16519 gcc_checking_assert (TARGET_SIMD);
16520
16521 if (one_vector_p)
16522 {
16523 if (vmode == V8QImode)
16524 {
16525 /* Expand the argument to a V16QI mode by duplicating it. */
16526 rtx pair = gen_reg_rtx (V16QImode);
16527 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16528 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16529 }
16530 else
16531 {
16532 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16533 }
16534 }
16535 else
16536 {
16537 rtx pair;
16538
16539 if (vmode == V8QImode)
16540 {
16541 pair = gen_reg_rtx (V16QImode);
16542 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16543 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16544 }
16545 else
16546 {
16547 pair = gen_reg_rtx (OImode);
16548 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16549 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16550 }
16551 }
16552 }
16553
16554 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16555 NELT is the number of elements in the vector. */
16556
16557 void
16558 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16559 unsigned int nelt)
16560 {
16561 machine_mode vmode = GET_MODE (target);
16562 bool one_vector_p = rtx_equal_p (op0, op1);
16563 rtx mask;
16564
16565 /* The TBL instruction does not use a modulo index, so we must take care
16566 of that ourselves. */
16567 mask = aarch64_simd_gen_const_vector_dup (vmode,
16568 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16569 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16570
16571 /* For big-endian, we also need to reverse the index within the vector
16572 (but not which vector). */
16573 if (BYTES_BIG_ENDIAN)
16574 {
16575 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16576 if (!one_vector_p)
16577 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16578 sel = expand_simple_binop (vmode, XOR, sel, mask,
16579 NULL, 0, OPTAB_LIB_WIDEN);
16580 }
16581 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16582 }
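/* As a concrete example of the masking above: for a two-input V16QI permute
   the mask is 2 * 16 - 1 = 31, so a selector value of 33 is reduced to
   33 & 31 = 1 and selects byte 1 of the first input, matching the wrapping
   behaviour that vec_perm requires but that TBL does not provide.  */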
16583
16584 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
16585
16586 static void
16587 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16588 {
16589 emit_insn (gen_rtx_SET (target,
16590 gen_rtx_UNSPEC (GET_MODE (target),
16591 gen_rtvec (2, op0, op1), code)));
16592 }
16593
16594 /* Expand an SVE vec_perm with the given operands. */
16595
16596 void
16597 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16598 {
16599 machine_mode data_mode = GET_MODE (target);
16600 machine_mode sel_mode = GET_MODE (sel);
16601 /* Enforced by the pattern condition. */
16602 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16603
16604 /* Note: vec_perm indices are supposed to wrap when they go beyond the
16605 size of the two value vectors, i.e. the upper bits of the indices
16606 are effectively ignored. SVE TBL instead produces 0 for any
16607 out-of-range indices, so we need to modulo all the vec_perm indices
16608 to ensure they are all in range. */
16609 rtx sel_reg = force_reg (sel_mode, sel);
16610
16611 /* Check if the sel only references the first values vector. */
16612 if (GET_CODE (sel) == CONST_VECTOR
16613 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16614 {
16615 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16616 return;
16617 }
16618
16619 /* Check if the two values vectors are the same. */
16620 if (rtx_equal_p (op0, op1))
16621 {
16622 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16623 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16624 NULL, 0, OPTAB_DIRECT);
16625 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16626 return;
16627 }
16628
16629 /* Run TBL on each value vector and combine the results. */
16630
16631 rtx res0 = gen_reg_rtx (data_mode);
16632 rtx res1 = gen_reg_rtx (data_mode);
16633 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16634 if (GET_CODE (sel) != CONST_VECTOR
16635 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16636 {
16637 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16638 2 * nunits - 1);
16639 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16640 NULL, 0, OPTAB_DIRECT);
16641 }
16642 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16643 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16644 NULL, 0, OPTAB_DIRECT);
16645 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16646 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16647 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16648 else
16649 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16650 }
16651
16652 /* Recognize patterns suitable for the TRN instructions. */
16653 static bool
16654 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16655 {
16656 HOST_WIDE_INT odd;
16657 poly_uint64 nelt = d->perm.length ();
16658 rtx out, in0, in1, x;
16659 machine_mode vmode = d->vmode;
16660
16661 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16662 return false;
16663
16664 /* Note that these are little-endian tests.
16665 We correct for big-endian later. */
16666 if (!d->perm[0].is_constant (&odd)
16667 || (odd != 0 && odd != 1)
16668 || !d->perm.series_p (0, 2, odd, 2)
16669 || !d->perm.series_p (1, 2, nelt + odd, 2))
16670 return false;
16671
16672 /* Success! */
16673 if (d->testing_p)
16674 return true;
16675
16676 in0 = d->op0;
16677 in1 = d->op1;
16678 /* We don't need a big-endian lane correction for SVE; see the comment
16679 at the head of aarch64-sve.md for details. */
16680 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16681 {
16682 x = in0, in0 = in1, in1 = x;
16683 odd = !odd;
16684 }
16685 out = d->target;
16686
16687 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16688 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16689 return true;
16690 }
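/* For instance, on a little-endian V4SI target the permutation { 0, 4, 2, 6 }
   passes the tests above with ODD == 0 and maps to TRN1, while { 1, 5, 3, 7 }
   has ODD == 1 and maps to TRN2.  */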
16691
16692 /* Recognize patterns suitable for the UZP instructions. */
16693 static bool
16694 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16695 {
16696 HOST_WIDE_INT odd;
16697 rtx out, in0, in1, x;
16698 machine_mode vmode = d->vmode;
16699
16700 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16701 return false;
16702
16703 /* Note that these are little-endian tests.
16704 We correct for big-endian later. */
16705 if (!d->perm[0].is_constant (&odd)
16706 || (odd != 0 && odd != 1)
16707 || !d->perm.series_p (0, 1, odd, 2))
16708 return false;
16709
16710 /* Success! */
16711 if (d->testing_p)
16712 return true;
16713
16714 in0 = d->op0;
16715 in1 = d->op1;
16716 /* We don't need a big-endian lane correction for SVE; see the comment
16717 at the head of aarch64-sve.md for details. */
16718 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16719 {
16720 x = in0, in0 = in1, in1 = x;
16721 odd = !odd;
16722 }
16723 out = d->target;
16724
16725 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16726 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16727 return true;
16728 }
16729
16730 /* Recognize patterns suitable for the ZIP instructions. */
16731 static bool
16732 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16733 {
16734 unsigned int high;
16735 poly_uint64 nelt = d->perm.length ();
16736 rtx out, in0, in1, x;
16737 machine_mode vmode = d->vmode;
16738
16739 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16740 return false;
16741
16742 /* Note that these are little-endian tests.
16743 We correct for big-endian later. */
16744 poly_uint64 first = d->perm[0];
16745 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16746 || !d->perm.series_p (0, 2, first, 1)
16747 || !d->perm.series_p (1, 2, first + nelt, 1))
16748 return false;
16749 high = maybe_ne (first, 0U);
16750
16751 /* Success! */
16752 if (d->testing_p)
16753 return true;
16754
16755 in0 = d->op0;
16756 in1 = d->op1;
16757 /* We don't need a big-endian lane correction for SVE; see the comment
16758 at the head of aarch64-sve.md for details. */
16759 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16760 {
16761 x = in0, in0 = in1, in1 = x;
16762 high = !high;
16763 }
16764 out = d->target;
16765
16766 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16767 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16768 return true;
16769 }
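/* For instance, on a little-endian V4SI target the permutation { 0, 4, 1, 5 }
   interleaves the low halves of the inputs and maps to ZIP1, while
   { 2, 6, 3, 7 } interleaves the high halves and maps to ZIP2.  */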
16770
16771 /* Recognize patterns for the EXT insn. */
16772
16773 static bool
16774 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16775 {
16776 HOST_WIDE_INT location;
16777 rtx offset;
16778
16779 /* The first element always refers to the first vector.
16780 Check if the extracted indices are increasing by one. */
16781 if (d->vec_flags == VEC_SVE_PRED
16782 || !d->perm[0].is_constant (&location)
16783 || !d->perm.series_p (0, 1, location, 1))
16784 return false;
16785
16786 /* Success! */
16787 if (d->testing_p)
16788 return true;
16789
16790 /* The case where (location == 0) is a no-op for both big- and little-endian,
16791 and is removed by the mid-end at optimization levels -O1 and higher.
16792
16793 We don't need a big-endian lane correction for SVE; see the comment
16794 at the head of aarch64-sve.md for details. */
16795 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16796 {
16797 /* After setup, we want the high elements of the first vector (stored
16798 at the LSB end of the register), and the low elements of the second
16799 vector (stored at the MSB end of the register). So swap. */
16800 std::swap (d->op0, d->op1);
16801 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16802 to_constant () is safe since this is restricted to Advanced SIMD
16803 vectors. */
16804 location = d->perm.length ().to_constant () - location;
16805 }
16806
16807 offset = GEN_INT (location);
16808 emit_set_insn (d->target,
16809 gen_rtx_UNSPEC (d->vmode,
16810 gen_rtvec (3, d->op0, d->op1, offset),
16811 UNSPEC_EXT));
16812 return true;
16813 }
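/* For instance, a little-endian V16QI permutation of { 3, 4, ..., 18 }
   extracts 16 consecutive bytes starting at byte 3 of the concatenated
   inputs and is emitted as something like "ext v0.16b, v1.16b, v2.16b, #3".  */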
16814
16815 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16816 within each 64-bit, 32-bit or 16-bit granule. */
16817
16818 static bool
16819 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16820 {
16821 HOST_WIDE_INT diff;
16822 unsigned int i, size, unspec;
16823 machine_mode pred_mode;
16824
16825 if (d->vec_flags == VEC_SVE_PRED
16826 || !d->one_vector_p
16827 || !d->perm[0].is_constant (&diff))
16828 return false;
16829
16830 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16831 if (size == 8)
16832 {
16833 unspec = UNSPEC_REV64;
16834 pred_mode = VNx2BImode;
16835 }
16836 else if (size == 4)
16837 {
16838 unspec = UNSPEC_REV32;
16839 pred_mode = VNx4BImode;
16840 }
16841 else if (size == 2)
16842 {
16843 unspec = UNSPEC_REV16;
16844 pred_mode = VNx8BImode;
16845 }
16846 else
16847 return false;
16848
16849 unsigned int step = diff + 1;
16850 for (i = 0; i < step; ++i)
16851 if (!d->perm.series_p (i, step, diff - i, step))
16852 return false;
16853
16854 /* Success! */
16855 if (d->testing_p)
16856 return true;
16857
16858 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16859 if (d->vec_flags == VEC_SVE_DATA)
16860 {
16861 rtx pred = aarch64_ptrue_reg (pred_mode);
16862 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16863 UNSPEC_MERGE_PTRUE);
16864 }
16865 emit_set_insn (d->target, src);
16866 return true;
16867 }
16868
16869 /* Recognize patterns for the REV insn, which reverses elements within
16870 a full vector. */
16871
16872 static bool
16873 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16874 {
16875 poly_uint64 nelt = d->perm.length ();
16876
16877 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16878 return false;
16879
16880 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16881 return false;
16882
16883 /* Success! */
16884 if (d->testing_p)
16885 return true;
16886
16887 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16888 emit_set_insn (d->target, src);
16889 return true;
16890 }
16891
16892 static bool
16893 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16894 {
16895 rtx out = d->target;
16896 rtx in0;
16897 HOST_WIDE_INT elt;
16898 machine_mode vmode = d->vmode;
16899 rtx lane;
16900
16901 if (d->vec_flags == VEC_SVE_PRED
16902 || d->perm.encoding ().encoded_nelts () != 1
16903 || !d->perm[0].is_constant (&elt))
16904 return false;
16905
16906 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16907 return false;
16908
16909 /* Success! */
16910 if (d->testing_p)
16911 return true;
16912
16913 /* The generic preparation in aarch64_expand_vec_perm_const_1
16914 swaps the operand order and the permute indices if it finds
16915 d->perm[0] to be in the second operand. Thus, we can always
16916 use d->op0 and need not do any extra arithmetic to get the
16917 correct lane number. */
16918 in0 = d->op0;
16919 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16920
16921 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16922 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16923 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16924 return true;
16925 }
16926
16927 static bool
16928 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16929 {
16930 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16931 machine_mode vmode = d->vmode;
16932
16933 /* Make sure that the indices are constant. */
16934 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16935 for (unsigned int i = 0; i < encoded_nelts; ++i)
16936 if (!d->perm[i].is_constant ())
16937 return false;
16938
16939 if (d->testing_p)
16940 return true;
16941
16942 /* Generic code will try constant permutation twice: once with the
16943 original mode and again with the elements lowered to QImode.
16944 So wait and don't do the selector expansion ourselves. */
16945 if (vmode != V8QImode && vmode != V16QImode)
16946 return false;
16947
16948 /* to_constant is safe since this routine is specific to Advanced SIMD
16949 vectors. */
16950 unsigned int nelt = d->perm.length ().to_constant ();
16951 for (unsigned int i = 0; i < nelt; ++i)
16952 /* If big-endian and two vectors we end up with a weird mixed-endian
16953 mode on NEON. Reverse the index within each word but not the word
16954 itself. to_constant is safe because we checked is_constant above. */
16955 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16956 ? d->perm[i].to_constant () ^ (nelt - 1)
16957 : d->perm[i].to_constant ());
16958
16959 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16960 sel = force_reg (vmode, sel);
16961
16962 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16963 return true;
16964 }
16965
16966 /* Try to implement D using an SVE TBL instruction. */
16967
16968 static bool
16969 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16970 {
16971 unsigned HOST_WIDE_INT nelt;
16972
16973 /* Permuting two variable-length vectors could overflow the
16974 index range. */
16975 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16976 return false;
16977
16978 if (d->testing_p)
16979 return true;
16980
16981 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16982 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16983 if (d->one_vector_p)
16984 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16985 else
16986 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16987 return true;
16988 }
16989
16990 static bool
16991 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16992 {
16993 /* The pattern matching functions above are written to look for a small
16994 number to begin the sequence (0, 1, N/2). If we begin with an index
16995 from the second operand, we can swap the operands. */
16996 poly_int64 nelt = d->perm.length ();
16997 if (known_ge (d->perm[0], nelt))
16998 {
16999 d->perm.rotate_inputs (1);
17000 std::swap (d->op0, d->op1);
17001 }
17002
17003 if ((d->vec_flags == VEC_ADVSIMD
17004 || d->vec_flags == VEC_SVE_DATA
17005 || d->vec_flags == VEC_SVE_PRED)
17006 && known_gt (nelt, 1))
17007 {
17008 if (aarch64_evpc_rev_local (d))
17009 return true;
17010 else if (aarch64_evpc_rev_global (d))
17011 return true;
17012 else if (aarch64_evpc_ext (d))
17013 return true;
17014 else if (aarch64_evpc_dup (d))
17015 return true;
17016 else if (aarch64_evpc_zip (d))
17017 return true;
17018 else if (aarch64_evpc_uzp (d))
17019 return true;
17020 else if (aarch64_evpc_trn (d))
17021 return true;
17022 if (d->vec_flags == VEC_SVE_DATA)
17023 return aarch64_evpc_sve_tbl (d);
17024 else if (d->vec_flags == VEC_ADVSIMD)
17025 return aarch64_evpc_tbl (d);
17026 }
17027 return false;
17028 }
17029
17030 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17031
17032 static bool
17033 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17034 rtx op1, const vec_perm_indices &sel)
17035 {
17036 struct expand_vec_perm_d d;
17037
17038 /* Check whether the mask can be applied to a single vector. */
17039 if (sel.ninputs () == 1
17040 || (op0 && rtx_equal_p (op0, op1)))
17041 d.one_vector_p = true;
17042 else if (sel.all_from_input_p (0))
17043 {
17044 d.one_vector_p = true;
17045 op1 = op0;
17046 }
17047 else if (sel.all_from_input_p (1))
17048 {
17049 d.one_vector_p = true;
17050 op0 = op1;
17051 }
17052 else
17053 d.one_vector_p = false;
17054
17055 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
17056 sel.nelts_per_input ());
17057 d.vmode = vmode;
17058 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
17059 d.target = target;
17060 d.op0 = op0;
17061 d.op1 = op1;
17062 d.testing_p = !target;
17063
17064 if (!d.testing_p)
17065 return aarch64_expand_vec_perm_const_1 (&d);
17066
17067 rtx_insn *last = get_last_insn ();
17068 bool ret = aarch64_expand_vec_perm_const_1 (&d);
17069 gcc_assert (last == get_last_insn ());
17070
17071 return ret;
17072 }
17073
17074 /* Generate a byte permute mask for a register of mode MODE,
17075 which has NUNITS units. */
17076
17077 rtx
17078 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17079 {
17080 /* We have to reverse each vector because we don't have
17081 a permuted load that can reverse-load according to ABI rules. */
17082 rtx mask;
17083 rtvec v = rtvec_alloc (16);
17084 unsigned int i, j;
17085 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17086
17087 gcc_assert (BYTES_BIG_ENDIAN);
17088 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17089
17090 for (i = 0; i < nunits; i++)
17091 for (j = 0; j < usize; j++)
17092 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17093 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17094 return force_reg (V16QImode, mask);
17095 }
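/* For example, for V8HImode (eight 2-byte units) the generated byte mask is
   { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, i.e. the bytes
   within each element are swapped while the element order is preserved.  */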
17096
17097 /* Return true if X is a valid second operand for the SVE instruction
17098 that implements integer comparison OP_CODE. */
17099
17100 static bool
17101 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
17102 {
17103 if (register_operand (x, VOIDmode))
17104 return true;
17105
17106 switch (op_code)
17107 {
17108 case LTU:
17109 case LEU:
17110 case GEU:
17111 case GTU:
17112 return aarch64_sve_cmp_immediate_p (x, false);
17113 case LT:
17114 case LE:
17115 case GE:
17116 case GT:
17117 case NE:
17118 case EQ:
17119 return aarch64_sve_cmp_immediate_p (x, true);
17120 default:
17121 gcc_unreachable ();
17122 }
17123 }
17124
17125 /* Use predicated SVE instructions to implement the equivalent of:
17126
17127 (set TARGET OP)
17128
17129 given that PTRUE is an all-true predicate of the appropriate mode. */
17130
17131 static void
17132 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
17133 {
17134 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17135 gen_rtvec (2, ptrue, op),
17136 UNSPEC_MERGE_PTRUE);
17137 rtx_insn *insn = emit_set_insn (target, unspec);
17138 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17139 }
17140
17141 /* Likewise, but also clobber the condition codes. */
17142
17143 static void
17144 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
17145 {
17146 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17147 gen_rtvec (2, ptrue, op),
17148 UNSPEC_MERGE_PTRUE);
17149 rtx_insn *insn = emit_insn (gen_set_clobber_cc_nzc (target, unspec));
17150 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17151 }
17152
17153 /* Return the UNSPEC_COND_* code for comparison CODE. */
17154
17155 static unsigned int
17156 aarch64_unspec_cond_code (rtx_code code)
17157 {
17158 switch (code)
17159 {
17160 case NE:
17161 return UNSPEC_COND_FCMNE;
17162 case EQ:
17163 return UNSPEC_COND_FCMEQ;
17164 case LT:
17165 return UNSPEC_COND_FCMLT;
17166 case GT:
17167 return UNSPEC_COND_FCMGT;
17168 case LE:
17169 return UNSPEC_COND_FCMLE;
17170 case GE:
17171 return UNSPEC_COND_FCMGE;
17172 default:
17173 gcc_unreachable ();
17174 }
17175 }
17176
17177 /* Emit:
17178
17179 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17180
17181 where <X> is the operation associated with comparison CODE. This form
17182 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17183 semantics, such as when PRED might not be all-true and when comparing
17184 inactive lanes could have side effects. */
17185
17186 static void
17187 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
17188 rtx pred, rtx op0, rtx op1)
17189 {
17190 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17191 gen_rtvec (3, pred, op0, op1),
17192 aarch64_unspec_cond_code (code));
17193 emit_set_insn (target, unspec);
17194 }
17195
17196 /* Expand an SVE integer comparison using the SVE equivalent of:
17197
17198 (set TARGET (CODE OP0 OP1)). */
17199
17200 void
17201 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17202 {
17203 machine_mode pred_mode = GET_MODE (target);
17204 machine_mode data_mode = GET_MODE (op0);
17205
17206 if (!aarch64_sve_cmp_operand_p (code, op1))
17207 op1 = force_reg (data_mode, op1);
17208
17209 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17210 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17211 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
17212 }
17213
17214 /* Emit the SVE equivalent of:
17215
17216 (set TMP1 (CODE1 OP0 OP1))
17217 (set TMP2 (CODE2 OP0 OP1))
17218 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17219
17220 PTRUE is an all-true predicate with the same mode as TARGET. */
17221
17222 static void
17223 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
17224 rtx ptrue, rtx op0, rtx op1)
17225 {
17226 machine_mode pred_mode = GET_MODE (ptrue);
17227 rtx tmp1 = gen_reg_rtx (pred_mode);
17228 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
17229 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
17230 rtx tmp2 = gen_reg_rtx (pred_mode);
17231 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
17232 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
17233 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17234 }
17235
17236 /* Emit the SVE equivalent of:
17237
17238 (set TMP (CODE OP0 OP1))
17239 (set TARGET (not TMP))
17240
17241 PTRUE is an all-true predicate with the same mode as TARGET. */
17242
17243 static void
17244 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
17245 rtx op0, rtx op1)
17246 {
17247 machine_mode pred_mode = GET_MODE (ptrue);
17248 rtx tmp = gen_reg_rtx (pred_mode);
17249 aarch64_emit_sve_ptrue_op (tmp, ptrue,
17250 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
17251 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17252 }
17253
17254 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17255
17256 (set TARGET (CODE OP0 OP1))
17257
17258 If CAN_INVERT_P is true, the caller can also handle inverted results;
17259 return true if the result is in fact inverted. */
17260
17261 bool
17262 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
17263 rtx op0, rtx op1, bool can_invert_p)
17264 {
17265 machine_mode pred_mode = GET_MODE (target);
17266 machine_mode data_mode = GET_MODE (op0);
17267
17268 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17269 switch (code)
17270 {
17271 case UNORDERED:
17272 /* UNORDERED has no immediate form. */
17273 op1 = force_reg (data_mode, op1);
17274 /* fall through */
17275 case LT:
17276 case LE:
17277 case GT:
17278 case GE:
17279 case EQ:
17280 case NE:
17281 {
17282 /* There is native support for the comparison. */
17283 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17284 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17285 return false;
17286 }
17287
17288 case LTGT:
17289 /* This is a trapping operation (LT or GT). */
17290 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
17291 return false;
17292
17293 case UNEQ:
17294 if (!flag_trapping_math)
17295 {
17296 /* This would trap for signaling NaNs. */
17297 op1 = force_reg (data_mode, op1);
17298 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
17299 return false;
17300 }
17301 /* fall through */
17302 case UNLT:
17303 case UNLE:
17304 case UNGT:
17305 case UNGE:
17306 if (flag_trapping_math)
17307 {
17308 /* Work out which elements are ordered. */
17309 rtx ordered = gen_reg_rtx (pred_mode);
17310 op1 = force_reg (data_mode, op1);
17311 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
17312
17313 /* Test the opposite condition for the ordered elements,
17314 then invert the result. */
17315 if (code == UNEQ)
17316 code = NE;
17317 else
17318 code = reverse_condition_maybe_unordered (code);
17319 if (can_invert_p)
17320 {
17321 aarch64_emit_sve_predicated_cond (target, code,
17322 ordered, op0, op1);
17323 return true;
17324 }
17325 rtx tmp = gen_reg_rtx (pred_mode);
17326 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
17327 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17328 return false;
17329 }
17330 break;
17331
17332 case ORDERED:
17333 /* ORDERED has no immediate form. */
17334 op1 = force_reg (data_mode, op1);
17335 break;
17336
17337 default:
17338 gcc_unreachable ();
17339 }
17340
17341 /* There is native support for the inverse comparison. */
17342 code = reverse_condition_maybe_unordered (code);
17343 if (can_invert_p)
17344 {
17345 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17346 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17347 return true;
17348 }
17349 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
17350 return false;
17351 }
17352
17353 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
17354 of the data being selected and CMP_MODE is the mode of the values being
17355 compared. */
17356
17357 void
17358 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
17359 rtx *ops)
17360 {
17361 machine_mode pred_mode
17362 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
17363 GET_MODE_SIZE (cmp_mode)).require ();
17364 rtx pred = gen_reg_rtx (pred_mode);
17365 if (FLOAT_MODE_P (cmp_mode))
17366 {
17367 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
17368 ops[4], ops[5], true))
17369 std::swap (ops[1], ops[2]);
17370 }
17371 else
17372 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
17373
17374 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
17375 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
17376 }
17377
17378 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
17379 true. However, due to issues with register allocation, it is preferable
17380 to avoid tying integer scalar and FP scalar modes. Executing integer
17381 operations in general registers is better than treating them as scalar
17382 vector operations. This reduces latency and avoids redundant int<->FP
17383 moves. So tie modes if they are either the same class, or vector modes
17384 with other vector modes, vector structs or any scalar mode. */
17385
17386 static bool
17387 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
17388 {
17389 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
17390 return true;
17391
17392 /* We specifically want to allow elements of "structure" modes to
17393 be tieable to the structure. This more general condition allows
17394 other rarer situations too. The reason we don't extend this to
17395 predicate modes is that there are no predicate structure modes
17396 nor any specific instructions for extracting part of a predicate
17397 register. */
17398 if (aarch64_vector_data_mode_p (mode1)
17399 && aarch64_vector_data_mode_p (mode2))
17400 return true;
17401
17402 /* Also allow any scalar modes with vectors. */
17403 if (aarch64_vector_mode_supported_p (mode1)
17404 || aarch64_vector_mode_supported_p (mode2))
17405 return true;
17406
17407 return false;
17408 }
17409
17410 /* Return a new RTX holding the result of moving POINTER forward by
17411 AMOUNT bytes. */
17412
17413 static rtx
17414 aarch64_move_pointer (rtx pointer, poly_int64 amount)
17415 {
17416 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
17417
17418 return adjust_automodify_address (pointer, GET_MODE (pointer),
17419 next, amount);
17420 }
17421
17422 /* Return a new RTX holding the result of moving POINTER forward by the
17423 size of the mode it points to. */
17424
17425 static rtx
17426 aarch64_progress_pointer (rtx pointer)
17427 {
17428 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
17429 }
17430
17431 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
17432 the size of MODE in bytes. */
17433
17434 static void
17435 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
17436 machine_mode mode)
17437 {
17438 rtx reg = gen_reg_rtx (mode);
17439
17440 /* "Cast" the pointers to the correct mode. */
17441 *src = adjust_address (*src, mode, 0);
17442 *dst = adjust_address (*dst, mode, 0);
17443 /* Emit the memcpy. */
17444 emit_move_insn (reg, *src);
17445 emit_move_insn (*dst, reg);
17446 /* Move the pointers forward. */
17447 *src = aarch64_progress_pointer (*src);
17448 *dst = aarch64_progress_pointer (*dst);
17449 }
17450
17451 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
17452 we succeed, otherwise return false. */
17453
17454 bool
17455 aarch64_expand_cpymem (rtx *operands)
17456 {
17457 int n, mode_bits;
17458 rtx dst = operands[0];
17459 rtx src = operands[1];
17460 rtx base;
17461 machine_mode cur_mode = BLKmode, next_mode;
17462 bool speed_p = !optimize_function_for_size_p (cfun);
17463
17464 /* When optimizing for size, give a better estimate of the length of a
17465 memcpy call, but use the default otherwise. Moves larger than 8 bytes
17466 will always require an even number of instructions. Each operation
17467 requires both a load and a store, so divide the maximum count by 2.
17468 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
17469
17470 /* We can't do anything smart if the amount to copy is not constant. */
17471 if (!CONST_INT_P (operands[2]))
17472 return false;
17473
17474 n = INTVAL (operands[2]);
17475
17476 /* Try to keep the number of instructions low. For all cases we will do at
17477 most two moves for the residual amount, since we'll always overlap the
17478 remainder. */
17479 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
17480 return false;
17481
17482 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
17483 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
17484
17485 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
17486 src = adjust_automodify_address (src, VOIDmode, base, 0);
17487
17488 /* Convert n to bits to make the rest of the code simpler. */
17489 n = n * BITS_PER_UNIT;
17490
17491 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
17492 larger than TImode, but we should not use them for loads/stores here. */
17493 const int copy_limit = GET_MODE_BITSIZE (TImode);
17494
17495 while (n > 0)
17496 {
17497 /* Find the largest mode in which to do the copy without over-reading
17498 or over-writing. */
17499 opt_scalar_int_mode mode_iter;
17500 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
17501 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
17502 cur_mode = mode_iter.require ();
17503
17504 gcc_assert (cur_mode != BLKmode);
17505
17506 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
17507 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
17508
17509 n -= mode_bits;
17510
17511 /* Do certain trailing copies as overlapping if it's going to be
17512 cheaper, i.e. fewer instructions. For instance, for a 15-byte
17513 copy it's more efficient to do two overlapping 8-byte copies than
17514 8 + 6 + 1. */
17515 if (n > 0 && n <= 8 * BITS_PER_UNIT)
17516 {
17517 next_mode = smallest_mode_for_size (n, MODE_INT);
17518 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17519 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17520 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17521 n = n_bits;
17522 }
17523 }
17524
17525 return true;
17526 }
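/* To illustrate the overlapping tail handling: for a 15-byte copy the loop
   first emits an 8-byte (DImode) copy of bytes 0-7, then moves both pointers
   back by one byte so that a second 8-byte copy covers bytes 7-14, rather
   than emitting separate 4-, 2- and 1-byte copies for the tail.  */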
17527
17528 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17529 SImode stores. Handle the case when the constant has identical
17530 bottom and top halves. This is beneficial when the two stores can be
17531 merged into an STP and we avoid synthesising potentially expensive
17532 immediates twice. Return true if such a split is possible. */
17533
17534 bool
17535 aarch64_split_dimode_const_store (rtx dst, rtx src)
17536 {
17537 rtx lo = gen_lowpart (SImode, src);
17538 rtx hi = gen_highpart_mode (SImode, DImode, src);
17539
17540 bool size_p = optimize_function_for_size_p (cfun);
17541
17542 if (!rtx_equal_p (lo, hi))
17543 return false;
17544
17545 unsigned int orig_cost
17546 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17547 unsigned int lo_cost
17548 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17549
17550 /* We want to transform:
17551 MOV x1, 49370
17552 MOVK x1, 0x140, lsl 16
17553 MOVK x1, 0xc0da, lsl 32
17554 MOVK x1, 0x140, lsl 48
17555 STR x1, [x0]
17556 into:
17557 MOV w1, 49370
17558 MOVK w1, 0x140, lsl 16
17559 STP w1, w1, [x0]
17560 So we want to perform this only when we save two instructions
17561 or more. When optimizing for size, however, accept any code size
17562 savings we can. */
17563 if (size_p && orig_cost <= lo_cost)
17564 return false;
17565
17566 if (!size_p
17567 && (orig_cost <= lo_cost + 1))
17568 return false;
17569
17570 rtx mem_lo = adjust_address (dst, SImode, 0);
17571 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17572 return false;
17573
17574 rtx tmp_reg = gen_reg_rtx (SImode);
17575 aarch64_expand_mov_immediate (tmp_reg, lo);
17576 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17577 /* Don't emit an explicit store pair as this may not always be profitable.
17578 Let the sched-fusion logic decide whether to merge them. */
17579 emit_move_insn (mem_lo, tmp_reg);
17580 emit_move_insn (mem_hi, tmp_reg);
17581
17582 return true;
17583 }
17584
17585 /* Generate RTL for a conditional branch with rtx comparison CODE in
17586 mode CC_MODE. The destination of the unlikely conditional branch
17587 is LABEL_REF. */
17588
17589 void
17590 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17591 rtx label_ref)
17592 {
17593 rtx x;
17594 x = gen_rtx_fmt_ee (code, VOIDmode,
17595 gen_rtx_REG (cc_mode, CC_REGNUM),
17596 const0_rtx);
17597
17598 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17599 gen_rtx_LABEL_REF (VOIDmode, label_ref),
17600 pc_rtx);
17601 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17602 }
17603
17604 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17605
17606 OP1 represents the TImode destination operand 1
17607 OP2 represents the TImode destination operand 2
17608 LOW_DEST represents the low half (DImode) of TImode operand 0
17609 LOW_IN1 represents the low half (DImode) of TImode operand 1
17610 LOW_IN2 represents the low half (DImode) of TImode operand 2
17611 HIGH_DEST represents the high half (DImode) of TImode operand 0
17612 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17613 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17614
17615 void
17616 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17617 rtx *low_in1, rtx *low_in2,
17618 rtx *high_dest, rtx *high_in1,
17619 rtx *high_in2)
17620 {
17621 *low_dest = gen_reg_rtx (DImode);
17622 *low_in1 = gen_lowpart (DImode, op1);
17623 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17624 subreg_lowpart_offset (DImode, TImode));
17625 *high_dest = gen_reg_rtx (DImode);
17626 *high_in1 = gen_highpart (DImode, op1);
17627 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17628 subreg_highpart_offset (DImode, TImode));
17629 }
17630
17631 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17632
17633 This function differs from 'aarch64_addti_scratch_regs' in that
17634 OP1 can be an immediate constant (zero). We must call
17635 subreg_highpart_offset with DImode and TImode arguments, otherwise
17636 VOIDmode will be used for the const_int, which generates an internal
17637 error from subreg_size_highpart_offset, which does not expect a size of zero.
17638
17639 OP1 represents the TImode destination operand 1
17640 OP2 represents the TImode destination operand 2
17641 LOW_DEST represents the low half (DImode) of TImode operand 0
17642 LOW_IN1 represents the low half (DImode) of TImode operand 1
17643 LOW_IN2 represents the low half (DImode) of TImode operand 2
17644 HIGH_DEST represents the high half (DImode) of TImode operand 0
17645 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17646 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17647
17648
17649 void
17650 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17651 rtx *low_in1, rtx *low_in2,
17652 rtx *high_dest, rtx *high_in1,
17653 rtx *high_in2)
17654 {
17655 *low_dest = gen_reg_rtx (DImode);
17656 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17657 subreg_lowpart_offset (DImode, TImode));
17658
17659 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17660 subreg_lowpart_offset (DImode, TImode));
17661 *high_dest = gen_reg_rtx (DImode);
17662
17663 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17664 subreg_highpart_offset (DImode, TImode));
17665 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17666 subreg_highpart_offset (DImode, TImode));
17667 }
17668
17669 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17670
17671 OP0 represents the TImode destination operand 0
17672 LOW_DEST represents the low half (DImode) of TImode operand 0
17673 LOW_IN1 represents the low half (DImode) of TImode operand 1
17674 LOW_IN2 represents the low half (DImode) of TImode operand 2
17675 HIGH_DEST represents the high half (DImode) of TImode operand 0
17676 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17677 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17678 UNSIGNED_P is true if the operation is being performed on unsigned
17679 values. */
17680 void
17681 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17682 rtx low_in2, rtx high_dest, rtx high_in1,
17683 rtx high_in2, bool unsigned_p)
17684 {
17685 if (low_in2 == const0_rtx)
17686 {
17687 low_dest = low_in1;
17688 high_in2 = force_reg (DImode, high_in2);
17689 if (unsigned_p)
17690 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17691 else
17692 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17693 }
17694 else
17695 {
17696 if (CONST_INT_P (low_in2))
17697 {
17698 high_in2 = force_reg (DImode, high_in2);
17699 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17700 GEN_INT (-INTVAL (low_in2))));
17701 }
17702 else
17703 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17704
17705 if (unsigned_p)
17706 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17707 else
17708 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17709 }
17710
17711 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17712 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17713
17714 }
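/* As an illustrative sketch (not taken from the surrounding sources), the
   non-constant path above corresponds to an instruction sequence along the
   lines of:

     subs  x_lo, x_lo1, x_lo2   // low halves, setting the carry/borrow flag
     sbcs  x_hi, x_hi1, x_hi2   // high halves, consuming the borrow

   with the overflow condition then tested via the V flag (signed) or the
   C flag (unsigned).  */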
17715
17716 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17717
17718 static unsigned HOST_WIDE_INT
17719 aarch64_asan_shadow_offset (void)
17720 {
17721 if (TARGET_ILP32)
17722 return (HOST_WIDE_INT_1 << 29);
17723 else
17724 return (HOST_WIDE_INT_1 << 36);
17725 }
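/* For reference (a sketch using ASan's usual shadow mapping, not anything
   defined in this file): a memory address is translated to its shadow
   address as

     shadow = (addr >> 3) + aarch64_asan_shadow_offset ()

   so the offsets above place the shadow region at 1 << 29 for ILP32 and
   1 << 36 for LP64.  */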
17726
17727 static rtx
17728 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17729 int code, tree treeop0, tree treeop1)
17730 {
17731 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17732 rtx op0, op1;
17733 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17734 insn_code icode;
17735 struct expand_operand ops[4];
17736
17737 start_sequence ();
17738 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17739
17740 op_mode = GET_MODE (op0);
17741 if (op_mode == VOIDmode)
17742 op_mode = GET_MODE (op1);
17743
17744 switch (op_mode)
17745 {
17746 case E_QImode:
17747 case E_HImode:
17748 case E_SImode:
17749 cmp_mode = SImode;
17750 icode = CODE_FOR_cmpsi;
17751 break;
17752
17753 case E_DImode:
17754 cmp_mode = DImode;
17755 icode = CODE_FOR_cmpdi;
17756 break;
17757
17758 case E_SFmode:
17759 cmp_mode = SFmode;
17760 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17761 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17762 break;
17763
17764 case E_DFmode:
17765 cmp_mode = DFmode;
17766 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17767 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17768 break;
17769
17770 default:
17771 end_sequence ();
17772 return NULL_RTX;
17773 }
17774
17775 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17776 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17777 if (!op0 || !op1)
17778 {
17779 end_sequence ();
17780 return NULL_RTX;
17781 }
17782 *prep_seq = get_insns ();
17783 end_sequence ();
17784
17785 create_fixed_operand (&ops[0], op0);
17786 create_fixed_operand (&ops[1], op1);
17787
17788 start_sequence ();
17789 if (!maybe_expand_insn (icode, 2, ops))
17790 {
17791 end_sequence ();
17792 return NULL_RTX;
17793 }
17794 *gen_seq = get_insns ();
17795 end_sequence ();
17796
17797 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17798 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17799 }
17800
17801 static rtx
17802 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17803 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17804 {
17805 rtx op0, op1, target;
17806 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17807 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17808 insn_code icode;
17809 struct expand_operand ops[6];
17810 int aarch64_cond;
17811
17812 push_to_sequence (*prep_seq);
17813 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17814
17815 op_mode = GET_MODE (op0);
17816 if (op_mode == VOIDmode)
17817 op_mode = GET_MODE (op1);
17818
17819 switch (op_mode)
17820 {
17821 case E_QImode:
17822 case E_HImode:
17823 case E_SImode:
17824 cmp_mode = SImode;
17825 icode = CODE_FOR_ccmpsi;
17826 break;
17827
17828 case E_DImode:
17829 cmp_mode = DImode;
17830 icode = CODE_FOR_ccmpdi;
17831 break;
17832
17833 case E_SFmode:
17834 cmp_mode = SFmode;
17835 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17836 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17837 break;
17838
17839 case E_DFmode:
17840 cmp_mode = DFmode;
17841 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17842 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17843 break;
17844
17845 default:
17846 end_sequence ();
17847 return NULL_RTX;
17848 }
17849
17850 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17851 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17852 if (!op0 || !op1)
17853 {
17854 end_sequence ();
17855 return NULL_RTX;
17856 }
17857 *prep_seq = get_insns ();
17858 end_sequence ();
17859
17860 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17861 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17862
17863 if (bit_code != AND)
17864 {
17865 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17866 GET_MODE (XEXP (prev, 0))),
17867 VOIDmode, XEXP (prev, 0), const0_rtx);
17868 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17869 }
17870
17871 create_fixed_operand (&ops[0], XEXP (prev, 0));
17872 create_fixed_operand (&ops[1], target);
17873 create_fixed_operand (&ops[2], op0);
17874 create_fixed_operand (&ops[3], op1);
17875 create_fixed_operand (&ops[4], prev);
17876 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17877
17878 push_to_sequence (*gen_seq);
17879 if (!maybe_expand_insn (icode, 6, ops))
17880 {
17881 end_sequence ();
17882 return NULL_RTX;
17883 }
17884
17885 *gen_seq = get_insns ();
17886 end_sequence ();
17887
17888 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17889 }
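/* Illustrative only: for a source condition such as "a == 0 && b > 5" the
   two hooks above cooperate to emit a compare followed by a conditional
   compare, roughly:

     cmp   w0, #0              // first comparison sets the flags
     ccmp  w1, #5, #4, eq      // if EQ, compare b with 5; else NZCV := 0b0100
     b.gt  .Ltaken             // branch when both conditions hold

   where the #4 immediate (Z set) is one choice that makes the final GT test
   fail whenever the first comparison does.  */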
17890
17891 #undef TARGET_GEN_CCMP_FIRST
17892 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17893
17894 #undef TARGET_GEN_CCMP_NEXT
17895 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17896
17897 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17898 instruction fusion of some sort. */
17899
17900 static bool
17901 aarch64_macro_fusion_p (void)
17902 {
17903 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17904 }
17905
17906
17907 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17908 should be kept together during scheduling. */
17909
17910 static bool
17911 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17912 {
17913 rtx set_dest;
17914 rtx prev_set = single_set (prev);
17915 rtx curr_set = single_set (curr);
17916 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
17917 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17918
17919 if (!aarch64_macro_fusion_p ())
17920 return false;
17921
17922 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17923 {
17924 /* We are trying to match:
17925 prev (mov) == (set (reg r0) (const_int imm16))
17926 curr (movk) == (set (zero_extract (reg r0)
17927 (const_int 16)
17928 (const_int 16))
17929 (const_int imm16_1)) */
17930
17931 set_dest = SET_DEST (curr_set);
17932
17933 if (GET_CODE (set_dest) == ZERO_EXTRACT
17934 && CONST_INT_P (SET_SRC (curr_set))
17935 && CONST_INT_P (SET_SRC (prev_set))
17936 && CONST_INT_P (XEXP (set_dest, 2))
17937 && INTVAL (XEXP (set_dest, 2)) == 16
17938 && REG_P (XEXP (set_dest, 0))
17939 && REG_P (SET_DEST (prev_set))
17940 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17941 {
17942 return true;
17943 }
17944 }
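  /* As a concrete (illustrative) example, the MOV/MOVK pair matched above
     corresponds to assembly such as:

       mov  x0, #0x1234
       movk x0, #0x5678, lsl #16  */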
17945
17946 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17947 {
17948
17949 /* We're trying to match:
17950 prev (adrp) == (set (reg r1)
17951 (high (symbol_ref ("SYM"))))
17952 curr (add) == (set (reg r0)
17953 (lo_sum (reg r1)
17954 (symbol_ref ("SYM"))))
17955 Note that r0 need not necessarily be the same as r1, especially
17956 during pre-regalloc scheduling. */
17957
17958 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17959 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17960 {
17961 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17962 && REG_P (XEXP (SET_SRC (curr_set), 0))
17963 && REGNO (XEXP (SET_SRC (curr_set), 0))
17964 == REGNO (SET_DEST (prev_set))
17965 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17966 XEXP (SET_SRC (curr_set), 1)))
17967 return true;
17968 }
17969 }
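  /* For illustration, the ADRP/ADD pair matched above is the usual
     small-code-model address materialisation:

       adrp x1, sym
       add  x0, x1, :lo12:sym  */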
17970
17971 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17972 {
17973
17974 /* We're trying to match:
17975 prev (movk) == (set (zero_extract (reg r0)
17976 (const_int 16)
17977 (const_int 32))
17978 (const_int imm16_1))
17979 curr (movk) == (set (zero_extract (reg r0)
17980 (const_int 16)
17981 (const_int 48))
17982 (const_int imm16_2)) */
17983
17984 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17985 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17986 && REG_P (XEXP (SET_DEST (prev_set), 0))
17987 && REG_P (XEXP (SET_DEST (curr_set), 0))
17988 && REGNO (XEXP (SET_DEST (prev_set), 0))
17989 == REGNO (XEXP (SET_DEST (curr_set), 0))
17990 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17991 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17992 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17993 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17994 && CONST_INT_P (SET_SRC (prev_set))
17995 && CONST_INT_P (SET_SRC (curr_set)))
17996 return true;
17997
17998 }
17999 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18000 {
18001 /* We're trying to match:
18002 prev (adrp) == (set (reg r0)
18003 (high (symbol_ref ("SYM"))))
18004 curr (ldr) == (set (reg r1)
18005 (mem (lo_sum (reg r0)
18006 (symbol_ref ("SYM")))))
18007 or
18008 curr (ldr) == (set (reg r1)
18009 (zero_extend (mem
18010 (lo_sum (reg r0)
18011 (symbol_ref ("SYM")))))) */
18012 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18013 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18014 {
18015 rtx curr_src = SET_SRC (curr_set);
18016
18017 if (GET_CODE (curr_src) == ZERO_EXTEND)
18018 curr_src = XEXP (curr_src, 0);
18019
18020 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18021 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18022 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18023 == REGNO (SET_DEST (prev_set))
18024 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18025 XEXP (SET_SRC (prev_set), 0)))
18026 return true;
18027 }
18028 }
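  /* Illustrative example of the ADRP/LDR pair matched above:

       adrp x0, sym
       ldr  x1, [x0, #:lo12:sym]  */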
18029
18030 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18031 && any_condjump_p (curr))
18032 {
18033 unsigned int condreg1, condreg2;
18034 rtx cc_reg_1;
18035 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18036 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18037
18038 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18039 && prev
18040 && modified_in_p (cc_reg_1, prev))
18041 {
18042 enum attr_type prev_type = get_attr_type (prev);
18043
18044 /* FIXME: this misses some instructions which are considered simple
18045 arithmetic for ThunderX. Simple shifts are missed here. */
18046 if (prev_type == TYPE_ALUS_SREG
18047 || prev_type == TYPE_ALUS_IMM
18048 || prev_type == TYPE_LOGICS_REG
18049 || prev_type == TYPE_LOGICS_IMM)
18050 return true;
18051 }
18052 }
18053
18054 if (prev_set
18055 && curr_set
18056 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18057 && any_condjump_p (curr))
18058 {
18059 /* We're trying to match:
18060 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18061 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18062 (const_int 0))
18063 (label_ref ("SYM"))
18064 (pc)) */
18065 if (SET_DEST (curr_set) == (pc_rtx)
18066 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18067 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18068 && REG_P (SET_DEST (prev_set))
18069 && REGNO (SET_DEST (prev_set))
18070 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18071 {
18072 /* Fuse ALU operations followed by conditional branch instruction. */
18073 switch (get_attr_type (prev))
18074 {
18075 case TYPE_ALU_IMM:
18076 case TYPE_ALU_SREG:
18077 case TYPE_ADC_REG:
18078 case TYPE_ADC_IMM:
18079 case TYPE_ADCS_REG:
18080 case TYPE_ADCS_IMM:
18081 case TYPE_LOGIC_REG:
18082 case TYPE_LOGIC_IMM:
18083 case TYPE_CSEL:
18084 case TYPE_ADR:
18085 case TYPE_MOV_IMM:
18086 case TYPE_SHIFT_REG:
18087 case TYPE_SHIFT_IMM:
18088 case TYPE_BFM:
18089 case TYPE_RBIT:
18090 case TYPE_REV:
18091 case TYPE_EXTEND:
18092 return true;
18093
18094 default:;
18095 }
18096 }
18097 }
18098
18099 return false;
18100 }
18101
18102 /* Return true iff the instruction fusion described by OP is enabled. */
18103
18104 bool
18105 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18106 {
18107 return (aarch64_tune_params.fusible_ops & op) != 0;
18108 }
18109
18110 /* If the address of MEM is in the form of [base+offset], extract the two
18111 parts into BASE and OFFSET and return true; otherwise clear BASE and
18112 OFFSET and return false. */
18113
18114 bool
18115 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18116 {
18117 rtx addr;
18118
18119 gcc_assert (MEM_P (mem));
18120
18121 addr = XEXP (mem, 0);
18122
18123 if (REG_P (addr))
18124 {
18125 *base = addr;
18126 *offset = const0_rtx;
18127 return true;
18128 }
18129
18130 if (GET_CODE (addr) == PLUS
18131 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18132 {
18133 *base = XEXP (addr, 0);
18134 *offset = XEXP (addr, 1);
18135 return true;
18136 }
18137
18138 *base = NULL_RTX;
18139 *offset = NULL_RTX;
18140
18141 return false;
18142 }
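/* For illustration (not from the original comments): given
   (mem:DI (plus:DI (reg:DI x1) (const_int 16))) the function above sets
   BASE to (reg:DI x1) and OFFSET to (const_int 16), while a bare
   (mem:DI (reg:DI x1)) yields the same base with a zero offset.  */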
18143
18144 /* Types for scheduling fusion. */
18145 enum sched_fusion_type
18146 {
18147 SCHED_FUSION_NONE = 0,
18148 SCHED_FUSION_LD_SIGN_EXTEND,
18149 SCHED_FUSION_LD_ZERO_EXTEND,
18150 SCHED_FUSION_LD,
18151 SCHED_FUSION_ST,
18152 SCHED_FUSION_NUM
18153 };
18154
18155 /* If INSN is a load or store whose address is in the form of [base+offset],
18156 extract the two parts into BASE and OFFSET. Return the scheduling
18157 fusion type of this INSN. */
18158
18159 static enum sched_fusion_type
18160 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18161 {
18162 rtx x, dest, src;
18163 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18164
18165 gcc_assert (INSN_P (insn));
18166 x = PATTERN (insn);
18167 if (GET_CODE (x) != SET)
18168 return SCHED_FUSION_NONE;
18169
18170 src = SET_SRC (x);
18171 dest = SET_DEST (x);
18172
18173 machine_mode dest_mode = GET_MODE (dest);
18174
18175 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18176 return SCHED_FUSION_NONE;
18177
18178 if (GET_CODE (src) == SIGN_EXTEND)
18179 {
18180 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18181 src = XEXP (src, 0);
18182 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18183 return SCHED_FUSION_NONE;
18184 }
18185 else if (GET_CODE (src) == ZERO_EXTEND)
18186 {
18187 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18188 src = XEXP (src, 0);
18189 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18190 return SCHED_FUSION_NONE;
18191 }
18192
18193 if (GET_CODE (src) == MEM && REG_P (dest))
18194 extract_base_offset_in_addr (src, base, offset);
18195 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18196 {
18197 fusion = SCHED_FUSION_ST;
18198 extract_base_offset_in_addr (dest, base, offset);
18199 }
18200 else
18201 return SCHED_FUSION_NONE;
18202
18203 if (*base == NULL_RTX || *offset == NULL_RTX)
18204 fusion = SCHED_FUSION_NONE;
18205
18206 return fusion;
18207 }
18208
18209 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18210
18211 Currently we only support fusing ldr or str instructions, so FUSION_PRI
18212 and PRI are only calculated for these instructions. For other instructions,
18213 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18214 types of instruction fusion can be added by returning different priorities.
18215
18216 It's important that irrelevant instructions get the largest FUSION_PRI. */
18217
18218 static void
18219 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18220 int *fusion_pri, int *pri)
18221 {
18222 int tmp, off_val;
18223 rtx base, offset;
18224 enum sched_fusion_type fusion;
18225
18226 gcc_assert (INSN_P (insn));
18227
18228 tmp = max_pri - 1;
18229 fusion = fusion_load_store (insn, &base, &offset);
18230 if (fusion == SCHED_FUSION_NONE)
18231 {
18232 *pri = tmp;
18233 *fusion_pri = tmp;
18234 return;
18235 }
18236
18237 /* Set FUSION_PRI according to fusion type and base register. */
18238 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18239
18240 /* Calculate PRI. */
18241 tmp /= 2;
18242
18243 /* INSN with smaller offset goes first. */
18244 off_val = (int)(INTVAL (offset));
18245 if (off_val >= 0)
18246 tmp -= (off_val & 0xfffff);
18247 else
18248 tmp += ((- off_val) & 0xfffff);
18249
18250 *pri = tmp;
18251 return;
18252 }
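/* A worked example (with a hypothetical MAX_PRI of 10000): two SImode loads
   from [x1, #4] and [x1, #8] both map to SCHED_FUSION_LD with the same base
   register, so they receive identical FUSION_PRI values; their PRI values
   are 4999 - 4 and 4999 - 8 respectively, so the load with the smaller
   offset is preferred first, as intended.  */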
18253
18254 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18255 Adjust priority of sha1h instructions so they are scheduled before
18256 other SHA1 instructions. */
18257
18258 static int
18259 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
18260 {
18261 rtx x = PATTERN (insn);
18262
18263 if (GET_CODE (x) == SET)
18264 {
18265 x = SET_SRC (x);
18266
18267 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
18268 return priority + 10;
18269 }
18270
18271 return priority;
18272 }
18273
18274 /* Given OPERANDS of consecutive load/store, check if we can merge
18275 them into ldp/stp. LOAD is true if they are load instructions.
18276 MODE is the mode of memory operands. */
18277
18278 bool
18279 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
18280 machine_mode mode)
18281 {
18282 HOST_WIDE_INT offval_1, offval_2, msize;
18283 enum reg_class rclass_1, rclass_2;
18284 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
18285
18286 if (load)
18287 {
18288 mem_1 = operands[1];
18289 mem_2 = operands[3];
18290 reg_1 = operands[0];
18291 reg_2 = operands[2];
18292 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
18293 if (REGNO (reg_1) == REGNO (reg_2))
18294 return false;
18295 }
18296 else
18297 {
18298 mem_1 = operands[0];
18299 mem_2 = operands[2];
18300 reg_1 = operands[1];
18301 reg_2 = operands[3];
18302 }
18303
18304 /* The mems cannot be volatile. */
18305 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
18306 return false;
18307
18308 /* If we have SImode and slow unaligned ldp,
18309 check that the alignment is at least 8 bytes. */
18310 if (mode == SImode
18311 && (aarch64_tune_params.extra_tuning_flags
18312 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18313 && !optimize_size
18314 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
18315 return false;
18316
18317 /* Check if the addresses are in the form of [base+offset]. */
18318 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18319 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
18320 return false;
18321 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18322 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
18323 return false;
18324
18325 /* Check if the bases are the same. */
18326 if (!rtx_equal_p (base_1, base_2))
18327 return false;
18328
18329 /* The operands must be of the same size. */
18330 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
18331 GET_MODE_SIZE (GET_MODE (mem_2))));
18332
18333 offval_1 = INTVAL (offset_1);
18334 offval_2 = INTVAL (offset_2);
18335 /* We should only be trying this for fixed-sized modes. There is no
18336 SVE LDP/STP instruction. */
18337 msize = GET_MODE_SIZE (mode).to_constant ();
18338 /* Check if the offsets are consecutive. */
18339 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
18340 return false;
18341
18342 /* Check if the addresses are clobbered by load. */
18343 if (load)
18344 {
18345 if (reg_mentioned_p (reg_1, mem_1))
18346 return false;
18347
18348 /* In increasing order, the last load can clobber the address. */
18349 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
18350 return false;
18351 }
18352
18353 /* One of the memory accesses must be a mempair operand.
18354 If it is not the first one, they need to be swapped by the
18355 peephole. */
18356 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
18357 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
18358 return false;
18359
18360 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
18361 rclass_1 = FP_REGS;
18362 else
18363 rclass_1 = GENERAL_REGS;
18364
18365 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
18366 rclass_2 = FP_REGS;
18367 else
18368 rclass_2 = GENERAL_REGS;
18369
18370 /* Check if the registers are of the same class. */
18371 if (rclass_1 != rclass_2)
18372 return false;
18373
18374 return true;
18375 }
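/* For illustration, a pair that passes the checks above, such as

     ldr  x0, [x2]
     ldr  x1, [x2, #8]

   can be rewritten by the ldp/stp peepholes as

     ldp  x0, x1, [x2]  */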
18376
18377 /* Given OPERANDS of consecutive load/store that can be merged,
18378 swap them if they are not in ascending order. */
18379 void
18380 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
18381 {
18382 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
18383 HOST_WIDE_INT offval_1, offval_2;
18384
18385 if (load)
18386 {
18387 mem_1 = operands[1];
18388 mem_2 = operands[3];
18389 }
18390 else
18391 {
18392 mem_1 = operands[0];
18393 mem_2 = operands[2];
18394 }
18395
18396 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18397 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18398
18399 offval_1 = INTVAL (offset_1);
18400 offval_2 = INTVAL (offset_2);
18401
18402 if (offval_1 > offval_2)
18403 {
18404 /* Irrespective of whether this is a load or a store,
18405 we do the same swap. */
18406 std::swap (operands[0], operands[2]);
18407 std::swap (operands[1], operands[3]);
18408 }
18409 }
18410
18411 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18412 comparison between the two. */
18413 int
18414 aarch64_host_wide_int_compare (const void *x, const void *y)
18415 {
18416 return wi::cmps (* ((const HOST_WIDE_INT *) x),
18417 * ((const HOST_WIDE_INT *) y));
18418 }
18419
18420 /* Taking X and Y to be pairs of RTX, each with one element pointing to a
18421 MEM rtx and the other to a REG rtx, compare the offsets of the two
18422 MEM addresses.
18423
18424 Return:
18425
18426 1 iff offset (X) > offset (Y)
18427 0 iff offset (X) == offset (Y)
18428 -1 iff offset (X) < offset (Y) */
18429 int
18430 aarch64_ldrstr_offset_compare (const void *x, const void *y)
18431 {
18432 const rtx * operands_1 = (const rtx *) x;
18433 const rtx * operands_2 = (const rtx *) y;
18434 rtx mem_1, mem_2, base, offset_1, offset_2;
18435
18436 if (MEM_P (operands_1[0]))
18437 mem_1 = operands_1[0];
18438 else
18439 mem_1 = operands_1[1];
18440
18441 if (MEM_P (operands_2[0]))
18442 mem_2 = operands_2[0];
18443 else
18444 mem_2 = operands_2[1];
18445
18446 /* Extract the offsets. */
18447 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18448 extract_base_offset_in_addr (mem_2, &base, &offset_2);
18449
18450 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
18451
18452 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
18453 }
18454
18455 /* Given OPERANDS of consecutive load/store, check if we can merge
18456 them into ldp/stp by adjusting the offset. LOAD is true if they
18457 are load instructions. MODE is the mode of memory operands.
18458
18459 Given below consecutive stores:
18460
18461 str w1, [xb, 0x100]
18462 str w1, [xb, 0x104]
18463 str w1, [xb, 0x108]
18464 str w1, [xb, 0x10c]
18465
18466 Though the offsets are out of the range supported by stp, we can
18467 still pair them after adjusting the offset, like:
18468
18469 add scratch, xb, 0x100
18470 stp w1, w1, [scratch]
18471 stp w1, w1, [scratch, 0x8]
18472
18473 The peephole patterns detecting this opportunity should guarantee
18474 the scratch register is available. */
18475
18476 bool
18477 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
18478 scalar_mode mode)
18479 {
18480 const int num_insns = 4;
18481 enum reg_class rclass;
18482 HOST_WIDE_INT offvals[num_insns], msize;
18483 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
18484
18485 if (load)
18486 {
18487 for (int i = 0; i < num_insns; i++)
18488 {
18489 reg[i] = operands[2 * i];
18490 mem[i] = operands[2 * i + 1];
18491
18492 gcc_assert (REG_P (reg[i]));
18493 }
18494
18495 /* Do not attempt to merge the loads if the loads clobber each other. */
18496 for (int i = 0; i < 8; i += 2)
18497 for (int j = i + 2; j < 8; j += 2)
18498 if (reg_overlap_mentioned_p (operands[i], operands[j]))
18499 return false;
18500 }
18501 else
18502 for (int i = 0; i < num_insns; i++)
18503 {
18504 mem[i] = operands[2 * i];
18505 reg[i] = operands[2 * i + 1];
18506 }
18507
18508 /* Skip if memory operand is by itself valid for ldp/stp. */
18509 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
18510 return false;
18511
18512 for (int i = 0; i < num_insns; i++)
18513 {
18514 /* The mems cannot be volatile. */
18515 if (MEM_VOLATILE_P (mem[i]))
18516 return false;
18517
18518 /* Check if the addresses are in the form of [base+offset]. */
18519 extract_base_offset_in_addr (mem[i], base + i, offset + i);
18520 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18521 return false;
18522 }
18523
18524 /* Check if the registers are of the same class. */
18525 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18526 ? FP_REGS : GENERAL_REGS;
18527
18528 for (int i = 1; i < num_insns; i++)
18529 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18530 {
18531 if (rclass != FP_REGS)
18532 return false;
18533 }
18534 else
18535 {
18536 if (rclass != GENERAL_REGS)
18537 return false;
18538 }
18539
18540 /* Only the last register in the order in which they occur
18541 may be clobbered by the load. */
18542 if (rclass == GENERAL_REGS && load)
18543 for (int i = 0; i < num_insns - 1; i++)
18544 if (reg_mentioned_p (reg[i], mem[i]))
18545 return false;
18546
18547 /* Check if the bases are the same. */
18548 for (int i = 0; i < num_insns - 1; i++)
18549 if (!rtx_equal_p (base[i], base[i + 1]))
18550 return false;
18551
18552 for (int i = 0; i < num_insns; i++)
18553 offvals[i] = INTVAL (offset[i]);
18554
18555 msize = GET_MODE_SIZE (mode);
18556
18557 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18558 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18559 aarch64_host_wide_int_compare);
18560
18561 if (!(offvals[1] == offvals[0] + msize
18562 && offvals[3] == offvals[2] + msize))
18563 return false;
18564
18565 /* Check that offsets are within range of each other. The ldp/stp
18566 instructions have 7 bit immediate offsets, so use 0x80. */
18567 if (offvals[2] - offvals[0] >= msize * 0x80)
18568 return false;
18569
18570 /* The offsets must be aligned with respect to each other. */
18571 if (offvals[0] % msize != offvals[2] % msize)
18572 return false;
18573
18574 /* If we have SImode and slow unaligned ldp,
18575 check that the alignment is at least 8 bytes. */
18576 if (mode == SImode
18577 && (aarch64_tune_params.extra_tuning_flags
18578 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18579 && !optimize_size
18580 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18581 return false;
18582
18583 return true;
18584 }
18585
18586 /* Given OPERANDS of consecutive load/store, this function pairs them
18587 into LDP/STP after adjusting the offset. It depends on the fact
18588 that the operands can be sorted so the offsets are correct for STP.
18589 MODE is the mode of memory operands. CODE is the rtl operator
18590 which should be applied to all memory operands, it's SIGN_EXTEND,
18591 ZERO_EXTEND or UNKNOWN. */
18592
18593 bool
18594 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18595 scalar_mode mode, RTX_CODE code)
18596 {
18597 rtx base, offset_1, offset_3, t1, t2;
18598 rtx mem_1, mem_2, mem_3, mem_4;
18599 rtx temp_operands[8];
18600 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18601 stp_off_upper_limit, stp_off_lower_limit, msize;
18602
18603 /* We make changes on a copy as we may still bail out. */
18604 for (int i = 0; i < 8; i ++)
18605 temp_operands[i] = operands[i];
18606
18607 /* Sort the operands. */
18608 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18609
18610 /* Copy the memory operands so that if we have to bail for some
18611 reason the original addresses are unchanged. */
18612 if (load)
18613 {
18614 mem_1 = copy_rtx (temp_operands[1]);
18615 mem_2 = copy_rtx (temp_operands[3]);
18616 mem_3 = copy_rtx (temp_operands[5]);
18617 mem_4 = copy_rtx (temp_operands[7]);
18618 }
18619 else
18620 {
18621 mem_1 = copy_rtx (temp_operands[0]);
18622 mem_2 = copy_rtx (temp_operands[2]);
18623 mem_3 = copy_rtx (temp_operands[4]);
18624 mem_4 = copy_rtx (temp_operands[6]);
18625 gcc_assert (code == UNKNOWN);
18626 }
18627
18628 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18629 extract_base_offset_in_addr (mem_3, &base, &offset_3);
18630 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18631 && offset_3 != NULL_RTX);
18632
18633 /* Adjust offset so it can fit in LDP/STP instruction. */
18634 msize = GET_MODE_SIZE (mode);
18635 stp_off_upper_limit = msize * (0x40 - 1);
18636 stp_off_lower_limit = - msize * 0x40;
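  /* For example, with msize == 4 (SImode) this gives an addressable LDP/STP
     range of [-256, 252], i.e. a scaled 7-bit signed immediate.  */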
18637
18638 off_val_1 = INTVAL (offset_1);
18639 off_val_3 = INTVAL (offset_3);
18640
18641 /* The base offset is optimally halfway between the two STP/LDP offsets. */
18642 if (msize <= 4)
18643 base_off = (off_val_1 + off_val_3) / 2;
18644 else
18645 /* However, due to issues with negative LDP/STP offset generation for
18646 larger modes (DF, DI and vector modes), we must not use negative
18647 addresses smaller than 9 signed unadjusted bits can store. This
18648 provides the most range in this case. */
18649 base_off = off_val_1;
18650
18651 /* Adjust the base so that it is aligned with the addresses but still
18652 optimal. */
18653 if (base_off % msize != off_val_1 % msize)
18654 /* Fix the offset, bearing in mind we want to make it bigger not
18655 smaller. */
18656 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18657 else if (msize <= 4)
18658 /* The negative range of LDP/STP is one larger than the positive range. */
18659 base_off += msize;
18660
18661 /* Check if base offset is too big or too small. We can attempt to resolve
18662 this issue by setting it to the maximum value and seeing if the offsets
18663 still fit. */
18664 if (base_off >= 0x1000)
18665 {
18666 base_off = 0x1000 - 1;
18667 /* We must still make sure that the base offset is aligned with respect
18668 to the address. But it may not be made any bigger. */
18669 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18670 }
18671
18672 /* Likewise for the case where the base is too small. */
18673 if (base_off <= -0x1000)
18674 {
18675 base_off = -0x1000 + 1;
18676 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18677 }
18678
18679 /* Offset of the first STP/LDP. */
18680 new_off_1 = off_val_1 - base_off;
18681
18682 /* Offset of the second STP/LDP. */
18683 new_off_3 = off_val_3 - base_off;
18684
18685 /* The offsets must be within the range of the LDP/STP instructions. */
18686 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18687 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18688 return false;
18689
18690 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18691 new_off_1), true);
18692 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18693 new_off_1 + msize), true);
18694 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18695 new_off_3), true);
18696 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18697 new_off_3 + msize), true);
18698
18699 if (!aarch64_mem_pair_operand (mem_1, mode)
18700 || !aarch64_mem_pair_operand (mem_3, mode))
18701 return false;
18702
18703 if (code == ZERO_EXTEND)
18704 {
18705 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18706 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18707 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18708 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18709 }
18710 else if (code == SIGN_EXTEND)
18711 {
18712 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18713 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18714 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18715 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18716 }
18717
18718 if (load)
18719 {
18720 operands[0] = temp_operands[0];
18721 operands[1] = mem_1;
18722 operands[2] = temp_operands[2];
18723 operands[3] = mem_2;
18724 operands[4] = temp_operands[4];
18725 operands[5] = mem_3;
18726 operands[6] = temp_operands[6];
18727 operands[7] = mem_4;
18728 }
18729 else
18730 {
18731 operands[0] = mem_1;
18732 operands[1] = temp_operands[1];
18733 operands[2] = mem_2;
18734 operands[3] = temp_operands[3];
18735 operands[4] = mem_3;
18736 operands[5] = temp_operands[5];
18737 operands[6] = mem_4;
18738 operands[7] = temp_operands[7];
18739 }
18740
18741 /* Emit adjusting instruction. */
18742 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18743 /* Emit ldp/stp instructions. */
18744 t1 = gen_rtx_SET (operands[0], operands[1]);
18745 t2 = gen_rtx_SET (operands[2], operands[3]);
18746 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18747 t1 = gen_rtx_SET (operands[4], operands[5]);
18748 t2 = gen_rtx_SET (operands[6], operands[7]);
18749 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18750 return true;
18751 }
18752
18753 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18754 it isn't worth branching around empty masked ops (including masked
18755 stores). */
18756
18757 static bool
18758 aarch64_empty_mask_is_expensive (unsigned)
18759 {
18760 return false;
18761 }
18762
18763 /* Return true if a pseudo register should be created and used to hold
18764 the GOT address for PIC code. */
18765
18766 bool
18767 aarch64_use_pseudo_pic_reg (void)
18768 {
18769 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18770 }
18771
18772 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18773
18774 static int
18775 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18776 {
18777 switch (XINT (x, 1))
18778 {
18779 case UNSPEC_GOTSMALLPIC:
18780 case UNSPEC_GOTSMALLPIC28K:
18781 case UNSPEC_GOTTINYPIC:
18782 return 0;
18783 default:
18784 break;
18785 }
18786
18787 return default_unspec_may_trap_p (x, flags);
18788 }
18789
18790
18791 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18792 return the log2 of that value. Otherwise return -1. */
18793
18794 int
18795 aarch64_fpconst_pow_of_2 (rtx x)
18796 {
18797 const REAL_VALUE_TYPE *r;
18798
18799 if (!CONST_DOUBLE_P (x))
18800 return -1;
18801
18802 r = CONST_DOUBLE_REAL_VALUE (x);
18803
18804 if (REAL_VALUE_NEGATIVE (*r)
18805 || REAL_VALUE_ISNAN (*r)
18806 || REAL_VALUE_ISINF (*r)
18807 || !real_isinteger (r, DFmode))
18808 return -1;
18809
18810 return exact_log2 (real_to_integer (r));
18811 }
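/* For illustration: 4.0 yields 2 and 1.0 yields 0, whereas 0.5 (not an
   integer), 3.0 (not a power of two) and -8.0 (negative) all yield -1.  */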
18812
18813 /* If X is a vector of equal CONST_DOUBLE values and that value is
18814 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18815
18816 int
18817 aarch64_vec_fpconst_pow_of_2 (rtx x)
18818 {
18819 int nelts;
18820 if (GET_CODE (x) != CONST_VECTOR
18821 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18822 return -1;
18823
18824 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18825 return -1;
18826
18827 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18828 if (firstval <= 0)
18829 return -1;
18830
18831 for (int i = 1; i < nelts; i++)
18832 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18833 return -1;
18834
18835 return firstval;
18836 }
18837
18838 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18839 to float.
18840
18841 __fp16 always promotes through this hook.
18842 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18843 through the generic excess precision logic rather than here. */
18844
18845 static tree
18846 aarch64_promoted_type (const_tree t)
18847 {
18848 if (SCALAR_FLOAT_TYPE_P (t)
18849 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18850 return float_type_node;
18851
18852 return NULL_TREE;
18853 }
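/* A minimal usage sketch (hypothetical user code, not part of GCC):

     __fp16 a, b;
     float  sum (void) { return a + b; }

   Because of the promotion above, the addition is performed in float and
   only the operands are converted from half precision.  */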
18854
18855 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18856
18857 static bool
18858 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18859 optimization_type opt_type)
18860 {
18861 switch (op)
18862 {
18863 case rsqrt_optab:
18864 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18865
18866 default:
18867 return true;
18868 }
18869 }
18870
18871 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18872
18873 static unsigned int
18874 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18875 int *offset)
18876 {
18877 /* Polynomial invariant 1 == (VG / 2) - 1. */
18878 gcc_assert (i == 1);
18879 *factor = 2;
18880 *offset = 1;
18881 return AARCH64_DWARF_VG;
18882 }
18883
18884 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
18885 if MODE is HFmode, and punt to the generic implementation otherwise. */
18886
18887 static bool
18888 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18889 {
18890 return (mode == HFmode
18891 ? true
18892 : default_libgcc_floating_mode_supported_p (mode));
18893 }
18894
18895 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18896 if MODE is HFmode, and punt to the generic implementation otherwise. */
18897
18898 static bool
18899 aarch64_scalar_mode_supported_p (scalar_mode mode)
18900 {
18901 return (mode == HFmode
18902 ? true
18903 : default_scalar_mode_supported_p (mode));
18904 }
18905
18906 /* Set the value of FLT_EVAL_METHOD.
18907 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18908
18909 0: evaluate all operations and constants, whose semantic type has at
18910 most the range and precision of type float, to the range and
18911 precision of float; evaluate all other operations and constants to
18912 the range and precision of the semantic type;
18913
18914 N, where _FloatN is a supported interchange floating type:
18915 evaluate all operations and constants, whose semantic type has at
18916 most the range and precision of _FloatN type, to the range and
18917 precision of the _FloatN type; evaluate all other operations and
18918 constants to the range and precision of the semantic type;
18919
18920 If we have the ARMv8.2-A extensions then we support _Float16 in native
18921 precision, so we should set this to 16. Otherwise, we support the type,
18922 but want to evaluate expressions in float precision, so set this to
18923 0. */
18924
18925 static enum flt_eval_method
18926 aarch64_excess_precision (enum excess_precision_type type)
18927 {
18928 switch (type)
18929 {
18930 case EXCESS_PRECISION_TYPE_FAST:
18931 case EXCESS_PRECISION_TYPE_STANDARD:
18932 /* We can calculate either in 16-bit range and precision or
18933 32-bit range and precision. Make that decision based on whether
18934 we have native support for the ARMv8.2-A 16-bit floating-point
18935 instructions or not. */
18936 return (TARGET_FP_F16INST
18937 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18938 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18939 case EXCESS_PRECISION_TYPE_IMPLICIT:
18940 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18941 default:
18942 gcc_unreachable ();
18943 }
18944 return FLT_EVAL_METHOD_UNPREDICTABLE;
18945 }
18946
18947 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18948 scheduled for speculative execution. Reject the long-running division
18949 and square-root instructions. */
18950
18951 static bool
18952 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18953 {
18954 switch (get_attr_type (insn))
18955 {
18956 case TYPE_SDIV:
18957 case TYPE_UDIV:
18958 case TYPE_FDIVS:
18959 case TYPE_FDIVD:
18960 case TYPE_FSQRTS:
18961 case TYPE_FSQRTD:
18962 case TYPE_NEON_FP_SQRT_S:
18963 case TYPE_NEON_FP_SQRT_D:
18964 case TYPE_NEON_FP_SQRT_S_Q:
18965 case TYPE_NEON_FP_SQRT_D_Q:
18966 case TYPE_NEON_FP_DIV_S:
18967 case TYPE_NEON_FP_DIV_D:
18968 case TYPE_NEON_FP_DIV_S_Q:
18969 case TYPE_NEON_FP_DIV_D_Q:
18970 return false;
18971 default:
18972 return true;
18973 }
18974 }
18975
18976 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18977
18978 static int
18979 aarch64_compute_pressure_classes (reg_class *classes)
18980 {
18981 int i = 0;
18982 classes[i++] = GENERAL_REGS;
18983 classes[i++] = FP_REGS;
18984 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18985 registers need to go in PR_LO_REGS at some point during their
18986 lifetime. Splitting it into two halves has the effect of making
18987 all predicates count against PR_LO_REGS, so that we try whenever
18988 possible to restrict the number of live predicates to 8. This
18989 greatly reduces the amount of spilling in certain loops. */
18990 classes[i++] = PR_LO_REGS;
18991 classes[i++] = PR_HI_REGS;
18992 return i;
18993 }
18994
18995 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18996
18997 static bool
18998 aarch64_can_change_mode_class (machine_mode from,
18999 machine_mode to, reg_class_t)
19000 {
19001 if (BYTES_BIG_ENDIAN)
19002 {
19003 bool from_sve_p = aarch64_sve_data_mode_p (from);
19004 bool to_sve_p = aarch64_sve_data_mode_p (to);
19005
19006 /* Don't allow changes between SVE data modes and non-SVE modes.
19007 See the comment at the head of aarch64-sve.md for details. */
19008 if (from_sve_p != to_sve_p)
19009 return false;
19010
19011 /* Don't allow changes in element size: lane 0 of the new vector
19012 would not then be lane 0 of the old vector. See the comment
19013 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19014 description.
19015
19016 In the worst case, this forces a register to be spilled in
19017 one mode and reloaded in the other, which handles the
19018 endianness correctly. */
19019 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19020 return false;
19021 }
19022 return true;
19023 }
19024
19025 /* Implement TARGET_EARLY_REMAT_MODES. */
19026
19027 static void
19028 aarch64_select_early_remat_modes (sbitmap modes)
19029 {
19030 /* SVE values are not normally live across a call, so it should be
19031 worth doing early rematerialization even in VL-specific mode. */
19032 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19033 {
19034 machine_mode mode = (machine_mode) i;
19035 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19036 if (vec_flags & VEC_ANY_SVE)
19037 bitmap_set_bit (modes, i);
19038 }
19039 }
19040
19041 /* Override the default target speculation_safe_value. */
19042 static rtx
19043 aarch64_speculation_safe_value (machine_mode mode,
19044 rtx result, rtx val, rtx failval)
19045 {
19046 /* Maybe we should warn if falling back to hard barriers. They are
19047 likely to be noticeably more expensive than the alternative below. */
19048 if (!aarch64_track_speculation)
19049 return default_speculation_safe_value (mode, result, val, failval);
19050
19051 if (!REG_P (val))
19052 val = copy_to_mode_reg (mode, val);
19053
19054 if (!aarch64_reg_or_zero (failval, mode))
19055 failval = copy_to_mode_reg (mode, failval);
19056
19057 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19058 return result;
19059 }
19060
19061 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19062 Look into the tuning structure for an estimate.
19063 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19064 Advanced SIMD 128 bits. */
19065
19066 static HOST_WIDE_INT
19067 aarch64_estimated_poly_value (poly_int64 val)
19068 {
19069 enum aarch64_sve_vector_bits_enum width_source
19070 = aarch64_tune_params.sve_width;
19071
19072 /* If we still don't have an estimate, use the default. */
19073 if (width_source == SVE_SCALABLE)
19074 return default_estimated_poly_value (val);
19075
19076 HOST_WIDE_INT over_128 = width_source - 128;
19077 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19078 }
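/* Worked example: if the tuning structure reports sve_width == 256 then
   over_128 is 128, so a poly_int64 of (16 + 16x) -- the number of bytes in
   one SVE vector -- is estimated as 16 + 16 * 128 / 128 = 32.  */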
19079
19080
19081 /* Return true for types that could be supported as SIMD return or
19082 argument types. */
19083
19084 static bool
19085 supported_simd_type (tree t)
19086 {
19087 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19088 {
19089 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19090 return s == 1 || s == 2 || s == 4 || s == 8;
19091 }
19092 return false;
19093 }
19094
19095 /* Return true for types that currently are supported as SIMD return
19096 or argument types. */
19097
19098 static bool
19099 currently_supported_simd_type (tree t, tree b)
19100 {
19101 if (COMPLEX_FLOAT_TYPE_P (t))
19102 return false;
19103
19104 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19105 return false;
19106
19107 return supported_simd_type (t);
19108 }
19109
19110 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19111
19112 static int
19113 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19114 struct cgraph_simd_clone *clonei,
19115 tree base_type, int num)
19116 {
19117 tree t, ret_type, arg_type;
19118 unsigned int elt_bits, vec_bits, count;
19119
19120 if (!TARGET_SIMD)
19121 return 0;
19122
19123 if (clonei->simdlen
19124 && (clonei->simdlen < 2
19125 || clonei->simdlen > 1024
19126 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19127 {
19128 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19129 "unsupported simdlen %d", clonei->simdlen);
19130 return 0;
19131 }
19132
19133 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19134 if (TREE_CODE (ret_type) != VOID_TYPE
19135 && !currently_supported_simd_type (ret_type, base_type))
19136 {
19137 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19138 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19139 "GCC does not currently support mixed size types "
19140 "for %<simd%> functions");
19141 else if (supported_simd_type (ret_type))
19142 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19143 "GCC does not currently support return type %qT "
19144 "for %<simd%> functions", ret_type);
19145 else
19146 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19147 "unsupported return type %qT for %<simd%> functions",
19148 ret_type);
19149 return 0;
19150 }
19151
19152 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19153 {
19154 arg_type = TREE_TYPE (t);
19155
19156 if (!currently_supported_simd_type (arg_type, base_type))
19157 {
19158 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19159 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19160 "GCC does not currently support mixed size types "
19161 "for %<simd%> functions");
19162 else
19163 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19164 "GCC does not currently support argument type %qT "
19165 "for %<simd%> functions", arg_type);
19166 return 0;
19167 }
19168 }
19169
19170 clonei->vecsize_mangle = 'n';
19171 clonei->mask_mode = VOIDmode;
19172 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19173 if (clonei->simdlen == 0)
19174 {
19175 count = 2;
19176 vec_bits = (num == 0 ? 64 : 128);
19177 clonei->simdlen = vec_bits / elt_bits;
19178 }
19179 else
19180 {
19181 count = 1;
19182 vec_bits = clonei->simdlen * elt_bits;
19183 if (vec_bits != 64 && vec_bits != 128)
19184 {
19185 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19186 "GCC does not currently support simdlen %d for type %qT",
19187 clonei->simdlen, base_type);
19188 return 0;
19189 }
19190 }
19191 clonei->vecsize_int = vec_bits;
19192 clonei->vecsize_float = vec_bits;
19193 return count;
19194 }
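/* Illustrative (hypothetical) use: for

     #pragma omp declare simd
     float my_fma (float x, float y, float z) { return x * y + z; }

   no simdlen clause is given and the base type is 32 bits wide, so the hook
   above returns a count of 2, producing one clone with simdlen 2 (64-bit
   vectors) and one with simdlen 4 (128-bit vectors).  */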
19195
19196 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19197
19198 static void
19199 aarch64_simd_clone_adjust (struct cgraph_node *node)
19200 {
19201 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19202 use the correct ABI. */
19203
19204 tree t = TREE_TYPE (node->decl);
19205 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19206 TYPE_ATTRIBUTES (t));
19207 }
19208
19209 /* Implement TARGET_SIMD_CLONE_USABLE. */
19210
19211 static int
19212 aarch64_simd_clone_usable (struct cgraph_node *node)
19213 {
19214 switch (node->simdclone->vecsize_mangle)
19215 {
19216 case 'n':
19217 if (!TARGET_SIMD)
19218 return -1;
19219 return 0;
19220 default:
19221 gcc_unreachable ();
19222 }
19223 }
19224
19225 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
19226
19227 static int
19228 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19229 {
19230 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19231 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19232 return 0;
19233 return 1;
19234 }
19235
19236 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
19237
19238 static const char *
19239 aarch64_get_multilib_abi_name (void)
19240 {
19241 if (TARGET_BIG_END)
19242 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19243 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19244 }
19245
19246 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
19247 global variable based guard use the default else
19248 return a null tree. */
19249 static tree
19250 aarch64_stack_protect_guard (void)
19251 {
19252 if (aarch64_stack_protector_guard == SSP_GLOBAL)
19253 return default_stack_protect_guard ();
19254
19255 return NULL_TREE;
19256 }
19257
19258 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19259 section at the end if needed. */
19260 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19261 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19262 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19263 void
19264 aarch64_file_end_indicate_exec_stack ()
19265 {
19266 file_end_indicate_exec_stack ();
19267
19268 unsigned feature_1_and = 0;
19269 if (aarch64_bti_enabled ())
19270 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
19271
19272 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
19273 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
19274
19275 if (feature_1_and)
19276 {
19277 /* Generate .note.gnu.property section. */
19278 switch_to_section (get_section (".note.gnu.property",
19279 SECTION_NOTYPE, NULL));
19280
19281 /* PT_NOTE header: namesz, descsz, type.
19282 namesz = 4 ("GNU\0")
19283 descsz = 16 (Size of the program property array)
19284 [(12 + padding) * Number of array elements]
19285 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
19286 assemble_align (POINTER_SIZE);
19287 assemble_integer (GEN_INT (4), 4, 32, 1);
19288 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
19289 assemble_integer (GEN_INT (5), 4, 32, 1);
19290
19291 /* PT_NOTE name. */
19292 assemble_string ("GNU", 4);
19293
19294 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19295 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19296 datasz = 4
19297 data = feature_1_and. */
19298 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
19299 assemble_integer (GEN_INT (4), 4, 32, 1);
19300 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
19301
19302 /* Pad the size of the note to the required alignment. */
19303 assemble_align (POINTER_SIZE);
19304 }
19305 }
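/* When both BTI and PAC are enabled, the note emitted above corresponds
   roughly to the following assembly (a sketch; the exact directives depend
   on the assembler output machinery):

     .section .note.gnu.property,"a"
     .p2align 3
     .word 4            // namesz ("GNU\0")
     .word 16           // descsz, ROUND_UP (12, POINTER_BYTES)
     .word 5            // NT_GNU_PROPERTY_TYPE_0
     .asciz "GNU"
     .word 0xc0000000   // GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word 4            // datasz
     .word 3            // BTI | PAC
     .p2align 3  */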
19306 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19307 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19308 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
19309
19310 /* Target-specific selftests. */
19311
19312 #if CHECKING_P
19313
19314 namespace selftest {
19315
19316 /* Selftest for the RTL loader.
19317 Verify that the RTL loader copes with a dump from
19318 print_rtx_function. This is essentially just a test that class
19319 function_reader can handle a real dump, but it also verifies
19320 that lookup_reg_by_dump_name correctly handles hard regs.
19321 The presence of hard reg names in the dump means that the test is
19322 target-specific, hence it is in this file. */
19323
19324 static void
19325 aarch64_test_loading_full_dump ()
19326 {
19327 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
19328
19329 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
19330
19331 rtx_insn *insn_1 = get_insn_by_uid (1);
19332 ASSERT_EQ (NOTE, GET_CODE (insn_1));
19333
19334 rtx_insn *insn_15 = get_insn_by_uid (15);
19335 ASSERT_EQ (INSN, GET_CODE (insn_15));
19336 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
19337
19338 /* Verify crtl->return_rtx. */
19339 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
19340 ASSERT_EQ (0, REGNO (crtl->return_rtx));
19341 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
19342 }
19343
19344 /* Run all target-specific selftests. */
19345
19346 static void
19347 aarch64_run_selftests (void)
19348 {
19349 aarch64_test_loading_full_dump ();
19350 }
19351
19352 } // namespace selftest
19353
19354 #endif /* #if CHECKING_P */
19355
19356 #undef TARGET_STACK_PROTECT_GUARD
19357 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19358
19359 #undef TARGET_ADDRESS_COST
19360 #define TARGET_ADDRESS_COST aarch64_address_cost
19361
19362 /* This hook determines whether unnamed bitfields affect the alignment
19363 of the containing structure. The hook returns true if the structure
19364 should inherit the alignment requirements of an unnamed bitfield's
19365 type. */
19366 #undef TARGET_ALIGN_ANON_BITFIELD
19367 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19368
19369 #undef TARGET_ASM_ALIGNED_DI_OP
19370 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19371
19372 #undef TARGET_ASM_ALIGNED_HI_OP
19373 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19374
19375 #undef TARGET_ASM_ALIGNED_SI_OP
19376 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19377
19378 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19379 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19380 hook_bool_const_tree_hwi_hwi_const_tree_true
19381
19382 #undef TARGET_ASM_FILE_START
19383 #define TARGET_ASM_FILE_START aarch64_start_file
19384
19385 #undef TARGET_ASM_OUTPUT_MI_THUNK
19386 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19387
19388 #undef TARGET_ASM_SELECT_RTX_SECTION
19389 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19390
19391 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19392 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19393
19394 #undef TARGET_BUILD_BUILTIN_VA_LIST
19395 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19396
19397 #undef TARGET_CALLEE_COPIES
19398 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19399
19400 #undef TARGET_CAN_ELIMINATE
19401 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19402
19403 #undef TARGET_CAN_INLINE_P
19404 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19405
19406 #undef TARGET_CANNOT_FORCE_CONST_MEM
19407 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19408
19409 #undef TARGET_CASE_VALUES_THRESHOLD
19410 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19411
19412 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19413 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19414
19415 /* Only the least significant bit is used for initialization guard
19416 variables. */
19417 #undef TARGET_CXX_GUARD_MASK_BIT
19418 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19419
19420 #undef TARGET_C_MODE_FOR_SUFFIX
19421 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19422
19423 #ifdef TARGET_BIG_ENDIAN_DEFAULT
19424 #undef TARGET_DEFAULT_TARGET_FLAGS
19425 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
19426 #endif
19427
19428 #undef TARGET_CLASS_MAX_NREGS
19429 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
19430
19431 #undef TARGET_BUILTIN_DECL
19432 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
19433
19434 #undef TARGET_BUILTIN_RECIPROCAL
19435 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
19436
19437 #undef TARGET_C_EXCESS_PRECISION
19438 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
19439
19440 #undef TARGET_EXPAND_BUILTIN
19441 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
19442
19443 #undef TARGET_EXPAND_BUILTIN_VA_START
19444 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
19445
19446 #undef TARGET_FOLD_BUILTIN
19447 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
19448
19449 #undef TARGET_FUNCTION_ARG
19450 #define TARGET_FUNCTION_ARG aarch64_function_arg
19451
19452 #undef TARGET_FUNCTION_ARG_ADVANCE
19453 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
19454
19455 #undef TARGET_FUNCTION_ARG_BOUNDARY
19456 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
19457
19458 #undef TARGET_FUNCTION_ARG_PADDING
19459 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
19460
19461 #undef TARGET_GET_RAW_RESULT_MODE
19462 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
19463 #undef TARGET_GET_RAW_ARG_MODE
19464 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
19465
19466 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
19467 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
19468
19469 #undef TARGET_FUNCTION_VALUE
19470 #define TARGET_FUNCTION_VALUE aarch64_function_value
19471
19472 #undef TARGET_FUNCTION_VALUE_REGNO_P
19473 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
19474
19475 #undef TARGET_GIMPLE_FOLD_BUILTIN
19476 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
19477
19478 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
19479 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
19480
19481 #undef TARGET_INIT_BUILTINS
19482 #define TARGET_INIT_BUILTINS aarch64_init_builtins
19483
19484 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
19485 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
19486 aarch64_ira_change_pseudo_allocno_class
19487
19488 #undef TARGET_LEGITIMATE_ADDRESS_P
19489 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
19490
19491 #undef TARGET_LEGITIMATE_CONSTANT_P
19492 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
19493
19494 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
19495 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
19496 aarch64_legitimize_address_displacement
19497
19498 #undef TARGET_LIBGCC_CMP_RETURN_MODE
19499 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
19500
19501 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
19502 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
19503 aarch64_libgcc_floating_mode_supported_p
19504
19505 #undef TARGET_MANGLE_TYPE
19506 #define TARGET_MANGLE_TYPE aarch64_mangle_type
19507
19508 #undef TARGET_MEMORY_MOVE_COST
19509 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
19510
19511 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
19512 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
19513
19514 #undef TARGET_MUST_PASS_IN_STACK
19515 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
19516
19517 /* This target hook should return true if accesses to volatile bitfields
19518 should use the narrowest mode possible. It should return false if these
19519 accesses should use the bitfield container type. */
19520 #undef TARGET_NARROW_VOLATILE_BITFIELD
19521 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
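/* For example, given
     struct s { volatile unsigned int f : 8; } x;
   returning false means an access to x.f is performed with the width of
   the declared container type (a 32-bit "unsigned int" access) rather
   than as a single-byte access.  */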
19522
19523 #undef TARGET_OPTION_OVERRIDE
19524 #define TARGET_OPTION_OVERRIDE aarch64_override_options
19525
19526 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
19527 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
19528 aarch64_override_options_after_change
19529
19530 #undef TARGET_OPTION_SAVE
19531 #define TARGET_OPTION_SAVE aarch64_option_save
19532
19533 #undef TARGET_OPTION_RESTORE
19534 #define TARGET_OPTION_RESTORE aarch64_option_restore
19535
19536 #undef TARGET_OPTION_PRINT
19537 #define TARGET_OPTION_PRINT aarch64_option_print
19538
19539 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
19540 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
19541
19542 #undef TARGET_SET_CURRENT_FUNCTION
19543 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
19544
19545 #undef TARGET_PASS_BY_REFERENCE
19546 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
19547
19548 #undef TARGET_PREFERRED_RELOAD_CLASS
19549 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
19550
19551 #undef TARGET_SCHED_REASSOCIATION_WIDTH
19552 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
19553
19554 #undef TARGET_PROMOTED_TYPE
19555 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
19556
19557 #undef TARGET_SECONDARY_RELOAD
19558 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
19559
19560 #undef TARGET_SHIFT_TRUNCATION_MASK
19561 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
19562
19563 #undef TARGET_SETUP_INCOMING_VARARGS
19564 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
19565
19566 #undef TARGET_STRUCT_VALUE_RTX
19567 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
19568
19569 #undef TARGET_REGISTER_MOVE_COST
19570 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
19571
19572 #undef TARGET_RETURN_IN_MEMORY
19573 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
19574
19575 #undef TARGET_RETURN_IN_MSB
19576 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
19577
19578 #undef TARGET_RTX_COSTS
19579 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
19580
19581 #undef TARGET_SCALAR_MODE_SUPPORTED_P
19582 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
19583
19584 #undef TARGET_SCHED_ISSUE_RATE
19585 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
19586
19587 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
19588 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
19589 aarch64_sched_first_cycle_multipass_dfa_lookahead
19590
19591 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
19592 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
19593 aarch64_first_cycle_multipass_dfa_lookahead_guard
19594
19595 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
19596 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
19597 aarch64_get_separate_components
19598
19599 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
19600 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
19601 aarch64_components_for_bb
19602
19603 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
19604 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
19605 aarch64_disqualify_components
19606
19607 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
19608 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
19609 aarch64_emit_prologue_components
19610
19611 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
19612 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
19613 aarch64_emit_epilogue_components
19614
19615 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
19616 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
19617 aarch64_set_handled_components
19618
19619 #undef TARGET_TRAMPOLINE_INIT
19620 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
19621
19622 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
19623 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
19624
19625 #undef TARGET_VECTOR_MODE_SUPPORTED_P
19626 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
19627
19628 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
19629 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
19630 aarch64_builtin_support_vector_misalignment
19631
19632 #undef TARGET_ARRAY_MODE
19633 #define TARGET_ARRAY_MODE aarch64_array_mode
19634
19635 #undef TARGET_ARRAY_MODE_SUPPORTED_P
19636 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
19637
19638 #undef TARGET_VECTORIZE_ADD_STMT_COST
19639 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
19640
19641 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
19642 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
19643 aarch64_builtin_vectorization_cost
19644
19645 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
19646 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
19647
19648 #undef TARGET_VECTORIZE_BUILTINS
19649 #define TARGET_VECTORIZE_BUILTINS
19650
19651 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
19652 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
19653 aarch64_builtin_vectorized_function
19654
19655 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
19656 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
19657 aarch64_autovectorize_vector_sizes
19658
19659 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
19660 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
19661 aarch64_atomic_assign_expand_fenv
19662
19663 /* Section anchor support. */
19664
19665 #undef TARGET_MIN_ANCHOR_OFFSET
19666 #define TARGET_MIN_ANCHOR_OFFSET -256
19667
19668 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
19669 byte offset; we can do much more for larger data types, but have no way
19670 to determine the size of the access. We assume accesses are aligned. */
19671 #undef TARGET_MAX_ANCHOR_OFFSET
19672 #define TARGET_MAX_ANCHOR_OFFSET 4095
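/* Together the two offsets give the anchor range [-256, 4095]: -256 is
   the lower bound of the signed 9-bit unscaled (LDUR/STUR) offset, and
   4095 is the largest unsigned 12-bit scaled offset for a byte access.  */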
19673
19674 #undef TARGET_VECTOR_ALIGNMENT
19675 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
19676
19677 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
19678 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
19679 aarch64_vectorize_preferred_vector_alignment
19680 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
19681 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
19682 aarch64_simd_vector_alignment_reachable
19683
19684 /* vec_perm support. */
19685
19686 #undef TARGET_VECTORIZE_VEC_PERM_CONST
19687 #define TARGET_VECTORIZE_VEC_PERM_CONST \
19688 aarch64_vectorize_vec_perm_const
19689
19690 #undef TARGET_VECTORIZE_GET_MASK_MODE
19691 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
19692 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
19693 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
19694 aarch64_empty_mask_is_expensive
19695 #undef TARGET_PREFERRED_ELSE_VALUE
19696 #define TARGET_PREFERRED_ELSE_VALUE \
19697 aarch64_preferred_else_value
19698
19699 #undef TARGET_INIT_LIBFUNCS
19700 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
19701
19702 #undef TARGET_FIXED_CONDITION_CODE_REGS
19703 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
19704
19705 #undef TARGET_FLAGS_REGNUM
19706 #define TARGET_FLAGS_REGNUM CC_REGNUM
19707
19708 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
19709 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
19710
19711 #undef TARGET_ASAN_SHADOW_OFFSET
19712 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
19713
19714 #undef TARGET_LEGITIMIZE_ADDRESS
19715 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
19716
19717 #undef TARGET_SCHED_CAN_SPECULATE_INSN
19718 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
19719
19720 #undef TARGET_CAN_USE_DOLOOP_P
19721 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
19722
19723 #undef TARGET_SCHED_ADJUST_PRIORITY
19724 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
19725
19726 #undef TARGET_SCHED_MACRO_FUSION_P
19727 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
19728
19729 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
19730 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
19731
19732 #undef TARGET_SCHED_FUSION_PRIORITY
19733 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
19734
19735 #undef TARGET_UNSPEC_MAY_TRAP_P
19736 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
19737
19738 #undef TARGET_USE_PSEUDO_PIC_REG
19739 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
19740
19741 #undef TARGET_PRINT_OPERAND
19742 #define TARGET_PRINT_OPERAND aarch64_print_operand
19743
19744 #undef TARGET_PRINT_OPERAND_ADDRESS
19745 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
19746
19747 #undef TARGET_OPTAB_SUPPORTED_P
19748 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
19749
19750 #undef TARGET_OMIT_STRUCT_RETURN_REG
19751 #define TARGET_OMIT_STRUCT_RETURN_REG true
19752
19753 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
19754 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
19755 aarch64_dwarf_poly_indeterminate_value
19756
19757 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
19758 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
19759 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
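/* The value 4 is (1 << 2): bit 2 is the tag used at run time to tell a
   pointer to a function descriptor apart from a pointer to ordinary
   code.  */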
19760
19761 #undef TARGET_HARD_REGNO_NREGS
19762 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
19763 #undef TARGET_HARD_REGNO_MODE_OK
19764 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
19765
19766 #undef TARGET_MODES_TIEABLE_P
19767 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
19768
19769 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
19770 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
19771 aarch64_hard_regno_call_part_clobbered
19772
19773 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
19774 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
19775 aarch64_remove_extra_call_preserved_regs
19776
19777 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
19778 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
19779 aarch64_return_call_with_max_clobbers
19780
19781 #undef TARGET_CONSTANT_ALIGNMENT
19782 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
19783
19784 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
19785 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
19786 aarch64_stack_clash_protection_alloca_probe_range
19787
19788 #undef TARGET_COMPUTE_PRESSURE_CLASSES
19789 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
19790
19791 #undef TARGET_CAN_CHANGE_MODE_CLASS
19792 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
19793
19794 #undef TARGET_SELECT_EARLY_REMAT_MODES
19795 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
19796
19797 #undef TARGET_SPECULATION_SAFE_VALUE
19798 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
19799
19800 #undef TARGET_ESTIMATED_POLY_VALUE
19801 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
19802
19803 #undef TARGET_ATTRIBUTE_TABLE
19804 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
19805
19806 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
19807 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
19808 aarch64_simd_clone_compute_vecsize_and_simdlen
19809
19810 #undef TARGET_SIMD_CLONE_ADJUST
19811 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
19812
19813 #undef TARGET_SIMD_CLONE_USABLE
19814 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
19815
19816 #undef TARGET_COMP_TYPE_ATTRIBUTES
19817 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
19818
19819 #undef TARGET_GET_MULTILIB_ABI_NAME
19820 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
19821
19822 #if CHECKING_P
19823 #undef TARGET_RUN_TARGET_SELFTESTS
19824 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
19825 #endif /* #if CHECKING_P */
19826
19827 #undef TARGET_ASM_POST_CFI_STARTPROC
19828 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
19829
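/* Collect the hook overrides above into the target vector; any hook not
   overridden here receives its default from TARGET_INITIALIZER, which
   target-def.h provides.  */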
19830 struct gcc_target targetm = TARGET_INITIALIZER;
19831
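/* Garbage-collection root tables for this file, generated by gengtype.  */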
19832 #include "gt-aarch64.h"