gcc/config/aarch64/aarch64.c (blob at commit 23f72160fbbfd34167f577409dad98214f357d57)
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
86 {
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
89
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
100
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
103
104 union
105 {
106 /* For MOV and MVN. */
107 struct
108 {
109 /* The value of each element. */
110 rtx value;
111
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
117
118 /* For INDEX. */
119 struct
120 {
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
125
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
129 };
130
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
136 {
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
140 }
141
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
151 {
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
155 }
156
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
162 {
163 u.index.base = base_in;
164 u.index.step = step_in;
165 }
166
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174 u.pattern = pattern_in;
175 }
176
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel;
179
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg;
182
183 #ifdef HAVE_AS_TLS
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
186 #endif
187
188 static bool aarch64_composite_type_p (const_tree, machine_mode);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
190 const_tree,
191 machine_mode *, int *,
192 bool *);
193 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode);
197 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
199 const_tree type,
200 int misalignment,
201 bool is_packed);
202 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
203 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
204 aarch64_addr_query_type);
205 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
206
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version;
209
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune = cortexa53;
212
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags = 0;
215
216 /* Global flag for PC relative loads. */
217 bool aarch64_pcrelative_literal_loads;
218
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer;
221
222 #define BRANCH_PROTECT_STR_MAX 255
223 char *accepted_branch_protection_string = NULL;
224
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
227
228 /* Support for command line parsing of boolean flags in the tuning
229 structures. */
230 struct aarch64_flag_desc
231 {
232 const char* name;
233 unsigned int flag;
234 };
235
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
239 {
240 { "none", AARCH64_FUSE_NOTHING },
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL },
243 { NULL, AARCH64_FUSE_NOTHING }
244 };
245
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
249 {
250 { "none", AARCH64_EXTRA_TUNE_NONE },
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL },
253 { NULL, AARCH64_EXTRA_TUNE_NONE }
254 };
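/* Illustrative sketch (not part of the original source): both tables end with
   a null sentinel, so a parser can walk them with a simple loop.  The helper
   below is hypothetical; the real parsing is done by aarch64_parse_fuse_string
   and aarch64_parse_tune_string later in this file.

     static unsigned int
     example_lookup_flag (const struct aarch64_flag_desc *table,
                          const char *name)
     {
       for (; table->name != NULL; table++)
         if (strcmp (name, table->name) == 0)
           return table->flag;
       return 0;
     }

     // example_lookup_flag (aarch64_tuning_flags, "none")
     //   == AARCH64_EXTRA_TUNE_NONE  */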
255
256 /* Tuning parameters. */
257
258 static const struct cpu_addrcost_table generic_addrcost_table =
259 {
260 {
261 1, /* hi */
262 0, /* si */
263 0, /* di */
264 1, /* ti */
265 },
266 0, /* pre_modify */
267 0, /* post_modify */
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
271 0 /* imm_offset */
272 };
273
274 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 {
276 {
277 0, /* hi */
278 0, /* si */
279 0, /* di */
280 2, /* ti */
281 },
282 0, /* pre_modify */
283 0, /* post_modify */
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
287 0, /* imm_offset */
288 };
289
290 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 {
292 {
293 1, /* hi */
294 0, /* si */
295 0, /* di */
296 1, /* ti */
297 },
298 1, /* pre_modify */
299 1, /* post_modify */
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
303 0, /* imm_offset */
304 };
305
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 {
308 {
309 1, /* hi */
310 1, /* si */
311 1, /* di */
312 2, /* ti */
313 },
314 0, /* pre_modify */
315 0, /* post_modify */
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
319 0, /* imm_offset */
320 };
321
322 static const struct cpu_addrcost_table tsv110_addrcost_table =
323 {
324 {
325 1, /* hi */
326 0, /* si */
327 0, /* di */
328 1, /* ti */
329 },
330 0, /* pre_modify */
331 0, /* post_modify */
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
335 0, /* imm_offset */
336 };
337
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
339 {
340 {
341 1, /* hi */
342 1, /* si */
343 1, /* di */
344 2, /* ti */
345 },
346 1, /* pre_modify */
347 1, /* post_modify */
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
351 2, /* imm_offset */
352 };
353
354 static const struct cpu_regmove_cost generic_regmove_cost =
355 {
356 1, /* GP2GP */
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
359 5, /* GP2FP */
360 5, /* FP2GP */
361 2 /* FP2FP */
362 };
363
364 static const struct cpu_regmove_cost cortexa57_regmove_cost =
365 {
366 1, /* GP2GP */
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
369 5, /* GP2FP */
370 5, /* FP2GP */
371 2 /* FP2FP */
372 };
373
374 static const struct cpu_regmove_cost cortexa53_regmove_cost =
375 {
376 1, /* GP2GP */
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
379 5, /* GP2FP */
380 5, /* FP2GP */
381 2 /* FP2FP */
382 };
383
384 static const struct cpu_regmove_cost exynosm1_regmove_cost =
385 {
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual costs: 4 and 9). */
389 9, /* GP2FP */
390 9, /* FP2GP */
391 1 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost thunderx_regmove_cost =
395 {
396 2, /* GP2GP */
397 2, /* GP2FP */
398 6, /* FP2GP */
399 4 /* FP2FP */
400 };
401
402 static const struct cpu_regmove_cost xgene1_regmove_cost =
403 {
404 1, /* GP2GP */
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 2 /* FP2FP */
410 };
411
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
413 {
414 2, /* GP2GP */
415 /* Avoid the use of int<->fp moves for spilling. */
416 6, /* GP2FP */
417 6, /* FP2GP */
418 4 /* FP2FP */
419 };
420
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
422 {
423 1, /* GP2GP */
424 /* Avoid the use of int<->fp moves for spilling. */
425 8, /* GP2FP */
426 8, /* FP2GP */
427 4 /* FP2FP */
428 };
429
430 static const struct cpu_regmove_cost tsv110_regmove_cost =
431 {
432 1, /* GP2GP */
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
435 2, /* GP2FP */
436 3, /* FP2GP */
437 2 /* FP2FP */
438 };
439
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost =
442 {
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 1, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
458 };
459
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost =
462 {
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
478 };
479
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost =
482 {
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
498 };
499
500 static const struct cpu_vector_cost tsv110_vector_cost =
501 {
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
517 };
518
519 /* Generic costs for vector insn classes. */
520 static const struct cpu_vector_cost cortexa57_vector_cost =
521 {
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
537 };
538
539 static const struct cpu_vector_cost exynosm1_vector_cost =
540 {
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
556 };
557
558 /* Generic costs for vector insn classes. */
559 static const struct cpu_vector_cost xgene1_vector_cost =
560 {
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
576 };
577
578 /* Costs for vector insn classes for Vulcan. */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost =
580 {
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 3, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
596 };
597
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost =
600 {
601 1, /* Predictable. */
602 3 /* Unpredictable. */
603 };
604
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes =
607 {
608 AARCH64_APPROX_NONE, /* division */
609 AARCH64_APPROX_NONE, /* sqrt */
610 AARCH64_APPROX_NONE /* recip_sqrt */
611 };
612
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes =
615 {
616 AARCH64_APPROX_NONE, /* division */
617 AARCH64_APPROX_ALL, /* sqrt */
618 AARCH64_APPROX_ALL /* recip_sqrt */
619 };
620
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes =
623 {
624 AARCH64_APPROX_NONE, /* division */
625 AARCH64_APPROX_NONE, /* sqrt */
626 AARCH64_APPROX_ALL /* recip_sqrt */
627 };
628
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune =
631 {
632 0, /* num_slots */
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
639 };
640
641 static const cpu_prefetch_tune exynosm1_prefetch_tune =
642 {
643 0, /* num_slots */
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
650 };
651
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
653 {
654 4, /* num_slots */
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
661 };
662
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
664 {
665 8, /* num_slots */
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
672 };
673
674 static const cpu_prefetch_tune thunderx_prefetch_tune =
675 {
676 8, /* num_slots */
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
683 };
684
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
686 {
687 8, /* num_slots */
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
694 };
695
696 static const cpu_prefetch_tune tsv110_prefetch_tune =
697 {
698 0, /* num_slots */
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
705 };
706
707 static const cpu_prefetch_tune xgene1_prefetch_tune =
708 {
709 8, /* num_slots */
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
716 };
717
718 static const struct tune_params generic_tunings =
719 {
720 &cortexa57_extra_costs,
721 &generic_addrcost_table,
722 &generic_regmove_cost,
723 &generic_vector_cost,
724 &generic_branch_cost,
725 &generic_approx_modes,
726 SVE_NOT_IMPLEMENTED, /* sve_width */
727 4, /* memmov_cost */
728 2, /* issue_rate */
729 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
741 &generic_prefetch_tune
742 };
743
744 static const struct tune_params cortexa35_tunings =
745 {
746 &cortexa53_extra_costs,
747 &generic_addrcost_table,
748 &cortexa53_regmove_cost,
749 &generic_vector_cost,
750 &generic_branch_cost,
751 &generic_approx_modes,
752 SVE_NOT_IMPLEMENTED, /* sve_width */
753 4, /* memmov_cost */
754 1, /* issue_rate */
755 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
768 &generic_prefetch_tune
769 };
770
771 static const struct tune_params cortexa53_tunings =
772 {
773 &cortexa53_extra_costs,
774 &generic_addrcost_table,
775 &cortexa53_regmove_cost,
776 &generic_vector_cost,
777 &generic_branch_cost,
778 &generic_approx_modes,
779 SVE_NOT_IMPLEMENTED, /* sve_width */
780 4, /* memmov_cost */
781 2, /* issue_rate */
782 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
796 };
797
798 static const struct tune_params cortexa57_tunings =
799 {
800 &cortexa57_extra_costs,
801 &generic_addrcost_table,
802 &cortexa57_regmove_cost,
803 &cortexa57_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 SVE_NOT_IMPLEMENTED, /* sve_width */
807 4, /* memmov_cost */
808 3, /* issue_rate */
809 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
822 &generic_prefetch_tune
823 };
824
825 static const struct tune_params cortexa72_tunings =
826 {
827 &cortexa57_extra_costs,
828 &generic_addrcost_table,
829 &cortexa57_regmove_cost,
830 &cortexa57_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 SVE_NOT_IMPLEMENTED, /* sve_width */
834 4, /* memmov_cost */
835 3, /* issue_rate */
836 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
849 &generic_prefetch_tune
850 };
851
852 static const struct tune_params cortexa73_tunings =
853 {
854 &cortexa57_extra_costs,
855 &generic_addrcost_table,
856 &cortexa57_regmove_cost,
857 &cortexa57_vector_cost,
858 &generic_branch_cost,
859 &generic_approx_modes,
860 SVE_NOT_IMPLEMENTED, /* sve_width */
861 4, /* memmov_cost. */
862 2, /* issue_rate. */
863 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
876 &generic_prefetch_tune
877 };
878
879
880
881 static const struct tune_params exynosm1_tunings =
882 {
883 &exynosm1_extra_costs,
884 &exynosm1_addrcost_table,
885 &exynosm1_regmove_cost,
886 &exynosm1_vector_cost,
887 &generic_branch_cost,
888 &exynosm1_approx_modes,
889 SVE_NOT_IMPLEMENTED, /* sve_width */
890 4, /* memmov_cost */
891 3, /* issue_rate */
892 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
904 &exynosm1_prefetch_tune
905 };
906
907 static const struct tune_params thunderxt88_tunings =
908 {
909 &thunderx_extra_costs,
910 &generic_addrcost_table,
911 &thunderx_regmove_cost,
912 &thunderx_vector_cost,
913 &generic_branch_cost,
914 &generic_approx_modes,
915 SVE_NOT_IMPLEMENTED, /* sve_width */
916 6, /* memmov_cost */
917 2, /* issue_rate */
918 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
930 &thunderxt88_prefetch_tune
931 };
932
933 static const struct tune_params thunderx_tunings =
934 {
935 &thunderx_extra_costs,
936 &generic_addrcost_table,
937 &thunderx_regmove_cost,
938 &thunderx_vector_cost,
939 &generic_branch_cost,
940 &generic_approx_modes,
941 SVE_NOT_IMPLEMENTED, /* sve_width */
942 6, /* memmov_cost */
943 2, /* issue_rate */
944 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
957 &thunderx_prefetch_tune
958 };
959
960 static const struct tune_params tsv110_tunings =
961 {
962 &tsv110_extra_costs,
963 &tsv110_addrcost_table,
964 &tsv110_regmove_cost,
965 &tsv110_vector_cost,
966 &generic_branch_cost,
967 &generic_approx_modes,
968 SVE_NOT_IMPLEMENTED, /* sve_width */
969 4, /* memmov_cost */
970 4, /* issue_rate */
971 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
972 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
984 &tsv110_prefetch_tune
985 };
986
987 static const struct tune_params xgene1_tunings =
988 {
989 &xgene1_extra_costs,
990 &xgene1_addrcost_table,
991 &xgene1_regmove_cost,
992 &xgene1_vector_cost,
993 &generic_branch_cost,
994 &xgene1_approx_modes,
995 SVE_NOT_IMPLEMENTED, /* sve_width */
996 6, /* memmov_cost */
997 4, /* issue_rate */
998 AARCH64_FUSE_NOTHING, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1010 &xgene1_prefetch_tune
1011 };
1012
1013 static const struct tune_params emag_tunings =
1014 {
1015 &xgene1_extra_costs,
1016 &xgene1_addrcost_table,
1017 &xgene1_regmove_cost,
1018 &xgene1_vector_cost,
1019 &generic_branch_cost,
1020 &xgene1_approx_modes,
1021 SVE_NOT_IMPLEMENTED,
1022 6, /* memmov_cost */
1023 4, /* issue_rate */
1024 AARCH64_FUSE_NOTHING, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1036 &xgene1_prefetch_tune
1037 };
1038
1039 static const struct tune_params qdf24xx_tunings =
1040 {
1041 &qdf24xx_extra_costs,
1042 &qdf24xx_addrcost_table,
1043 &qdf24xx_regmove_cost,
1044 &qdf24xx_vector_cost,
1045 &generic_branch_cost,
1046 &generic_approx_modes,
1047 SVE_NOT_IMPLEMENTED, /* sve_width */
1048 4, /* memmov_cost */
1049 4, /* issue_rate */
1050 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1064 };
1065
1066 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1067 for now. */
1068 static const struct tune_params saphira_tunings =
1069 {
1070 &generic_extra_costs,
1071 &generic_addrcost_table,
1072 &generic_regmove_cost,
1073 &generic_vector_cost,
1074 &generic_branch_cost,
1075 &generic_approx_modes,
1076 SVE_NOT_IMPLEMENTED, /* sve_width */
1077 4, /* memmov_cost */
1078 4, /* issue_rate */
1079 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1092 &generic_prefetch_tune
1093 };
1094
1095 static const struct tune_params thunderx2t99_tunings =
1096 {
1097 &thunderx2t99_extra_costs,
1098 &thunderx2t99_addrcost_table,
1099 &thunderx2t99_regmove_cost,
1100 &thunderx2t99_vector_cost,
1101 &generic_branch_cost,
1102 &generic_approx_modes,
1103 SVE_NOT_IMPLEMENTED, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1120 };
1121
1122 static const struct tune_params neoversen1_tunings =
1123 {
1124 &cortexa57_extra_costs,
1125 &generic_addrcost_table,
1126 &generic_regmove_cost,
1127 &cortexa57_vector_cost,
1128 &generic_branch_cost,
1129 &generic_approx_modes,
1130 SVE_NOT_IMPLEMENTED, /* sve_width */
1131 4, /* memmov_cost */
1132 3, /* issue_rate */
1133 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "32:16", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1145 &generic_prefetch_tune
1146 };
1147
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
1150 {
1151 const char* name;
1152 void (*parse_override)(const char*, struct tune_params*);
1153 };
1154
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1158
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions[] =
1161 {
1162 { "fuse", aarch64_parse_fuse_string },
1163 { "tune", aarch64_parse_tune_string },
1164 { "sve_width", aarch64_parse_sve_width_string },
1165 { NULL, NULL }
1166 };
1167
1168 /* A processor implementing AArch64. */
1169 struct processor
1170 {
1171 const char *const name;
1172 enum aarch64_processor ident;
1173 enum aarch64_processor sched_core;
1174 enum aarch64_arch arch;
1175 unsigned architecture_version;
1176 const uint64_t flags;
1177 const struct tune_params *const tune;
1178 };
1179
1180 /* Architectures implementing AArch64. */
1181 static const struct processor all_architectures[] =
1182 {
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1187 };
1188
1189 /* Processor cores implementing AArch64. */
1190 static const struct processor all_cores[] =
1191 {
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1198 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1199 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1200 };
1201
1202
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor *selected_arch;
1206 static const struct processor *selected_cpu;
1207 static const struct processor *selected_tune;
1208
1209 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1210
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params = generic_tunings;
1213
1214 /* Table of machine attributes. */
1215 static const struct attribute_spec aarch64_attribute_table[] =
1216 {
1217 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1218 affects_type_identity, handler, exclude } */
1219 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1220 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1221 };
1222
1223 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1224
1225 /* An ISA extension in the co-processor and main instruction set space. */
1226 struct aarch64_option_extension
1227 {
1228 const char *const name;
1229 const unsigned long flags_on;
1230 const unsigned long flags_off;
1231 };
1232
1233 typedef enum aarch64_cond_code
1234 {
1235 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1236 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1237 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1238 }
1239 aarch64_cc;
1240
1241 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
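/* Illustrative note (not part of the original source): the condition codes
   above are laid out in complementary pairs, so flipping the low bit yields
   the inverse condition, e.g.:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT  */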
1242
1243 struct aarch64_branch_protect_type
1244 {
1245 /* The type's name that the user passes to the branch-protection option
1246 string. */
1247 const char* name;
1248 /* Function to handle the protection type and set global variables.
1249 First argument is the string token corresponding to this type and the
1250 second argument is the next token in the option string.
1251 Return values:
1252 * AARCH64_PARSE_OK: Handling was successful.
1253 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and
1254 the caller should print an error.
1255 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1256 prints its own error. */
1257 enum aarch64_parse_opt_result (*handler)(char*, char*);
1258 /* A list of types that can follow this type in the option string. */
1259 const aarch64_branch_protect_type* subtypes;
1260 unsigned int num_subtypes;
1261 };
1262
1263 static enum aarch64_parse_opt_result
1264 aarch64_handle_no_branch_protection (char* str, char* rest)
1265 {
1266 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1267 aarch64_enable_bti = 0;
1268 if (rest)
1269 {
1270 error ("unexpected %<%s%> after %<%s%>", rest, str);
1271 return AARCH64_PARSE_INVALID_FEATURE;
1272 }
1273 return AARCH64_PARSE_OK;
1274 }
1275
1276 static enum aarch64_parse_opt_result
1277 aarch64_handle_standard_branch_protection (char* str, char* rest)
1278 {
1279 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1280 aarch64_ra_sign_key = AARCH64_KEY_A;
1281 aarch64_enable_bti = 1;
1282 if (rest)
1283 {
1284 error ("unexpected %<%s%> after %<%s%>", rest, str);
1285 return AARCH64_PARSE_INVALID_FEATURE;
1286 }
1287 return AARCH64_PARSE_OK;
1288 }
1289
1290 static enum aarch64_parse_opt_result
1291 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1292 char* rest ATTRIBUTE_UNUSED)
1293 {
1294 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1295 aarch64_ra_sign_key = AARCH64_KEY_A;
1296 return AARCH64_PARSE_OK;
1297 }
1298
1299 static enum aarch64_parse_opt_result
1300 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1301 char* rest ATTRIBUTE_UNUSED)
1302 {
1303 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1304 return AARCH64_PARSE_OK;
1305 }
1306
1307 static enum aarch64_parse_opt_result
1308 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1309 char* rest ATTRIBUTE_UNUSED)
1310 {
1311 aarch64_ra_sign_key = AARCH64_KEY_B;
1312 return AARCH64_PARSE_OK;
1313 }
1314
1315 static enum aarch64_parse_opt_result
1316 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1317 char* rest ATTRIBUTE_UNUSED)
1318 {
1319 aarch64_enable_bti = 1;
1320 return AARCH64_PARSE_OK;
1321 }
1322
1323 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1324 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1325 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1326 { NULL, NULL, NULL, 0 }
1327 };
1328
1329 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1330 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1331 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1332 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1333 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1334 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1335 { NULL, NULL, NULL, 0 }
1336 };
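/* Illustrative walk-through (not part of the original source): given
   -mbranch-protection=pac-ret+leaf+b-key, the parser would match "pac-ret"
   in aarch64_branch_protect_types (setting aarch64_ra_sign_scope to
   AARCH64_FUNCTION_NON_LEAF and the signing key to AARCH64_KEY_A) and then
   the "leaf" and "b-key" subtypes (widening the scope to AARCH64_FUNCTION_ALL
   and switching to AARCH64_KEY_B).  The token splitting itself is handled by
   aarch64_parse_branch_protection, declared earlier in this file.  */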
1337
1338 /* The condition codes of the processor, and the inverse function. */
1339 static const char * const aarch64_condition_codes[] =
1340 {
1341 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1342 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1343 };
1344
1345 /* The preferred condition codes for SVE conditions. */
1346 static const char *const aarch64_sve_condition_codes[] =
1347 {
1348 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1349 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1350 };
1351
1352 /* Return the assembly token for svpattern value VALUE. */
1353
1354 static const char *
1355 svpattern_token (enum aarch64_svpattern pattern)
1356 {
1357 switch (pattern)
1358 {
1359 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1360 AARCH64_FOR_SVPATTERN (CASE)
1361 #undef CASE
1362 case AARCH64_NUM_SVPATTERNS:
1363 break;
1364 }
1365 gcc_unreachable ();
1366 }
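/* Illustrative examples (not part of the original source), assuming the
   usual AARCH64_FOR_SVPATTERN entries:

     svpattern_token (AARCH64_SV_ALL) == "all"
     svpattern_token (AARCH64_SV_VL8) == "vl8"  */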
1367
1368 /* Generate code to enable conditional branches in functions over 1 MiB. */
1369 const char *
1370 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1371 const char * branch_format)
1372 {
1373 rtx_code_label * tmp_label = gen_label_rtx ();
1374 char label_buf[256];
1375 char buffer[128];
1376 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1377 CODE_LABEL_NUMBER (tmp_label));
1378 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1379 rtx dest_label = operands[pos_label];
1380 operands[pos_label] = tmp_label;
1381
1382 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1383 output_asm_insn (buffer, operands);
1384
1385 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1386 operands[pos_label] = dest_label;
1387 output_asm_insn (buffer, operands);
1388 return "";
1389 }
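/* Illustrative sketch (not part of the original source) of the sequence this
   emits when a caller passes an inverted conditional branch as BRANCH_FORMAT,
   e.g. "cbz\t%0, " in order to implement a far "cbnz":

        cbz     x0, .Ltmp       // short-range branch around...
        b       <far target>    // ...an unconditional branch that can reach
     .Ltmp:

   where .Ltmp stands for the internal label generated above.  */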
1390
1391 void
1392 aarch64_err_no_fpadvsimd (machine_mode mode)
1393 {
1394 if (TARGET_GENERAL_REGS_ONLY)
1395 if (FLOAT_MODE_P (mode))
1396 error ("%qs is incompatible with the use of floating-point types",
1397 "-mgeneral-regs-only");
1398 else
1399 error ("%qs is incompatible with the use of vector types",
1400 "-mgeneral-regs-only");
1401 else
1402 if (FLOAT_MODE_P (mode))
1403 error ("%qs feature modifier is incompatible with the use of"
1404 " floating-point types", "+nofp");
1405 else
1406 error ("%qs feature modifier is incompatible with the use of"
1407 " vector types", "+nofp");
1408 }
1409
1410 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1411 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1412 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1413 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1414 and GENERAL_REGS is lower than the memory cost (in this case the best class
1415 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1416 cost results in bad allocations with many redundant int<->FP moves which
1417 are expensive on various cores.
1418 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1419 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1420 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1421 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1422 The result of this is that it is no longer inefficient to have a higher
1423 memory move cost than the register move cost.
1424 */
1425
1426 static reg_class_t
1427 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1428 reg_class_t best_class)
1429 {
1430 machine_mode mode;
1431
1432 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1433 || !reg_class_subset_p (FP_REGS, allocno_class))
1434 return allocno_class;
1435
1436 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1437 || !reg_class_subset_p (FP_REGS, best_class))
1438 return best_class;
1439
1440 mode = PSEUDO_REGNO_MODE (regno);
1441 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1442 }
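/* Worked example (not part of the original source): if both ALLOCNO_CLASS and
   BEST_CLASS are POINTER_AND_FP_REGS, a DFmode or V4SImode pseudo is narrowed
   to FP_REGS while a DImode pseudo is narrowed to GENERAL_REGS, so the
   allocator never costs the combined class.  */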
1443
1444 static unsigned int
1445 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1446 {
1447 if (GET_MODE_UNIT_SIZE (mode) == 4)
1448 return aarch64_tune_params.min_div_recip_mul_sf;
1449 return aarch64_tune_params.min_div_recip_mul_df;
1450 }
1451
1452 /* Return the reassociation width of treeop OPC with mode MODE. */
1453 static int
1454 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1455 {
1456 if (VECTOR_MODE_P (mode))
1457 return aarch64_tune_params.vec_reassoc_width;
1458 if (INTEGRAL_MODE_P (mode))
1459 return aarch64_tune_params.int_reassoc_width;
1460 /* Avoid reassociating floating point addition so we emit more FMAs. */
1461 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1462 return aarch64_tune_params.fp_reassoc_width;
1463 return 1;
1464 }
1465
1466 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1467 unsigned
1468 aarch64_dbx_register_number (unsigned regno)
1469 {
1470 if (GP_REGNUM_P (regno))
1471 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1472 else if (regno == SP_REGNUM)
1473 return AARCH64_DWARF_SP;
1474 else if (FP_REGNUM_P (regno))
1475 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1476 else if (PR_REGNUM_P (regno))
1477 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1478 else if (regno == VG_REGNUM)
1479 return AARCH64_DWARF_VG;
1480
1481 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1482 equivalent DWARF register. */
1483 return DWARF_FRAME_REGISTERS;
1484 }
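/* Illustrative mappings (not part of the original source), assuming the
   DWARF numbering from the AArch64 ABI (R0..R30 -> 0..30, SP -> 31,
   V0..V31 -> 64..95):

     aarch64_dbx_register_number (R0_REGNUM + 5) == 5    // x5
     aarch64_dbx_register_number (V0_REGNUM + 3) == 67   // v3  */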
1485
1486 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1487 integer, otherwise return X unmodified. */
1488 static rtx
1489 aarch64_bit_representation (rtx x)
1490 {
1491 if (CONST_DOUBLE_P (x))
1492 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1493 return x;
1494 }
1495
1496 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1497 static bool
1498 aarch64_advsimd_struct_mode_p (machine_mode mode)
1499 {
1500 return (TARGET_SIMD
1501 && (mode == OImode || mode == CImode || mode == XImode));
1502 }
1503
1504 /* Return true if MODE is an SVE predicate mode. */
1505 static bool
1506 aarch64_sve_pred_mode_p (machine_mode mode)
1507 {
1508 return (TARGET_SVE
1509 && (mode == VNx16BImode
1510 || mode == VNx8BImode
1511 || mode == VNx4BImode
1512 || mode == VNx2BImode));
1513 }
1514
1515 /* Three mutually-exclusive flags describing a vector or predicate type. */
1516 const unsigned int VEC_ADVSIMD = 1;
1517 const unsigned int VEC_SVE_DATA = 2;
1518 const unsigned int VEC_SVE_PRED = 4;
1519 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1520 a structure of 2, 3 or 4 vectors. */
1521 const unsigned int VEC_STRUCT = 8;
1522 /* Useful combinations of the above. */
1523 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1524 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1525
1526 /* Return a set of flags describing the vector properties of mode MODE.
1527 Ignore modes that are not supported by the current target. */
1528 static unsigned int
1529 aarch64_classify_vector_mode (machine_mode mode)
1530 {
1531 if (aarch64_advsimd_struct_mode_p (mode))
1532 return VEC_ADVSIMD | VEC_STRUCT;
1533
1534 if (aarch64_sve_pred_mode_p (mode))
1535 return VEC_SVE_PRED;
1536
1537 /* Make the decision based on the mode's enum value rather than its
1538 properties, so that we keep the correct classification regardless
1539 of -msve-vector-bits. */
1540 switch (mode)
1541 {
1542 /* Single SVE vectors. */
1543 case E_VNx16QImode:
1544 case E_VNx8HImode:
1545 case E_VNx4SImode:
1546 case E_VNx2DImode:
1547 case E_VNx8HFmode:
1548 case E_VNx4SFmode:
1549 case E_VNx2DFmode:
1550 return TARGET_SVE ? VEC_SVE_DATA : 0;
1551
1552 /* x2 SVE vectors. */
1553 case E_VNx32QImode:
1554 case E_VNx16HImode:
1555 case E_VNx8SImode:
1556 case E_VNx4DImode:
1557 case E_VNx16HFmode:
1558 case E_VNx8SFmode:
1559 case E_VNx4DFmode:
1560 /* x3 SVE vectors. */
1561 case E_VNx48QImode:
1562 case E_VNx24HImode:
1563 case E_VNx12SImode:
1564 case E_VNx6DImode:
1565 case E_VNx24HFmode:
1566 case E_VNx12SFmode:
1567 case E_VNx6DFmode:
1568 /* x4 SVE vectors. */
1569 case E_VNx64QImode:
1570 case E_VNx32HImode:
1571 case E_VNx16SImode:
1572 case E_VNx8DImode:
1573 case E_VNx32HFmode:
1574 case E_VNx16SFmode:
1575 case E_VNx8DFmode:
1576 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1577
1578 /* 64-bit Advanced SIMD vectors. */
1579 case E_V8QImode:
1580 case E_V4HImode:
1581 case E_V2SImode:
1582 /* ...E_V1DImode doesn't exist. */
1583 case E_V4HFmode:
1584 case E_V2SFmode:
1585 case E_V1DFmode:
1586 /* 128-bit Advanced SIMD vectors. */
1587 case E_V16QImode:
1588 case E_V8HImode:
1589 case E_V4SImode:
1590 case E_V2DImode:
1591 case E_V8HFmode:
1592 case E_V4SFmode:
1593 case E_V2DFmode:
1594 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1595
1596 default:
1597 return 0;
1598 }
1599 }
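/* Illustrative classifications (not part of the original source), assuming
   the corresponding target features are enabled:

     V2DImode    -> VEC_ADVSIMD                (128-bit Advanced SIMD)
     OImode      -> VEC_ADVSIMD | VEC_STRUCT   (Advanced SIMD structure)
     VNx4SImode  -> VEC_SVE_DATA               (single SVE vector)
     VNx8SImode  -> VEC_SVE_DATA | VEC_STRUCT  (x2 SVE tuple)
     VNx4BImode  -> VEC_SVE_PRED               (SVE predicate)
     DImode      -> 0                          (not a vector mode)  */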
1600
1601 /* Return true if MODE is any of the data vector modes, including
1602 structure modes. */
1603 static bool
1604 aarch64_vector_data_mode_p (machine_mode mode)
1605 {
1606 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1607 }
1608
1609 /* Return true if MODE is any form of SVE mode, including predicates,
1610 vectors and structures. */
1611 bool
1612 aarch64_sve_mode_p (machine_mode mode)
1613 {
1614 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1615 }
1616
1617 /* Return true if MODE is an SVE data vector mode; either a single vector
1618 or a structure of vectors. */
1619 static bool
1620 aarch64_sve_data_mode_p (machine_mode mode)
1621 {
1622 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1623 }
1624
1625 /* Implement target hook TARGET_ARRAY_MODE. */
1626 static opt_machine_mode
1627 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1628 {
1629 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1630 && IN_RANGE (nelems, 2, 4))
1631 return mode_for_vector (GET_MODE_INNER (mode),
1632 GET_MODE_NUNITS (mode) * nelems);
1633
1634 return opt_machine_mode ();
1635 }
1636
1637 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1638 static bool
1639 aarch64_array_mode_supported_p (machine_mode mode,
1640 unsigned HOST_WIDE_INT nelems)
1641 {
1642 if (TARGET_SIMD
1643 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1644 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1645 && (nelems >= 2 && nelems <= 4))
1646 return true;
1647
1648 return false;
1649 }
1650
1651 /* Return the SVE predicate mode to use for elements that have
1652 ELEM_NBYTES bytes, if such a mode exists. */
1653
1654 opt_machine_mode
1655 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1656 {
1657 if (TARGET_SVE)
1658 {
1659 if (elem_nbytes == 1)
1660 return VNx16BImode;
1661 if (elem_nbytes == 2)
1662 return VNx8BImode;
1663 if (elem_nbytes == 4)
1664 return VNx4BImode;
1665 if (elem_nbytes == 8)
1666 return VNx2BImode;
1667 }
1668 return opt_machine_mode ();
1669 }
1670
1671 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1672
1673 static opt_machine_mode
1674 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1675 {
1676 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1677 {
1678 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1679 machine_mode pred_mode;
1680 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1681 return pred_mode;
1682 }
1683
1684 return default_get_mask_mode (nunits, nbytes);
1685 }
1686
1687 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1688
1689 static opt_machine_mode
1690 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1691 {
1692 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1693 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1694 machine_mode mode;
1695 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1696 if (inner_mode == GET_MODE_INNER (mode)
1697 && known_eq (nunits, GET_MODE_NUNITS (mode))
1698 && aarch64_sve_data_mode_p (mode))
1699 return mode;
1700 return opt_machine_mode ();
1701 }
1702
1703 /* Return the integer element mode associated with SVE mode MODE. */
1704
1705 static scalar_int_mode
1706 aarch64_sve_element_int_mode (machine_mode mode)
1707 {
1708 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1709 GET_MODE_NUNITS (mode));
1710 return int_mode_for_size (elt_bits, 0).require ();
1711 }
1712
1713 /* Return the integer vector mode associated with SVE mode MODE.
1714 Unlike mode_for_int_vector, this can handle the case in which
1715 MODE is a predicate (and thus has a different total size). */
1716
1717 static machine_mode
1718 aarch64_sve_int_mode (machine_mode mode)
1719 {
1720 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1721 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1722 }
1723
1724 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1725 prefer to use the first arithmetic operand as the else value if
1726 the else value doesn't matter, since that exactly matches the SVE
1727 destructive merging form. For ternary operations we could either
1728 pick the first operand and use FMAD-like instructions or the last
1729 operand and use FMLA-like instructions; the latter seems more
1730 natural. */
1731
1732 static tree
1733 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1734 {
1735 return nops == 3 ? ops[2] : ops[0];
1736 }
1737
1738 /* Implement TARGET_HARD_REGNO_NREGS. */
1739
1740 static unsigned int
1741 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1742 {
1743 /* ??? Logically we should only need to provide a value when
1744 HARD_REGNO_MODE_OK says that the combination is valid,
1745 but at the moment we need to handle all modes. Just ignore
1746 any runtime parts for registers that can't store them. */
1747 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1748 switch (aarch64_regno_regclass (regno))
1749 {
1750 case FP_REGS:
1751 case FP_LO_REGS:
1752 case FP_LO8_REGS:
1753 if (aarch64_sve_data_mode_p (mode))
1754 return exact_div (GET_MODE_SIZE (mode),
1755 BYTES_PER_SVE_VECTOR).to_constant ();
1756 return CEIL (lowest_size, UNITS_PER_VREG);
1757 case PR_REGS:
1758 case PR_LO_REGS:
1759 case PR_HI_REGS:
1760 return 1;
1761 default:
1762 return CEIL (lowest_size, UNITS_PER_WORD);
1763 }
1764 gcc_unreachable ();
1765 }
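/* Worked examples (not part of the original source), assuming 64-bit GP
   registers and 128-bit minimum vector registers:

     TImode in a GP register       -> 2 registers (16 bytes / UNITS_PER_WORD)
     V4SImode in an FP register    -> 1 register  (16 bytes / UNITS_PER_VREG)
     VNx16QImode in an FP register -> 1 register  (one SVE data vector)
     VNx4BImode in a P register    -> 1 register  */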
1766
1767 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1768
1769 static bool
1770 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1771 {
1772 if (GET_MODE_CLASS (mode) == MODE_CC)
1773 return regno == CC_REGNUM;
1774
1775 if (regno == VG_REGNUM)
1776 /* This must have the same size as _Unwind_Word. */
1777 return mode == DImode;
1778
1779 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1780 if (vec_flags & VEC_SVE_PRED)
1781 return PR_REGNUM_P (regno);
1782
1783 if (PR_REGNUM_P (regno))
1784 return 0;
1785
1786 if (regno == SP_REGNUM)
1787 /* The purpose of comparing with ptr_mode is to support the
1788 global register variable associated with the stack pointer
1789 register via the syntax of asm ("wsp") in ILP32. */
1790 return mode == Pmode || mode == ptr_mode;
1791
1792 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1793 return mode == Pmode;
1794
1795 if (GP_REGNUM_P (regno))
1796 {
1797 if (known_le (GET_MODE_SIZE (mode), 8))
1798 return true;
1799 else if (known_le (GET_MODE_SIZE (mode), 16))
1800 return (regno & 1) == 0;
1801 }
1802 else if (FP_REGNUM_P (regno))
1803 {
1804 if (vec_flags & VEC_STRUCT)
1805 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1806 else
1807 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1808 }
1809
1810 return false;
1811 }
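/* Worked examples (not part of the original source): CCmode is accepted only
   in CC_REGNUM; TImode is accepted in an even-numbered GP register such as x0
   (occupying the pair x0/x1) but not starting at x1; SVE predicate modes such
   as VNx16BImode are accepted only in the P registers.  */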
1812
1813 /* Return true if this is a definition of a vectorized simd function. */
1814
1815 static bool
1816 aarch64_simd_decl_p (tree fndecl)
1817 {
1818 tree fntype;
1819
1820 if (fndecl == NULL)
1821 return false;
1822 fntype = TREE_TYPE (fndecl);
1823 if (fntype == NULL)
1824 return false;
1825
1826 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1827 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1828 return true;
1829
1830 return false;
1831 }
1832
1833 /* Return the mode a register save/restore should use. DImode for integer
1834 registers, DFmode for FP registers in non-SIMD functions (they only save
1835 the bottom half of a 128-bit register), or TFmode for FP registers in
1836 SIMD functions. */
1837
1838 static machine_mode
1839 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1840 {
1841 return GP_REGNUM_P (regno)
1842 ? E_DImode
1843 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1844 }
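
/* For instance, under this scheme x19 is saved in DImode, v8 in an
   ordinary function is saved in DFmode (only d8 is preserved by the
   base PCS), and v8 in an aarch64_vector_pcs function is saved in
   TFmode so that the full 128-bit q8 is preserved.  (Illustrative
   register choices.)  */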
1845
1846 /* Return true if the instruction is a call to a SIMD function, false
1847 if it is not a SIMD function or if we do not know anything about
1848 the function. */
1849
1850 static bool
1851 aarch64_simd_call_p (rtx_insn *insn)
1852 {
1853 rtx symbol;
1854 rtx call;
1855 tree fndecl;
1856
1857 gcc_assert (CALL_P (insn));
1858 call = get_call_rtx_from (insn);
1859 symbol = XEXP (XEXP (call, 0), 0);
1860 if (GET_CODE (symbol) != SYMBOL_REF)
1861 return false;
1862 fndecl = SYMBOL_REF_DECL (symbol);
1863 if (!fndecl)
1864 return false;
1865
1866 return aarch64_simd_decl_p (fndecl);
1867 }
1868
1869 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1870 a function that uses the SIMD ABI, take advantage of the extra
1871 call-preserved registers that the ABI provides. */
1872
1873 void
1874 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1875 HARD_REG_SET *return_set)
1876 {
1877 if (aarch64_simd_call_p (insn))
1878 {
1879 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1880 if (FP_SIMD_SAVED_REGNUM_P (regno))
1881 CLEAR_HARD_REG_BIT (*return_set, regno);
1882 }
1883 }
1884
1885 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1886 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1887 clobbers the top 64 bits when restoring the bottom 64 bits. */
1888
1889 static bool
1890 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1891 machine_mode mode)
1892 {
1893 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1894 return FP_REGNUM_P (regno)
1895 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1896 }
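
/* For example (illustrative register and modes): a V4SImode value live
   in v8 across an ordinary call is part-clobbered, because only the low
   64 bits of v8 survive; across a call to an aarch64_vector_pcs
   function it is not, since such calls preserve the full 128 bits.  A
   DFmode value is never part-clobbered here, while an SVE mode such as
   VNx4SImode is part-clobbered even across SIMD calls, because its size
   may exceed 128 bits.  */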
1897
1898 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1899
1900 rtx_insn *
1901 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1902 {
1903 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1904
1905 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1906 return call_1;
1907 else
1908 return call_2;
1909 }
1910
1911 /* Implement REGMODE_NATURAL_SIZE. */
1912 poly_uint64
1913 aarch64_regmode_natural_size (machine_mode mode)
1914 {
1915 /* The natural size for SVE data modes is one SVE data vector,
1916 and similarly for predicates. We can't independently modify
1917 anything smaller than that. */
1918 /* ??? For now, only do this for variable-width SVE registers.
1919 Doing it for constant-sized registers breaks lower-subreg.c. */
1920 /* ??? And once that's fixed, we should probably have similar
1921 code for Advanced SIMD. */
1922 if (!aarch64_sve_vg.is_constant ())
1923 {
1924 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1925 if (vec_flags & VEC_SVE_PRED)
1926 return BYTES_PER_SVE_PRED;
1927 if (vec_flags & VEC_SVE_DATA)
1928 return BYTES_PER_SVE_VECTOR;
1929 }
1930 return UNITS_PER_WORD;
1931 }
1932
1933 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1934 machine_mode
1935 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1936 machine_mode mode)
1937 {
1938 /* The predicate mode determines which bits are significant and
1939 which are "don't care". Decreasing the number of lanes would
1940 lose data while increasing the number of lanes would make bits
1941 unnecessarily significant. */
1942 if (PR_REGNUM_P (regno))
1943 return mode;
1944 if (known_ge (GET_MODE_SIZE (mode), 4))
1945 return mode;
1946 else
1947 return SImode;
1948 }
1949
1950 /* Return true if I's bits are consecutive ones from the MSB. */
1951 bool
1952 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1953 {
1954 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1955 }
1956
1957 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1958 that strcpy from constants will be faster. */
1959
1960 static HOST_WIDE_INT
1961 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1962 {
1963 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1964 return MAX (align, BITS_PER_WORD);
1965 return align;
1966 }
1967
1968 /* Return true if calls to DECL should be treated as
1969 long-calls (i.e. called via a register). */
1970 static bool
1971 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1972 {
1973 return false;
1974 }
1975
1976 /* Return true if calls to symbol-ref SYM should be treated as
1977 long-calls (i.e. called via a register). */
1978 bool
1979 aarch64_is_long_call_p (rtx sym)
1980 {
1981 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1982 }
1983
1984 /* Return true if calls to symbol-ref SYM should not go through
1985 plt stubs. */
1986
1987 bool
1988 aarch64_is_noplt_call_p (rtx sym)
1989 {
1990 const_tree decl = SYMBOL_REF_DECL (sym);
1991
1992 if (flag_pic
1993 && decl
1994 && (!flag_plt
1995 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1996 && !targetm.binds_local_p (decl))
1997 return true;
1998
1999 return false;
2000 }
2001
2002 /* Return true if the offsets to a zero/sign-extract operation
2003 represent an expression that matches an extend operation. The
2004 operands represent the parameters from
2005
2006 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2007 bool
2008 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2009 rtx extract_imm)
2010 {
2011 HOST_WIDE_INT mult_val, extract_val;
2012
2013 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2014 return false;
2015
2016 mult_val = INTVAL (mult_imm);
2017 extract_val = INTVAL (extract_imm);
2018
2019 if (extract_val > 8
2020 && extract_val < GET_MODE_BITSIZE (mode)
2021 && exact_log2 (extract_val & ~7) > 0
2022 && (extract_val & 7) <= 4
2023 && mult_val == (1 << (extract_val & 7)))
2024 return true;
2025
2026 return false;
2027 }
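
/* A worked example of the test above (illustrative values): MODE == DImode,
   MULT_IMM == 4 and EXTRACT_IMM == 34 satisfy every condition (34 > 8,
   34 < 64, 34 & ~7 == 32 is a power of two, 34 & 7 == 2 <= 4 and
   4 == 1 << 2), so the extract describes "extend the low 32 bits and
   shift left by 2", matching an extended-register operand such as
   UXTW #2 (or SXTW #2 for a sign_extract).  */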
2028
2029 /* Emit an insn that's a simple single-set. Both the operands must be
2030 known to be valid. */
2031 inline static rtx_insn *
2032 emit_set_insn (rtx x, rtx y)
2033 {
2034 return emit_insn (gen_rtx_SET (x, y));
2035 }
2036
2037 /* X and Y are two things to compare using CODE. Emit the compare insn and
2038 return the rtx for the CC register in the proper mode. */
2039 rtx
2040 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2041 {
2042 machine_mode mode = SELECT_CC_MODE (code, x, y);
2043 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
2044
2045 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2046 return cc_reg;
2047 }
2048
2049 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2050
2051 static rtx
2052 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2053 machine_mode y_mode)
2054 {
2055 if (y_mode == E_QImode || y_mode == E_HImode)
2056 {
2057 if (CONST_INT_P (y))
2058 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2059 else
2060 {
2061 rtx t, cc_reg;
2062 machine_mode cc_mode;
2063
2064 t = gen_rtx_ZERO_EXTEND (SImode, y);
2065 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2066 cc_mode = CC_SWPmode;
2067 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2068 emit_set_insn (cc_reg, t);
2069 return cc_reg;
2070 }
2071 }
2072
2073 if (!aarch64_plus_operand (y, y_mode))
2074 y = force_reg (y_mode, y);
2075
2076 return aarch64_gen_compare_reg (code, x, y);
2077 }
2078
2079 /* Build the SYMBOL_REF for __tls_get_addr. */
2080
2081 static GTY(()) rtx tls_get_addr_libfunc;
2082
2083 rtx
2084 aarch64_tls_get_addr (void)
2085 {
2086 if (!tls_get_addr_libfunc)
2087 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2088 return tls_get_addr_libfunc;
2089 }
2090
2091 /* Return the TLS model to use for ADDR. */
2092
2093 static enum tls_model
2094 tls_symbolic_operand_type (rtx addr)
2095 {
2096 enum tls_model tls_kind = TLS_MODEL_NONE;
2097 if (GET_CODE (addr) == CONST)
2098 {
2099 poly_int64 addend;
2100 rtx sym = strip_offset (addr, &addend);
2101 if (GET_CODE (sym) == SYMBOL_REF)
2102 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2103 }
2104 else if (GET_CODE (addr) == SYMBOL_REF)
2105 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2106
2107 return tls_kind;
2108 }
2109
2110 /* We allow LO_SUMs in our legitimate addresses, so that combine can
2111 take care of combining addresses where necessary; but for generation
2112 purposes, we generate the address
2113 as:
2114 RTL Absolute
2115 tmp = hi (symbol_ref); adrp x1, foo
2116 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2117 nop
2118
2119 PIC TLS
2120 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2121 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2122 bl __tls_get_addr
2123 nop
2124
2125 Load TLS symbol, depending on TLS mechanism and TLS access model.
2126
2127 Global Dynamic - Traditional TLS:
2128 adrp tmp, :tlsgd:imm
2129 add dest, tmp, #:tlsgd_lo12:imm
2130 bl __tls_get_addr
2131
2132 Global Dynamic - TLS Descriptors:
2133 adrp dest, :tlsdesc:imm
2134 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2135 add dest, dest, #:tlsdesc_lo12:imm
2136 blr tmp
2137 mrs tp, tpidr_el0
2138 add dest, dest, tp
2139
2140 Initial Exec:
2141 mrs tp, tpidr_el0
2142 adrp tmp, :gottprel:imm
2143 ldr dest, [tmp, #:gottprel_lo12:imm]
2144 add dest, dest, tp
2145
2146 Local Exec:
2147 mrs tp, tpidr_el0
2148 add t0, tp, #:tprel_hi12:imm, lsl #12
2149 add t0, t0, #:tprel_lo12_nc:imm
2150 */
2151
2152 static void
2153 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2154 enum aarch64_symbol_type type)
2155 {
2156 switch (type)
2157 {
2158 case SYMBOL_SMALL_ABSOLUTE:
2159 {
2160 /* In ILP32, the mode of dest can be either SImode or DImode. */
2161 rtx tmp_reg = dest;
2162 machine_mode mode = GET_MODE (dest);
2163
2164 gcc_assert (mode == Pmode || mode == ptr_mode);
2165
2166 if (can_create_pseudo_p ())
2167 tmp_reg = gen_reg_rtx (mode);
2168
2169 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2170 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2171 return;
2172 }
2173
2174 case SYMBOL_TINY_ABSOLUTE:
2175 emit_insn (gen_rtx_SET (dest, imm));
2176 return;
2177
2178 case SYMBOL_SMALL_GOT_28K:
2179 {
2180 machine_mode mode = GET_MODE (dest);
2181 rtx gp_rtx = pic_offset_table_rtx;
2182 rtx insn;
2183 rtx mem;
2184
2185 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2186 here before RTL expansion. The tree IVOPTS pass will generate RTL
2187 patterns to decide rtx costs, in which case pic_offset_table_rtx is
2188 not initialized. In that case there is no need to generate the first
2189 adrp instruction, as the final cost for a global variable access is
2190 one instruction. */
2191 if (gp_rtx != NULL)
2192 {
2193 /* -fpic with -mcmodel=small allows a 32K GOT table size (but because
2194 we use the page base as the GOT base, the first page may be wasted;
2195 in the worst case only 28K of space is left for the GOT).
2196
2197 The generated instruction sequence for accessing a global variable
2198 is:
2199
2200 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2201
2202 Only one instruction is needed, but we must initialize
2203 pic_offset_table_rtx properly. We generate an initialization insn
2204 for every global access and allow CSE to remove the redundant ones.
2205
2206 The final instruction sequence will look like the following
2207 for multiple global variable accesses.
2208
2209 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2210
2211 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2212 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2213 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2214 ... */
2215
2216 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2217 crtl->uses_pic_offset_table = 1;
2218 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2219
2220 if (mode != GET_MODE (gp_rtx))
2221 gp_rtx = gen_lowpart (mode, gp_rtx);
2222
2223 }
2224
2225 if (mode == ptr_mode)
2226 {
2227 if (mode == DImode)
2228 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2229 else
2230 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2231
2232 mem = XVECEXP (SET_SRC (insn), 0, 0);
2233 }
2234 else
2235 {
2236 gcc_assert (mode == Pmode);
2237
2238 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2239 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2240 }
2241
2242 /* The operand is expected to be a MEM. Whenever the related insn
2243 pattern changes, the code above that computes MEM should be
2244 updated. */
2245 gcc_assert (GET_CODE (mem) == MEM);
2246 MEM_READONLY_P (mem) = 1;
2247 MEM_NOTRAP_P (mem) = 1;
2248 emit_insn (insn);
2249 return;
2250 }
2251
2252 case SYMBOL_SMALL_GOT_4G:
2253 {
2254 /* In ILP32, the mode of dest can be either SImode or DImode,
2255 while the got entry is always of SImode size. The mode of
2256 dest depends on how dest is used: if dest is assigned to a
2257 pointer (e.g. in the memory), it has SImode; it may have
2258 DImode if dest is dereferenced to access the memory.
2259 This is why we have to handle three different ldr_got_small
2260 patterns here (two patterns for ILP32). */
2261
2262 rtx insn;
2263 rtx mem;
2264 rtx tmp_reg = dest;
2265 machine_mode mode = GET_MODE (dest);
2266
2267 if (can_create_pseudo_p ())
2268 tmp_reg = gen_reg_rtx (mode);
2269
2270 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2271 if (mode == ptr_mode)
2272 {
2273 if (mode == DImode)
2274 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2275 else
2276 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2277
2278 mem = XVECEXP (SET_SRC (insn), 0, 0);
2279 }
2280 else
2281 {
2282 gcc_assert (mode == Pmode);
2283
2284 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2285 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2286 }
2287
2288 gcc_assert (GET_CODE (mem) == MEM);
2289 MEM_READONLY_P (mem) = 1;
2290 MEM_NOTRAP_P (mem) = 1;
2291 emit_insn (insn);
2292 return;
2293 }
2294
2295 case SYMBOL_SMALL_TLSGD:
2296 {
2297 rtx_insn *insns;
2298 machine_mode mode = GET_MODE (dest);
2299 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2300
2301 start_sequence ();
2302 if (TARGET_ILP32)
2303 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2304 else
2305 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2306 insns = get_insns ();
2307 end_sequence ();
2308
2309 RTL_CONST_CALL_P (insns) = 1;
2310 emit_libcall_block (insns, dest, result, imm);
2311 return;
2312 }
2313
2314 case SYMBOL_SMALL_TLSDESC:
2315 {
2316 machine_mode mode = GET_MODE (dest);
2317 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2318 rtx tp;
2319
2320 gcc_assert (mode == Pmode || mode == ptr_mode);
2321
2322 /* In ILP32, the got entry is always of SImode size. Unlike
2323 small GOT, the dest is fixed at reg 0. */
2324 if (TARGET_ILP32)
2325 emit_insn (gen_tlsdesc_small_si (imm));
2326 else
2327 emit_insn (gen_tlsdesc_small_di (imm));
2328 tp = aarch64_load_tp (NULL);
2329
2330 if (mode != Pmode)
2331 tp = gen_lowpart (mode, tp);
2332
2333 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2334 if (REG_P (dest))
2335 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2336 return;
2337 }
2338
2339 case SYMBOL_SMALL_TLSIE:
2340 {
2341 /* In ILP32, the mode of dest can be either SImode or DImode,
2342 while the got entry is always of SImode size. The mode of
2343 dest depends on how dest is used: if dest is assigned to a
2344 pointer (e.g. in the memory), it has SImode; it may have
2345 DImode if dest is dereferenced to access the memory.
2346 This is why we have to handle three different tlsie_small
2347 patterns here (two patterns for ILP32). */
2348 machine_mode mode = GET_MODE (dest);
2349 rtx tmp_reg = gen_reg_rtx (mode);
2350 rtx tp = aarch64_load_tp (NULL);
2351
2352 if (mode == ptr_mode)
2353 {
2354 if (mode == DImode)
2355 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2356 else
2357 {
2358 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2359 tp = gen_lowpart (mode, tp);
2360 }
2361 }
2362 else
2363 {
2364 gcc_assert (mode == Pmode);
2365 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2366 }
2367
2368 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2369 if (REG_P (dest))
2370 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2371 return;
2372 }
2373
2374 case SYMBOL_TLSLE12:
2375 case SYMBOL_TLSLE24:
2376 case SYMBOL_TLSLE32:
2377 case SYMBOL_TLSLE48:
2378 {
2379 machine_mode mode = GET_MODE (dest);
2380 rtx tp = aarch64_load_tp (NULL);
2381
2382 if (mode != Pmode)
2383 tp = gen_lowpart (mode, tp);
2384
2385 switch (type)
2386 {
2387 case SYMBOL_TLSLE12:
2388 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2389 (dest, tp, imm));
2390 break;
2391 case SYMBOL_TLSLE24:
2392 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2393 (dest, tp, imm));
2394 break;
2395 case SYMBOL_TLSLE32:
2396 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2397 (dest, imm));
2398 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2399 (dest, dest, tp));
2400 break;
2401 case SYMBOL_TLSLE48:
2402 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2403 (dest, imm));
2404 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2405 (dest, dest, tp));
2406 break;
2407 default:
2408 gcc_unreachable ();
2409 }
2410
2411 if (REG_P (dest))
2412 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2413 return;
2414 }
2415
2416 case SYMBOL_TINY_GOT:
2417 emit_insn (gen_ldr_got_tiny (dest, imm));
2418 return;
2419
2420 case SYMBOL_TINY_TLSIE:
2421 {
2422 machine_mode mode = GET_MODE (dest);
2423 rtx tp = aarch64_load_tp (NULL);
2424
2425 if (mode == ptr_mode)
2426 {
2427 if (mode == DImode)
2428 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2429 else
2430 {
2431 tp = gen_lowpart (mode, tp);
2432 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2433 }
2434 }
2435 else
2436 {
2437 gcc_assert (mode == Pmode);
2438 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2439 }
2440
2441 if (REG_P (dest))
2442 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2443 return;
2444 }
2445
2446 default:
2447 gcc_unreachable ();
2448 }
2449 }
2450
2451 /* Emit a move from SRC to DEST. Assume that the move expanders can
2452 handle all moves if !can_create_pseudo_p (). The distinction is
2453 important because, unlike emit_move_insn, the move expanders know
2454 how to force Pmode objects into the constant pool even when the
2455 constant pool address is not itself legitimate. */
2456 static rtx
2457 aarch64_emit_move (rtx dest, rtx src)
2458 {
2459 return (can_create_pseudo_p ()
2460 ? emit_move_insn (dest, src)
2461 : emit_move_insn_1 (dest, src));
2462 }
2463
2464 /* Apply UNOPTAB to OP and store the result in DEST. */
2465
2466 static void
2467 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2468 {
2469 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2470 if (dest != tmp)
2471 emit_move_insn (dest, tmp);
2472 }
2473
2474 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2475
2476 static void
2477 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2478 {
2479 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2480 OPTAB_DIRECT);
2481 if (dest != tmp)
2482 emit_move_insn (dest, tmp);
2483 }
2484
2485 /* Split a 128-bit move operation into two 64-bit move operations,
2486 taking care to handle partial overlap of register to register
2487 copies. Special cases are needed when moving between GP regs and
2488 FP regs. SRC can be a register, constant or memory; DST a register
2489 or memory. If either operand is memory it must not have any side
2490 effects. */
2491 void
2492 aarch64_split_128bit_move (rtx dst, rtx src)
2493 {
2494 rtx dst_lo, dst_hi;
2495 rtx src_lo, src_hi;
2496
2497 machine_mode mode = GET_MODE (dst);
2498
2499 gcc_assert (mode == TImode || mode == TFmode);
2500 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2501 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2502
2503 if (REG_P (dst) && REG_P (src))
2504 {
2505 int src_regno = REGNO (src);
2506 int dst_regno = REGNO (dst);
2507
2508 /* Handle FP <-> GP regs. */
2509 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2510 {
2511 src_lo = gen_lowpart (word_mode, src);
2512 src_hi = gen_highpart (word_mode, src);
2513
2514 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2515 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2516 return;
2517 }
2518 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2519 {
2520 dst_lo = gen_lowpart (word_mode, dst);
2521 dst_hi = gen_highpart (word_mode, dst);
2522
2523 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2524 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2525 return;
2526 }
2527 }
2528
2529 dst_lo = gen_lowpart (word_mode, dst);
2530 dst_hi = gen_highpart (word_mode, dst);
2531 src_lo = gen_lowpart (word_mode, src);
2532 src_hi = gen_highpart_mode (word_mode, mode, src);
2533
2534 /* At most one pairing may overlap. */
2535 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2536 {
2537 aarch64_emit_move (dst_hi, src_hi);
2538 aarch64_emit_move (dst_lo, src_lo);
2539 }
2540 else
2541 {
2542 aarch64_emit_move (dst_lo, src_lo);
2543 aarch64_emit_move (dst_hi, src_hi);
2544 }
2545 }
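
/* To illustrate the overlap handling above with assumed registers:
   copying a TImode value from (x0, x1) to (x1, x2) has dst_lo == src_hi
   == x1, so the high halves are moved first (x2 = x1, then x1 = x0);
   copying from (x1, x2) to (x0, x1) has no such overlap, so the low
   halves are moved first.  */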
2546
2547 bool
2548 aarch64_split_128bit_move_p (rtx dst, rtx src)
2549 {
2550 return (! REG_P (src)
2551 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2552 }
2553
2554 /* Split a complex SIMD combine. */
2555
2556 void
2557 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2558 {
2559 machine_mode src_mode = GET_MODE (src1);
2560 machine_mode dst_mode = GET_MODE (dst);
2561
2562 gcc_assert (VECTOR_MODE_P (dst_mode));
2563 gcc_assert (register_operand (dst, dst_mode)
2564 && register_operand (src1, src_mode)
2565 && register_operand (src2, src_mode));
2566
2567 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2568 return;
2569 }
2570
2571 /* Split a complex SIMD move. */
2572
2573 void
2574 aarch64_split_simd_move (rtx dst, rtx src)
2575 {
2576 machine_mode src_mode = GET_MODE (src);
2577 machine_mode dst_mode = GET_MODE (dst);
2578
2579 gcc_assert (VECTOR_MODE_P (dst_mode));
2580
2581 if (REG_P (dst) && REG_P (src))
2582 {
2583 gcc_assert (VECTOR_MODE_P (src_mode));
2584 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2585 }
2586 }
2587
2588 bool
2589 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2590 machine_mode ymode, rtx y)
2591 {
2592 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2593 gcc_assert (r != NULL);
2594 return rtx_equal_p (x, r);
2595 }
2596
2597
2598 /* Return TARGET if it is nonnull and a register of mode MODE.
2599 Otherwise, return a fresh register of mode MODE if we can,
2600 or TARGET reinterpreted as MODE if we can't. */
2601
2602 static rtx
2603 aarch64_target_reg (rtx target, machine_mode mode)
2604 {
2605 if (target && REG_P (target) && GET_MODE (target) == mode)
2606 return target;
2607 if (!can_create_pseudo_p ())
2608 {
2609 gcc_assert (target);
2610 return gen_lowpart (mode, target);
2611 }
2612 return gen_reg_rtx (mode);
2613 }
2614
2615 /* Return a register that contains the constant in BUILDER, given that
2616 the constant is a legitimate move operand. Use TARGET as the register
2617 if it is nonnull and convenient. */
2618
2619 static rtx
2620 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2621 {
2622 rtx src = builder.build ();
2623 target = aarch64_target_reg (target, GET_MODE (src));
2624 emit_insn (gen_rtx_SET (target, src));
2625 return target;
2626 }
2627
2628 static rtx
2629 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2630 {
2631 if (can_create_pseudo_p ())
2632 return force_reg (mode, value);
2633 else
2634 {
2635 gcc_assert (x);
2636 aarch64_emit_move (x, value);
2637 return x;
2638 }
2639 }
2640
2641 /* Return true if predicate value X is a constant in which every element
2642 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2643 value, i.e. as a predicate in which all bits are significant. */
2644
2645 static bool
2646 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2647 {
2648 if (GET_CODE (x) != CONST_VECTOR)
2649 return false;
2650
2651 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2652 GET_MODE_NUNITS (GET_MODE (x)));
2653 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2654 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2655 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2656
2657 unsigned int nelts = const_vector_encoded_nelts (x);
2658 for (unsigned int i = 0; i < nelts; ++i)
2659 {
2660 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2661 if (!CONST_INT_P (elt))
2662 return false;
2663
2664 builder.quick_push (elt);
2665 for (unsigned int j = 1; j < factor; ++j)
2666 builder.quick_push (const0_rtx);
2667 }
2668 builder.finalize ();
2669 return true;
2670 }
2671
2672 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2673 widest predicate element size it can have (that is, the largest size
2674 for which each element would still be 0 or 1). */
2675
2676 unsigned int
2677 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2678 {
2679 /* Start with the most optimistic assumption: that we only need
2680 one bit per pattern. This is what we will use if only the first
2681 bit in each pattern is ever set. */
2682 unsigned int mask = GET_MODE_SIZE (DImode);
2683 mask |= builder.npatterns ();
2684
2685 /* Look for set bits. */
2686 unsigned int nelts = builder.encoded_nelts ();
2687 for (unsigned int i = 1; i < nelts; ++i)
2688 if (INTVAL (builder.elt (i)) != 0)
2689 {
2690 if (i & 1)
2691 return 1;
2692 mask |= i;
2693 }
2694 return mask & -mask;
2695 }
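
/* For example (assumed encoding): a constant built from four patterns
   whose encoded elements are { 1, 0, 0, 0 } never sets a bit at an odd
   index, so the loop leaves mask == 8 | 4 == 12 and the result is
   mask & -mask == 4, i.e. the predicate can be viewed as having 4-byte
   (.s) elements.  If index 2 were also set, mask would become 14 and
   the widest element size would be 2 bytes.  */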
2696
2697 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2698 that the constant would have with predicate element size ELT_SIZE
2699 (ignoring the upper bits in each element) and return:
2700
2701 * -1 if all bits are set
2702 * N if the predicate has N leading set bits followed by all clear bits
2703 * 0 if the predicate does not have any of these forms. */
2704
2705 int
2706 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2707 unsigned int elt_size)
2708 {
2709 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2710 followed by set bits. */
2711 if (builder.nelts_per_pattern () == 3)
2712 return 0;
2713
2714 /* Skip over leading set bits. */
2715 unsigned int nelts = builder.encoded_nelts ();
2716 unsigned int i = 0;
2717 for (; i < nelts; i += elt_size)
2718 if (INTVAL (builder.elt (i)) == 0)
2719 break;
2720 unsigned int vl = i / elt_size;
2721
2722 /* Check for the all-true case. */
2723 if (i == nelts)
2724 return -1;
2725
2726 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2727 repeating pattern of set bits followed by clear bits. */
2728 if (builder.nelts_per_pattern () != 2)
2729 return 0;
2730
2731 /* We have a "foreground" value and a duplicated "background" value.
2732 If the background might repeat and the last set bit belongs to it,
2733 we might have set bits followed by clear bits followed by set bits. */
2734 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2735 return 0;
2736
2737 /* Make sure that the rest are all clear. */
2738 for (; i < nelts; i += elt_size)
2739 if (INTVAL (builder.elt (i)) != 0)
2740 return 0;
2741
2742 return vl;
2743 }
2744
2745 /* See if there is an svpattern that encodes an SVE predicate of mode
2746 PRED_MODE in which the first VL bits are set and the rest are clear.
2747 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2748 A VL of -1 indicates an all-true vector. */
2749
2750 aarch64_svpattern
2751 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2752 {
2753 if (vl < 0)
2754 return AARCH64_SV_ALL;
2755
2756 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2757 return AARCH64_NUM_SVPATTERNS;
2758
2759 if (vl >= 1 && vl <= 8)
2760 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2761
2762 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2763 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2764
2765 int max_vl;
2766 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2767 {
2768 if (vl == (max_vl / 3) * 3)
2769 return AARCH64_SV_MUL3;
2770 /* These would only trigger for non-power-of-2 lengths. */
2771 if (vl == (max_vl & -4))
2772 return AARCH64_SV_MUL4;
2773 if (vl == (1 << floor_log2 (max_vl)))
2774 return AARCH64_SV_POW2;
2775 if (vl == max_vl)
2776 return AARCH64_SV_ALL;
2777 }
2778 return AARCH64_NUM_SVPATTERNS;
2779 }
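
/* For instance, with PRED_MODE == VNx16BImode a VL of 7 maps to
   AARCH64_SV_VL7, a VL of 16 maps to AARCH64_SV_VL16 and -1 maps to
   AARCH64_SV_ALL.  A VL of 32 is accepted only if the mode is known to
   have at least 32 elements; otherwise, and for VLs that match none of
   the patterns, the result is AARCH64_NUM_SVPATTERNS.  */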
2780
2781 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2782 bits has the lowest bit set and the upper bits clear. This is the
2783 VNx16BImode equivalent of a PTRUE for controlling elements of
2784 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2785 all bits are significant, even the upper zeros. */
2786
2787 rtx
2788 aarch64_ptrue_all (unsigned int elt_size)
2789 {
2790 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2791 builder.quick_push (const1_rtx);
2792 for (unsigned int i = 1; i < elt_size; ++i)
2793 builder.quick_push (const0_rtx);
2794 return builder.build ();
2795 }
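
/* For example, aarch64_ptrue_all (2) builds the VNx16BImode constant
   { 1, 0, 1, 0, ... }, which as a predicate controls .h elements, while
   aarch64_ptrue_all (4) builds { 1, 0, 0, 0, 1, 0, 0, 0, ... }, which
   controls .s elements.  */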
2796
2797 /* Return an all-true predicate register of mode MODE. */
2798
2799 rtx
2800 aarch64_ptrue_reg (machine_mode mode)
2801 {
2802 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2803 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2804 return gen_lowpart (mode, reg);
2805 }
2806
2807 /* Return an all-false predicate register of mode MODE. */
2808
2809 rtx
2810 aarch64_pfalse_reg (machine_mode mode)
2811 {
2812 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2813 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2814 return gen_lowpart (mode, reg);
2815 }
2816
2817 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2818 true, or alternatively if we know that the operation predicated by
2819 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
2820 aarch64_sve_gp_strictness operand that describes the operation
2821 predicated by PRED1[0]. */
2822
2823 bool
2824 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2825 {
2826 machine_mode mode = GET_MODE (pred2);
2827 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2828 && mode == GET_MODE (pred1[0])
2829 && aarch64_sve_gp_strictness (pred1[1], SImode));
2830 return (pred1[0] == CONSTM1_RTX (mode)
2831 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2832 || rtx_equal_p (pred1[0], pred2));
2833 }
2834
2835 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2836 for it. PRED2[0] is the predicate for the instruction whose result
2837 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2838 for it. Return true if we can prove that the two predicates are
2839 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2840 with PRED1[0] without changing behavior. */
2841
2842 bool
2843 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2844 {
2845 machine_mode mode = GET_MODE (pred1[0]);
2846 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2847 && mode == GET_MODE (pred2[0])
2848 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2849 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2850
2851 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2852 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2853 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2854 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2855 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2856 }
2857
2858 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
2859 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2860 Use TARGET as the target register if nonnull and convenient. */
2861
2862 static rtx
2863 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2864 machine_mode data_mode, rtx op1, rtx op2)
2865 {
2866 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2867 expand_operand ops[5];
2868 create_output_operand (&ops[0], target, pred_mode);
2869 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2870 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2871 create_input_operand (&ops[3], op1, data_mode);
2872 create_input_operand (&ops[4], op2, data_mode);
2873 expand_insn (icode, 5, ops);
2874 return ops[0].value;
2875 }
2876
2877 /* Use a comparison to convert integer vector SRC into MODE, which is
2878 the corresponding SVE predicate mode. Use TARGET for the result
2879 if it's nonnull and convenient. */
2880
2881 static rtx
2882 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2883 {
2884 machine_mode src_mode = GET_MODE (src);
2885 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2886 src, CONST0_RTX (src_mode));
2887 }
2888
2889 /* Return true if we can move VALUE into a register using a single
2890 CNT[BHWD] instruction. */
2891
2892 static bool
2893 aarch64_sve_cnt_immediate_p (poly_int64 value)
2894 {
2895 HOST_WIDE_INT factor = value.coeffs[0];
2896 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2897 return (value.coeffs[1] == factor
2898 && IN_RANGE (factor, 2, 16 * 16)
2899 && (factor & 1) == 0
2900 && factor <= 16 * (factor & -factor));
2901 }
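
/* To illustrate the test above: an SVE vector holds (16, 16) bytes, so a
   value of (32, 32) is accepted (factor 32 == 16 * 2, i.e. CNTB with
   MUL #2), whereas (34, 34) is rejected, since 34 == 2 * 17 would need a
   multiplier of 17 on CNTD, outside the [1, 16] range.  */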
2902
2903 /* Likewise for rtx X. */
2904
2905 bool
2906 aarch64_sve_cnt_immediate_p (rtx x)
2907 {
2908 poly_int64 value;
2909 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2910 }
2911
2912 /* Return the asm string for an instruction with a CNT-like vector size
2913 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2914 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2915 first part of the operands template (the part that comes before the
2916 vector size itself). PATTERN is the pattern to use. FACTOR is the
2917 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
2918 in each quadword. If it is zero, we can use any element size. */
2919
2920 static char *
2921 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2922 aarch64_svpattern pattern,
2923 unsigned int factor,
2924 unsigned int nelts_per_vq)
2925 {
2926 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
2927
2928 if (nelts_per_vq == 0)
2929 /* There is some overlap in the ranges of the four CNT instructions.
2930 Here we always use the smallest possible element size, so that the
2931 multiplier is 1 wherever possible. */
2932 nelts_per_vq = factor & -factor;
2933 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2934 gcc_assert (IN_RANGE (shift, 1, 4));
2935 char suffix = "dwhb"[shift - 1];
2936
2937 factor >>= shift;
2938 unsigned int written;
2939 if (pattern == AARCH64_SV_ALL && factor == 1)
2940 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2941 prefix, suffix, operands);
2942 else if (factor == 1)
2943 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
2944 prefix, suffix, operands, svpattern_token (pattern));
2945 else
2946 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
2947 prefix, suffix, operands, svpattern_token (pattern),
2948 factor);
2949 gcc_assert (written < sizeof (buffer));
2950 return buffer;
2951 }
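
/* Example outputs of the above (assuming svpattern_token (AARCH64_SV_ALL)
   is "all"): with PREFIX "cnt" and OPERANDS "%x0", a FACTOR of 2 prints
   as "cntd\t%x0", while a FACTOR of 32 prints as
   "cntb\t%x0, all, mul #2", using the byte form so that the multiplier
   stays within range.  */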
2952
2953 /* Return the asm string for an instruction with a CNT-like vector size
2954 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2955 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2956 first part of the operands template (the part that comes before the
2957 vector size itself). X is the value of the vector size operand,
2958 as a polynomial integer rtx; we need to convert this into an "all"
2959 pattern with a multiplier. */
2960
2961 char *
2962 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2963 rtx x)
2964 {
2965 poly_int64 value = rtx_to_poly_int64 (x);
2966 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2967 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
2968 value.coeffs[1], 0);
2969 }
2970
2971 /* Return true if we can add X using a single SVE INC or DEC instruction. */
2972
2973 bool
2974 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
2975 {
2976 poly_int64 value;
2977 return (poly_int_rtx_p (x, &value)
2978 && (aarch64_sve_cnt_immediate_p (value)
2979 || aarch64_sve_cnt_immediate_p (-value)));
2980 }
2981
2982 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
2983 operand 0. */
2984
2985 char *
2986 aarch64_output_sve_scalar_inc_dec (rtx offset)
2987 {
2988 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2989 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
2990 if (offset_value.coeffs[1] > 0)
2991 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
2992 offset_value.coeffs[1], 0);
2993 else
2994 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
2995 -offset_value.coeffs[1], 0);
2996 }
2997
2998 /* Return true if we can add VALUE to a register using a single ADDVL
2999 or ADDPL instruction. */
3000
3001 static bool
3002 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3003 {
3004 HOST_WIDE_INT factor = value.coeffs[0];
3005 if (factor == 0 || value.coeffs[1] != factor)
3006 return false;
3007 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3008 and a value of 16 is one vector width. */
3009 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3010 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3011 }
3012
3013 /* Likewise for rtx X. */
3014
3015 bool
3016 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3017 {
3018 poly_int64 value;
3019 return (poly_int_rtx_p (x, &value)
3020 && aarch64_sve_addvl_addpl_immediate_p (value));
3021 }
3022
3023 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3024 to operand 1 and storing the result in operand 0. */
3025
3026 char *
3027 aarch64_output_sve_addvl_addpl (rtx offset)
3028 {
3029 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3030 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3031 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3032
3033 int factor = offset_value.coeffs[1];
3034 if ((factor & 15) == 0)
3035 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3036 else
3037 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3038 return buffer;
3039 }
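
/* For example, an OFFSET of (16, 16) (one full vector) prints as
   "addvl\t%x0, %x1, #1", while an OFFSET of (-2, -2) (minus one
   predicate width) prints as "addpl\t%x0, %x1, #-1".  */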
3040
3041 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3042 instruction. If it is, store the number of elements in each vector
3043 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3044 factor in *FACTOR_OUT (if nonnull). */
3045
3046 bool
3047 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3048 unsigned int *nelts_per_vq_out)
3049 {
3050 rtx elt;
3051 poly_int64 value;
3052
3053 if (!const_vec_duplicate_p (x, &elt)
3054 || !poly_int_rtx_p (elt, &value))
3055 return false;
3056
3057 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3058 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3059 /* There's no vector INCB. */
3060 return false;
3061
3062 HOST_WIDE_INT factor = value.coeffs[0];
3063 if (value.coeffs[1] != factor)
3064 return false;
3065
3066 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3067 if ((factor % nelts_per_vq) != 0
3068 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3069 return false;
3070
3071 if (factor_out)
3072 *factor_out = factor;
3073 if (nelts_per_vq_out)
3074 *nelts_per_vq_out = nelts_per_vq;
3075 return true;
3076 }
3077
3078 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3079 instruction. */
3080
3081 bool
3082 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3083 {
3084 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3085 }
3086
3087 /* Return the asm template for an SVE vector INC or DEC instruction.
3088 OPERANDS gives the operands before the vector count and X is the
3089 value of the vector count operand itself. */
3090
3091 char *
3092 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3093 {
3094 int factor;
3095 unsigned int nelts_per_vq;
3096 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3097 gcc_unreachable ();
3098 if (factor < 0)
3099 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3100 -factor, nelts_per_vq);
3101 else
3102 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3103 factor, nelts_per_vq);
3104 }
3105
3106 static int
3107 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3108 scalar_int_mode mode)
3109 {
3110 int i;
3111 unsigned HOST_WIDE_INT val, val2, mask;
3112 int one_match, zero_match;
3113 int num_insns;
3114
3115 val = INTVAL (imm);
3116
3117 if (aarch64_move_imm (val, mode))
3118 {
3119 if (generate)
3120 emit_insn (gen_rtx_SET (dest, imm));
3121 return 1;
3122 }
3123
3124 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3125 (with XXXX non-zero). In that case check to see if the move can be done in
3126 a smaller mode. */
3127 val2 = val & 0xffffffff;
3128 if (mode == DImode
3129 && aarch64_move_imm (val2, SImode)
3130 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3131 {
3132 if (generate)
3133 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3134
3135 /* Check if we have to emit a second instruction by checking to see
3136 if any of the upper 32 bits of the original DI mode value is set. */
3137 if (val == val2)
3138 return 1;
3139
3140 i = (val >> 48) ? 48 : 32;
3141
3142 if (generate)
3143 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3144 GEN_INT ((val >> i) & 0xffff)));
3145
3146 return 2;
3147 }
3148
3149 if ((val >> 32) == 0 || mode == SImode)
3150 {
3151 if (generate)
3152 {
3153 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3154 if (mode == SImode)
3155 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3156 GEN_INT ((val >> 16) & 0xffff)));
3157 else
3158 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3159 GEN_INT ((val >> 16) & 0xffff)));
3160 }
3161 return 2;
3162 }
3163
3164 /* Remaining cases are all for DImode. */
3165
3166 mask = 0xffff;
3167 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3168 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3169 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3170 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3171
3172 if (zero_match != 2 && one_match != 2)
3173 {
3174 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3175 For a 64-bit bitmask try whether changing 16 bits to all ones or
3176 zeroes creates a valid bitmask. To check any repeated bitmask,
3177 try using 16 bits from the other 32-bit half of val. */
3178
3179 for (i = 0; i < 64; i += 16, mask <<= 16)
3180 {
3181 val2 = val & ~mask;
3182 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3183 break;
3184 val2 = val | mask;
3185 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3186 break;
3187 val2 = val2 & ~mask;
3188 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3189 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3190 break;
3191 }
3192 if (i != 64)
3193 {
3194 if (generate)
3195 {
3196 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3197 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3198 GEN_INT ((val >> i) & 0xffff)));
3199 }
3200 return 2;
3201 }
3202 }
3203
3204 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3205 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3206 otherwise skip zero bits. */
3207
3208 num_insns = 1;
3209 mask = 0xffff;
3210 val2 = one_match > zero_match ? ~val : val;
3211 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3212
3213 if (generate)
3214 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3215 ? (val | ~(mask << i))
3216 : (val & (mask << i)))));
3217 for (i += 16; i < 64; i += 16)
3218 {
3219 if ((val2 & (mask << i)) == 0)
3220 continue;
3221 if (generate)
3222 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3223 GEN_INT ((val >> i) & 0xffff)));
3224 num_insns ++;
3225 }
3226
3227 return num_insns;
3228 }
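
/* Two worked examples of the above (illustrative constants):

     0xffff000000001234 has a 32-bit low half that is a valid SImode
     immediate and only one nonzero 16-bit chunk in its high half, so it
     takes the early two-instruction path:
	mov	dest, #0x1234
	movk	dest, #0xffff, lsl #48

     0x1234567890abcdef matches none of the special cases and falls
     through to the generic mov/movk sequence (4 instructions):
	mov	dest, #0xcdef
	movk	dest, #0x90ab, lsl #16
	movk	dest, #0x5678, lsl #32
	movk	dest, #0x1234, lsl #48  */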
3229
3230 /* Return whether imm is a 128-bit immediate which is simple enough to
3231 expand inline. */
3232 bool
3233 aarch64_mov128_immediate (rtx imm)
3234 {
3235 if (GET_CODE (imm) == CONST_INT)
3236 return true;
3237
3238 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3239
3240 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3241 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3242
3243 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3244 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3245 }
3246
3247
3248 /* Return the number of temporary registers that aarch64_add_offset_1
3249 would need to add OFFSET to a register. */
3250
3251 static unsigned int
3252 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3253 {
3254 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3255 }
3256
3257 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3258 a non-polynomial OFFSET. MODE is the mode of the addition.
3259 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3260 be set and CFA adjustments added to the generated instructions.
3261
3262 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3263 temporary if register allocation is already complete. This temporary
3264 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3265 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3266 the immediate again.
3267
3268 Since this function may be used to adjust the stack pointer, we must
3269 ensure that it cannot cause transient stack deallocation (for example
3270 by first incrementing SP and then decrementing when adjusting by a
3271 large immediate). */
3272
3273 static void
3274 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3275 rtx src, HOST_WIDE_INT offset, rtx temp1,
3276 bool frame_related_p, bool emit_move_imm)
3277 {
3278 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3279 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3280
3281 HOST_WIDE_INT moffset = abs_hwi (offset);
3282 rtx_insn *insn;
3283
3284 if (!moffset)
3285 {
3286 if (!rtx_equal_p (dest, src))
3287 {
3288 insn = emit_insn (gen_rtx_SET (dest, src));
3289 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3290 }
3291 return;
3292 }
3293
3294 /* Single instruction adjustment. */
3295 if (aarch64_uimm12_shift (moffset))
3296 {
3297 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3298 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3299 return;
3300 }
3301
3302 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3303 and either:
3304
3305 a) the offset cannot be loaded by a 16-bit move or
3306 b) there is no spare register into which we can move it. */
3307 if (moffset < 0x1000000
3308 && ((!temp1 && !can_create_pseudo_p ())
3309 || !aarch64_move_imm (moffset, mode)))
3310 {
3311 HOST_WIDE_INT low_off = moffset & 0xfff;
3312
3313 low_off = offset < 0 ? -low_off : low_off;
3314 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3315 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3316 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3317 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3318 return;
3319 }
3320
3321 /* Emit a move immediate if required and an addition/subtraction. */
3322 if (emit_move_imm)
3323 {
3324 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3325 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3326 }
3327 insn = emit_insn (offset < 0
3328 ? gen_sub3_insn (dest, src, temp1)
3329 : gen_add3_insn (dest, src, temp1));
3330 if (frame_related_p)
3331 {
3332 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3333 rtx adj = plus_constant (mode, src, offset);
3334 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3335 }
3336 }
3337
3338 /* Return the number of temporary registers that aarch64_add_offset
3339 would need to move OFFSET into a register or add OFFSET to a register;
3340 ADD_P is true if we want the latter rather than the former. */
3341
3342 static unsigned int
3343 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3344 {
3345 /* This follows the same structure as aarch64_add_offset. */
3346 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3347 return 0;
3348
3349 unsigned int count = 0;
3350 HOST_WIDE_INT factor = offset.coeffs[1];
3351 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3352 poly_int64 poly_offset (factor, factor);
3353 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3354 /* Need one register for the ADDVL/ADDPL result. */
3355 count += 1;
3356 else if (factor != 0)
3357 {
3358 factor = abs (factor);
3359 if (factor > 16 * (factor & -factor))
3360 /* Need one register for the CNT result and one for the multiplication
3361 factor. If necessary, the second temporary can be reused for the
3362 constant part of the offset. */
3363 return 2;
3364 /* Need one register for the CNT result (which might then
3365 be shifted). */
3366 count += 1;
3367 }
3368 return count + aarch64_add_offset_1_temporaries (constant);
3369 }
3370
3371 /* If X can be represented as a poly_int64, return the number
3372 of temporaries that are required to add it to a register.
3373 Return -1 otherwise. */
3374
3375 int
3376 aarch64_add_offset_temporaries (rtx x)
3377 {
3378 poly_int64 offset;
3379 if (!poly_int_rtx_p (x, &offset))
3380 return -1;
3381 return aarch64_offset_temporaries (true, offset);
3382 }
3383
3384 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3385 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3386 be set and CFA adjustments added to the generated instructions.
3387
3388 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3389 temporary if register allocation is already complete. This temporary
3390 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3391 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3392 false to avoid emitting the immediate again.
3393
3394 TEMP2, if nonnull, is a second temporary register that doesn't
3395 overlap either DEST or SRC.
3396
3397 Since this function may be used to adjust the stack pointer, we must
3398 ensure that it cannot cause transient stack deallocation (for example
3399 by first incrementing SP and then decrementing when adjusting by a
3400 large immediate). */
3401
3402 static void
3403 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3404 poly_int64 offset, rtx temp1, rtx temp2,
3405 bool frame_related_p, bool emit_move_imm = true)
3406 {
3407 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3408 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3409 gcc_assert (temp1 == NULL_RTX
3410 || !frame_related_p
3411 || !reg_overlap_mentioned_p (temp1, dest));
3412 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3413
3414 /* Try using ADDVL or ADDPL to add the whole value. */
3415 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3416 {
3417 rtx offset_rtx = gen_int_mode (offset, mode);
3418 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3419 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3420 return;
3421 }
3422
3423 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3424 SVE vector register, over and above the minimum size of 128 bits.
3425 This is equivalent to half the value returned by CNTD with a
3426 vector shape of ALL. */
3427 HOST_WIDE_INT factor = offset.coeffs[1];
3428 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3429
3430 /* Try using ADDVL or ADDPL to add the VG-based part. */
3431 poly_int64 poly_offset (factor, factor);
3432 if (src != const0_rtx
3433 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3434 {
3435 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3436 if (frame_related_p)
3437 {
3438 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3439 RTX_FRAME_RELATED_P (insn) = true;
3440 src = dest;
3441 }
3442 else
3443 {
3444 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3445 src = aarch64_force_temporary (mode, temp1, addr);
3446 temp1 = temp2;
3447 temp2 = NULL_RTX;
3448 }
3449 }
3450 /* Otherwise use a CNT-based sequence. */
3451 else if (factor != 0)
3452 {
3453 /* Use a subtraction if we have a negative factor. */
3454 rtx_code code = PLUS;
3455 if (factor < 0)
3456 {
3457 factor = -factor;
3458 code = MINUS;
3459 }
3460
3461 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3462 into the multiplication. */
3463 rtx val;
3464 int shift = 0;
3465 if (factor & 1)
3466 /* Use a right shift by 1. */
3467 shift = -1;
3468 else
3469 factor /= 2;
3470 HOST_WIDE_INT low_bit = factor & -factor;
3471 if (factor <= 16 * low_bit)
3472 {
3473 if (factor > 16 * 8)
3474 {
3475 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3476 the value with the minimum multiplier and shift it into
3477 position. */
3478 int extra_shift = exact_log2 (low_bit);
3479 shift += extra_shift;
3480 factor >>= extra_shift;
3481 }
3482 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3483 }
3484 else
3485 {
3486 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3487 directly, since that should increase the chances of being
3488 able to use a shift and add sequence. If LOW_BIT itself
3489 is out of range, just use CNTD. */
3490 if (low_bit <= 16 * 8)
3491 factor /= low_bit;
3492 else
3493 low_bit = 1;
3494
3495 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3496 val = aarch64_force_temporary (mode, temp1, val);
3497
3498 if (can_create_pseudo_p ())
3499 {
3500 rtx coeff1 = gen_int_mode (factor, mode);
3501 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3502 }
3503 else
3504 {
3505 /* Go back to using a negative multiplication factor if we have
3506 no register from which to subtract. */
3507 if (code == MINUS && src == const0_rtx)
3508 {
3509 factor = -factor;
3510 code = PLUS;
3511 }
3512 rtx coeff1 = gen_int_mode (factor, mode);
3513 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3514 val = gen_rtx_MULT (mode, val, coeff1);
3515 }
3516 }
3517
3518 if (shift > 0)
3519 {
3520 /* Multiply by 1 << SHIFT. */
3521 val = aarch64_force_temporary (mode, temp1, val);
3522 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3523 }
3524 else if (shift == -1)
3525 {
3526 /* Divide by 2. */
3527 val = aarch64_force_temporary (mode, temp1, val);
3528 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3529 }
3530
3531 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3532 if (src != const0_rtx)
3533 {
3534 val = aarch64_force_temporary (mode, temp1, val);
3535 val = gen_rtx_fmt_ee (code, mode, src, val);
3536 }
3537 else if (code == MINUS)
3538 {
3539 val = aarch64_force_temporary (mode, temp1, val);
3540 val = gen_rtx_NEG (mode, val);
3541 }
3542
3543 if (constant == 0 || frame_related_p)
3544 {
3545 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3546 if (frame_related_p)
3547 {
3548 RTX_FRAME_RELATED_P (insn) = true;
3549 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3550 gen_rtx_SET (dest, plus_constant (Pmode, src,
3551 poly_offset)));
3552 }
3553 src = dest;
3554 if (constant == 0)
3555 return;
3556 }
3557 else
3558 {
3559 src = aarch64_force_temporary (mode, temp1, val);
3560 temp1 = temp2;
3561 temp2 = NULL_RTX;
3562 }
3563
3564 emit_move_imm = true;
3565 }
3566
3567 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3568 frame_related_p, emit_move_imm);
3569 }
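
/* A small example of the SVE handling above: an OFFSET of (48, 16),
   i.e. one SVE vector plus 32 bytes, has factor 16 and constant 32, so
   a frame-related stack adjustment expands to roughly:
	addvl	sp, sp, #1
	add	sp, sp, #32
   with both instructions marked frame-related.  */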
3570
3571 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3572 than a poly_int64. */
3573
3574 void
3575 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3576 rtx offset_rtx, rtx temp1, rtx temp2)
3577 {
3578 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3579 temp1, temp2, false);
3580 }
3581
3582 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3583 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3584 if TEMP1 already contains abs (DELTA). */
3585
3586 static inline void
3587 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3588 {
3589 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3590 temp1, temp2, true, emit_move_imm);
3591 }
3592
3593 /* Subtract DELTA from the stack pointer, marking the instructions
3594 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3595 if nonnull. */
3596
3597 static inline void
3598 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3599 bool emit_move_imm = true)
3600 {
3601 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3602 temp1, temp2, frame_related_p, emit_move_imm);
3603 }
3604
3605 /* Set DEST to (vec_series BASE STEP). */
3606
3607 static void
3608 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3609 {
3610 machine_mode mode = GET_MODE (dest);
3611 scalar_mode inner = GET_MODE_INNER (mode);
3612
3613 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3614 if (!aarch64_sve_index_immediate_p (base))
3615 base = force_reg (inner, base);
3616 if (!aarch64_sve_index_immediate_p (step))
3617 step = force_reg (inner, step);
3618
3619 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3620 }
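/* As an illustration, a (vec_series 0 1) in VNx4SImode gives the vector
   { 0, 1, 2, 3, ... } and is typically emitted as a single SVE INDEX
   instruction (e.g. "index z0.s, #0, #1"); the exact operand forms depend
   on whether BASE and STEP fit the [-16, 15] immediate range noted above.  */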
3621
3622 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3623 register of mode MODE. Use TARGET for the result if it's nonnull
3624 and convenient.
3625
3626 The two vector modes must have the same element mode. The behavior
3627 is to duplicate architectural lane N of SRC into architectural lanes
3628 N + I * STEP of the result. On big-endian targets, architectural
3629 lane 0 of an Advanced SIMD vector is the last element of the vector
3630 in memory layout, so for big-endian targets this operation has the
3631 effect of reversing SRC before duplicating it. Callers need to
3632 account for this. */
3633
3634 rtx
3635 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3636 {
3637 machine_mode src_mode = GET_MODE (src);
3638 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3639 insn_code icode = (BYTES_BIG_ENDIAN
3640 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3641 : code_for_aarch64_vec_duplicate_vq_le (mode));
3642
3643 unsigned int i = 0;
3644 expand_operand ops[3];
3645 create_output_operand (&ops[i++], target, mode);
3646 create_output_operand (&ops[i++], src, src_mode);
3647 if (BYTES_BIG_ENDIAN)
3648 {
3649 /* Create a PARALLEL describing the reversal of SRC. */
3650 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3651 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3652 nelts_per_vq - 1, -1);
3653 create_fixed_operand (&ops[i++], sel);
3654 }
3655 expand_insn (icode, i, ops);
3656 return ops[0].value;
3657 }
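/* For example, duplicating the V4SI vector { a, b, c, d } into a VNx4SI
   register yields { a, b, c, d, a, b, c, d, ... }, with each 128-bit
   quadword of the SVE register holding a copy of SRC; on big-endian
   targets the extra PARALLEL above selects the lanes in reverse, as
   described in the function comment.  */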
3658
3659 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3660 the memory image into DEST. Return true on success. */
3661
3662 static bool
3663 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3664 {
3665 src = force_const_mem (GET_MODE (src), src);
3666 if (!src)
3667 return false;
3668
3669 /* Make sure that the address is legitimate. */
3670 if (!aarch64_sve_ld1rq_operand_p (src))
3671 {
3672 rtx addr = force_reg (Pmode, XEXP (src, 0));
3673 src = replace_equiv_address (src, addr);
3674 }
3675
3676 machine_mode mode = GET_MODE (dest);
3677 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3678 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3679 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3680 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3681 return true;
3682 }
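/* Illustrative effect: for a VNx4SI destination this typically emits
   something like "ld1rqw { z0.s }, p0/z, [x0]", which loads one 128-bit
   block from memory and replicates it across every quadword of the
   destination vector under the all-true predicate built above.  */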
3683
3684 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3685 SVE data mode and isn't a legitimate constant. Use TARGET for the
3686 result if convenient.
3687
3688 The returned register can have whatever mode seems most natural
3689 given the contents of SRC. */
3690
3691 static rtx
3692 aarch64_expand_sve_const_vector (rtx target, rtx src)
3693 {
3694 machine_mode mode = GET_MODE (src);
3695 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3696 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3697 scalar_mode elt_mode = GET_MODE_INNER (mode);
3698 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3699 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3700
3701 if (nelts_per_pattern == 1 && encoded_bits == 128)
3702 {
3703 /* The constant is a duplicated quadword but can't be narrowed
3704 beyond a quadword. Get the memory image of the first quadword
3705 as a 128-bit vector and try using LD1RQ to load it from memory.
3706
3707 The effect for both endiannesses is to load memory lane N into
3708 architectural lanes N + I * STEP of the result. On big-endian
3709 targets, the layout of the 128-bit vector in an Advanced SIMD
3710 register would be different from its layout in an SVE register,
3711 but this 128-bit vector is a memory value only. */
3712 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3713 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3714 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3715 return target;
3716 }
3717
3718 if (nelts_per_pattern == 1 && encoded_bits < 128)
3719 {
3720 /* The vector is a repeating sequence of 64 bits or fewer.
3721 See if we can load them using an Advanced SIMD move and then
3722 duplicate it to fill a vector. This is better than using a GPR
3723 move because it keeps everything in the same register file. */
3724 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3725 rtx_vector_builder builder (vq_mode, npatterns, 1);
3726 for (unsigned int i = 0; i < npatterns; ++i)
3727 {
3728 /* We want memory lane N to go into architectural lane N,
3729 so reverse for big-endian targets. The DUP .Q pattern
3730 has a compensating reverse built-in. */
3731 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3732 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3733 }
3734 rtx vq_src = builder.build ();
3735 if (aarch64_simd_valid_immediate (vq_src, NULL))
3736 {
3737 vq_src = force_reg (vq_mode, vq_src);
3738 return aarch64_expand_sve_dupq (target, mode, vq_src);
3739 }
3740
3741 /* Get an integer representation of the repeating part of Advanced
3742 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3743 which for big-endian targets is lane-swapped wrt a normal
3744 Advanced SIMD vector. This means that for both endiannesses,
3745 memory lane N of SVE vector SRC corresponds to architectural
3746 lane N of a register holding VQ_SRC. This in turn means that
3747 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3748 as a single 128-bit value) and thus that memory lane 0 of SRC is
3749 in the lsb of the integer. Duplicating the integer therefore
3750 ensures that memory lane N of SRC goes into architectural lane
3751 N + I * STEP of the SVE register. */
3752 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3753 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3754 if (elt_value)
3755 {
3756 /* Pretend that we had a vector of INT_MODE to start with. */
3757 elt_mode = int_mode;
3758 mode = aarch64_full_sve_mode (int_mode).require ();
3759
3760 /* If the integer can be moved into a general register by a
3761 single instruction, do that and duplicate the result. */
3762 if (CONST_INT_P (elt_value)
3763 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3764 {
3765 elt_value = force_reg (elt_mode, elt_value);
3766 return expand_vector_broadcast (mode, elt_value);
3767 }
3768 }
3769 else if (npatterns == 1)
3770 /* We're duplicating a single value, but can't do better than
3771 force it to memory and load from there. This handles things
3772 like symbolic constants. */
3773 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3774
3775 if (elt_value)
3776 {
3777 /* Load the element from memory if we can, otherwise move it into
3778 a register and use a DUP. */
3779 rtx op = force_const_mem (elt_mode, elt_value);
3780 if (!op)
3781 op = force_reg (elt_mode, elt_value);
3782 return expand_vector_broadcast (mode, op);
3783 }
3784 }
3785
3786 /* Try using INDEX. */
3787 rtx base, step;
3788 if (const_vec_series_p (src, &base, &step))
3789 {
3790 aarch64_expand_vec_series (target, base, step);
3791 return target;
3792 }
3793
3794 /* From here on, it's better to force the whole constant to memory
3795 if we can. */
3796 if (GET_MODE_NUNITS (mode).is_constant ())
3797 return NULL_RTX;
3798
3799 /* Expand each pattern individually. */
3800 gcc_assert (npatterns > 1);
3801 rtx_vector_builder builder;
3802 auto_vec<rtx, 16> vectors (npatterns);
3803 for (unsigned int i = 0; i < npatterns; ++i)
3804 {
3805 builder.new_vector (mode, 1, nelts_per_pattern);
3806 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3807 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3808 vectors.quick_push (force_reg (mode, builder.build ()));
3809 }
3810
3811 /* Use permutes to interleave the separate vectors. */
3812 while (npatterns > 1)
3813 {
3814 npatterns /= 2;
3815 for (unsigned int i = 0; i < npatterns; ++i)
3816 {
3817 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3818 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3819 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3820 vectors[i] = tmp;
3821 }
3822 }
3823 gcc_assert (vectors[0] == target);
3824 return target;
3825 }
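/* Worked example of the interleaving step above: with four patterns
   P0, P1, P2 and P3 (each broadcast into its own register), the first
   round produces ZIP1 (P0, P2) = { p0[0], p2[0], p0[1], p2[1], ... }
   and ZIP1 (P1, P3) = { p1[0], p3[0], p1[1], p3[1], ... }, and the
   final round ZIP1s those two results into
   { p0[0], p1[0], p2[0], p3[0], p0[1], ... }, which is the original
   element order of SRC.  */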
3826
3827 /* Use WHILE to set a predicate register of mode MODE in which the first
3828 VL bits are set and the rest are clear. Use TARGET for the register
3829 if it's nonnull and convenient. */
3830
3831 static rtx
3832 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3833 unsigned int vl)
3834 {
3835 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3836 target = aarch64_target_reg (target, mode);
3837 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3838 return target;
3839 }
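/* For example, VL == 3 with a .S predicate mode typically becomes
   "whilelo p0.s, xzr, x<n>" with x<n> holding 3, which sets the first
   three predicate elements and clears the rest.  Here x<n> is just an
   illustrative name for the register holding LIMIT above.  */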
3840
3841 static rtx
3842 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3843
3844 /* BUILDER is a constant predicate in which the index of every set bit
3845 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3846 by inverting every element at a multiple of ELT_SIZE and EORing the
3847 result with an ELT_SIZE PTRUE.
3848
3849 Return a register that contains the constant on success, otherwise
3850 return null. Use TARGET as the register if it is nonnull and
3851 convenient. */
3852
3853 static rtx
3854 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3855 unsigned int elt_size)
3856 {
3857 /* Invert every element at a multiple of ELT_SIZE, keeping the
3858 other bits zero. */
3859 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3860 builder.nelts_per_pattern ());
3861 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3862 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3863 inv_builder.quick_push (const1_rtx);
3864 else
3865 inv_builder.quick_push (const0_rtx);
3866 inv_builder.finalize ();
3867
3868 /* See if we can load the constant cheaply. */
3869 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3870 if (!inv)
3871 return NULL_RTX;
3872
3873 /* EOR the result with an ELT_SIZE PTRUE. */
3874 rtx mask = aarch64_ptrue_all (elt_size);
3875 mask = force_reg (VNx16BImode, mask);
3876 target = aarch64_target_reg (target, VNx16BImode);
3877 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3878 return target;
3879 }
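/* Worked example: for the byte predicate { 0, 1, 1, 1, ... } (every
   element set except the first), the inverted constant built above is
   { 1, 0, 0, 0, ... }, which can be loaded cheaply (e.g. as a VL1
   PTRUE); EORing that with an all-true .B predicate then recreates
   the original constant.  */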
3880
3881 /* BUILDER is a constant predicate in which the index of every set bit
3882 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3883 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3884 register on success, otherwise return null. Use TARGET as the register
3885 if nonnull and convenient. */
3886
3887 static rtx
3888 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3889 unsigned int elt_size,
3890 unsigned int permute_size)
3891 {
3892 /* We're going to split the constant into two new constants A and B,
3893 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3894 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3895
3896 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3897 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3898
3899 where _ indicates elements that will be discarded by the permute.
3900
3901 First calculate the ELT_SIZEs for A and B. */
3902 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3903 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3904 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3905 if (INTVAL (builder.elt (i)) != 0)
3906 {
3907 if (i & permute_size)
3908 b_elt_size |= i - permute_size;
3909 else
3910 a_elt_size |= i;
3911 }
3912 a_elt_size &= -a_elt_size;
3913 b_elt_size &= -b_elt_size;
3914
3915 /* Now construct the vectors themselves. */
3916 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3917 builder.nelts_per_pattern ());
3918 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3919 builder.nelts_per_pattern ());
3920 unsigned int nelts = builder.encoded_nelts ();
3921 for (unsigned int i = 0; i < nelts; ++i)
3922 if (i & (elt_size - 1))
3923 {
3924 a_builder.quick_push (const0_rtx);
3925 b_builder.quick_push (const0_rtx);
3926 }
3927 else if ((i & permute_size) == 0)
3928 {
3929 /* The A and B elements are significant. */
3930 a_builder.quick_push (builder.elt (i));
3931 b_builder.quick_push (builder.elt (i + permute_size));
3932 }
3933 else
3934 {
3935 /* The A and B elements are going to be discarded, so pick whatever
3936 is likely to give a nice constant. We are targeting element
3937 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3938 with the aim of each being a sequence of ones followed by
3939 a sequence of zeros. So:
3940
3941 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3942 duplicate the last X_ELT_SIZE element, to extend the
3943 current sequence of ones or zeros.
3944
3945 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3946 zero, so that the constant really does have X_ELT_SIZE and
3947 not a smaller size. */
3948 if (a_elt_size > permute_size)
3949 a_builder.quick_push (const0_rtx);
3950 else
3951 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3952 if (b_elt_size > permute_size)
3953 b_builder.quick_push (const0_rtx);
3954 else
3955 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3956 }
3957 a_builder.finalize ();
3958 b_builder.finalize ();
3959
3960 /* Try loading A into a register. */
3961 rtx_insn *last = get_last_insn ();
3962 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3963 if (!a)
3964 return NULL_RTX;
3965
3966 /* Try loading B into a register. */
3967 rtx b = a;
3968 if (a_builder != b_builder)
3969 {
3970 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3971 if (!b)
3972 {
3973 delete_insns_since (last);
3974 return NULL_RTX;
3975 }
3976 }
3977
3978 /* Emit the TRN1 itself. */
3979 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
3980 target = aarch64_target_reg (target, mode);
3981 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
3982 gen_lowpart (mode, a),
3983 gen_lowpart (mode, b)));
3984 return target;
3985 }
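/* Continuing the example in the function comment (PERMUTE_SIZE == 4,
   ELT_SIZE == 1): a TRN1 with .S granularity takes the even-numbered
   4-byte groups of A and B alternately, giving
   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   i.e. the original constant that A and B were split from.  */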
3986
3987 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3988 constant in BUILDER into an SVE predicate register. Return the register
3989 on success, otherwise return null. Use TARGET for the register if
3990 nonnull and convenient.
3991
3992 ALLOW_RECURSE_P is true if we can use methods that would call this
3993 function recursively. */
3994
3995 static rtx
3996 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
3997 bool allow_recurse_p)
3998 {
3999 if (builder.encoded_nelts () == 1)
4000 /* A PFALSE or a PTRUE .B ALL. */
4001 return aarch64_emit_set_immediate (target, builder);
4002
4003 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4004 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4005 {
4006 /* If we can load the constant using PTRUE, use it as-is. */
4007 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4008 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4009 return aarch64_emit_set_immediate (target, builder);
4010
4011 /* Otherwise use WHILE to set the first VL bits. */
4012 return aarch64_sve_move_pred_via_while (target, mode, vl);
4013 }
4014
4015 if (!allow_recurse_p)
4016 return NULL_RTX;
4017
4018 /* Try inverting the vector in element size ELT_SIZE and then EORing
4019 the result with an ELT_SIZE PTRUE. */
4020 if (INTVAL (builder.elt (0)) == 0)
4021 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4022 elt_size))
4023 return res;
4024
4025 /* Try using TRN1 to permute two simpler constants. */
4026 for (unsigned int i = elt_size; i <= 8; i *= 2)
4027 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4028 elt_size, i))
4029 return res;
4030
4031 return NULL_RTX;
4032 }
4033
4034 /* Return an SVE predicate register that contains the VNx16BImode
4035 constant in BUILDER, without going through the move expanders.
4036
4037 The returned register can have whatever mode seems most natural
4038 given the contents of BUILDER. Use TARGET for the result if
4039 convenient. */
4040
4041 static rtx
4042 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4043 {
4044 /* Try loading the constant using pure predicate operations. */
4045 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4046 return res;
4047
4048 /* Try forcing the constant to memory. */
4049 if (builder.full_nelts ().is_constant ())
4050 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4051 {
4052 target = aarch64_target_reg (target, VNx16BImode);
4053 emit_move_insn (target, mem);
4054 return target;
4055 }
4056
4057 /* The last resort is to load the constant as an integer and then
4058 compare it against zero. Use -1 for set bits in order to increase
4059 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4060 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4061 builder.nelts_per_pattern ());
4062 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4063 int_builder.quick_push (INTVAL (builder.elt (i))
4064 ? constm1_rtx : const0_rtx);
4065 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4066 int_builder.build ());
4067 }
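/* As an illustration of the last-resort path: a byte predicate such as
   { 1, 0, 1, 0, ... } is rebuilt as the VNx16QI data vector
   { -1, 0, -1, 0, ... } (hence the use of -1 for set bits above) and
   then converted back to a predicate by comparing that vector against
   zero.  */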
4068
4069 /* Set DEST to immediate IMM. */
4070
4071 void
4072 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4073 {
4074 machine_mode mode = GET_MODE (dest);
4075
4076 /* Check on what type of symbol it is. */
4077 scalar_int_mode int_mode;
4078 if ((GET_CODE (imm) == SYMBOL_REF
4079 || GET_CODE (imm) == LABEL_REF
4080 || GET_CODE (imm) == CONST
4081 || GET_CODE (imm) == CONST_POLY_INT)
4082 && is_a <scalar_int_mode> (mode, &int_mode))
4083 {
4084 rtx mem;
4085 poly_int64 offset;
4086 HOST_WIDE_INT const_offset;
4087 enum aarch64_symbol_type sty;
4088
4089 /* If we have (const (plus symbol offset)), separate out the offset
4090 before we start classifying the symbol. */
4091 rtx base = strip_offset (imm, &offset);
4092
4093 /* We must always add an offset involving VL separately, rather than
4094 folding it into the relocation. */
4095 if (!offset.is_constant (&const_offset))
4096 {
4097 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4098 emit_insn (gen_rtx_SET (dest, imm));
4099 else
4100 {
4101 /* Do arithmetic on 32-bit values if the result is smaller
4102 than that. */
4103 if (partial_subreg_p (int_mode, SImode))
4104 {
4105 /* It is invalid to do symbol calculations in modes
4106 narrower than SImode. */
4107 gcc_assert (base == const0_rtx);
4108 dest = gen_lowpart (SImode, dest);
4109 int_mode = SImode;
4110 }
4111 if (base != const0_rtx)
4112 {
4113 base = aarch64_force_temporary (int_mode, dest, base);
4114 aarch64_add_offset (int_mode, dest, base, offset,
4115 NULL_RTX, NULL_RTX, false);
4116 }
4117 else
4118 aarch64_add_offset (int_mode, dest, base, offset,
4119 dest, NULL_RTX, false);
4120 }
4121 return;
4122 }
4123
4124 sty = aarch64_classify_symbol (base, const_offset);
4125 switch (sty)
4126 {
4127 case SYMBOL_FORCE_TO_MEM:
4128 if (const_offset != 0
4129 && targetm.cannot_force_const_mem (int_mode, imm))
4130 {
4131 gcc_assert (can_create_pseudo_p ());
4132 base = aarch64_force_temporary (int_mode, dest, base);
4133 aarch64_add_offset (int_mode, dest, base, const_offset,
4134 NULL_RTX, NULL_RTX, false);
4135 return;
4136 }
4137
4138 mem = force_const_mem (ptr_mode, imm);
4139 gcc_assert (mem);
4140
4141 /* If we aren't generating PC relative literals, then
4142 we need to expand the literal pool access carefully.
4143 This is something that needs to be done in a number
4144 of places, so could well live as a separate function. */
4145 if (!aarch64_pcrelative_literal_loads)
4146 {
4147 gcc_assert (can_create_pseudo_p ());
4148 base = gen_reg_rtx (ptr_mode);
4149 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4150 if (ptr_mode != Pmode)
4151 base = convert_memory_address (Pmode, base);
4152 mem = gen_rtx_MEM (ptr_mode, base);
4153 }
4154
4155 if (int_mode != ptr_mode)
4156 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4157
4158 emit_insn (gen_rtx_SET (dest, mem));
4159
4160 return;
4161
4162 case SYMBOL_SMALL_TLSGD:
4163 case SYMBOL_SMALL_TLSDESC:
4164 case SYMBOL_SMALL_TLSIE:
4165 case SYMBOL_SMALL_GOT_28K:
4166 case SYMBOL_SMALL_GOT_4G:
4167 case SYMBOL_TINY_GOT:
4168 case SYMBOL_TINY_TLSIE:
4169 if (const_offset != 0)
4170 {
4171 gcc_assert (can_create_pseudo_p ());
4172 base = aarch64_force_temporary (int_mode, dest, base);
4173 aarch64_add_offset (int_mode, dest, base, const_offset,
4174 NULL_RTX, NULL_RTX, false);
4175 return;
4176 }
4177 /* FALLTHRU */
4178
4179 case SYMBOL_SMALL_ABSOLUTE:
4180 case SYMBOL_TINY_ABSOLUTE:
4181 case SYMBOL_TLSLE12:
4182 case SYMBOL_TLSLE24:
4183 case SYMBOL_TLSLE32:
4184 case SYMBOL_TLSLE48:
4185 aarch64_load_symref_appropriately (dest, imm, sty);
4186 return;
4187
4188 default:
4189 gcc_unreachable ();
4190 }
4191 }
4192
4193 if (!CONST_INT_P (imm))
4194 {
4195 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4196 {
4197 /* Only the low bit of each .H, .S and .D element is defined,
4198 so we can set the upper bits to whatever we like. If the
4199 predicate is all-true in MODE, prefer to set all the undefined
4200 bits as well, so that we can share a single .B predicate for
4201 all modes. */
4202 if (imm == CONSTM1_RTX (mode))
4203 imm = CONSTM1_RTX (VNx16BImode);
4204
4205 /* All methods for constructing predicate modes wider than VNx16BI
4206 will set the upper bits of each element to zero. Expose this
4207 by moving such constants as a VNx16BI, so that all bits are
4208 significant and so that constants for different modes can be
4209 shared. The wider constant will still be available as a
4210 REG_EQUAL note. */
4211 rtx_vector_builder builder;
4212 if (aarch64_get_sve_pred_bits (builder, imm))
4213 {
4214 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4215 if (dest != res)
4216 emit_move_insn (dest, gen_lowpart (mode, res));
4217 return;
4218 }
4219 }
4220
4221 if (GET_CODE (imm) == HIGH
4222 || aarch64_simd_valid_immediate (imm, NULL))
4223 {
4224 emit_insn (gen_rtx_SET (dest, imm));
4225 return;
4226 }
4227
4228 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4229 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4230 {
4231 if (dest != res)
4232 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4233 return;
4234 }
4235
4236 rtx mem = force_const_mem (mode, imm);
4237 gcc_assert (mem);
4238 emit_move_insn (dest, mem);
4239 return;
4240 }
4241
4242 aarch64_internal_mov_immediate (dest, imm, true,
4243 as_a <scalar_int_mode> (mode));
4244 }
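/* For example, moving the address of a symbol plus a VL-dependent
   offset such as "sym + 2 * VL" cannot use a single relocation, so the
   code above loads the symbol first and then adds the poly_int64 part
   via aarch64_add_offset; purely constant offsets instead go through
   aarch64_classify_symbol and the relocation-specific cases.  "sym"
   here is just a placeholder name for illustration.  */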
4245
4246 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4247 that is known to contain PTRUE. */
4248
4249 void
4250 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4251 {
4252 expand_operand ops[3];
4253 machine_mode mode = GET_MODE (dest);
4254 create_output_operand (&ops[0], dest, mode);
4255 create_input_operand (&ops[1], pred, GET_MODE (pred));
4256 create_input_operand (&ops[2], src, mode);
4257 temporary_volatile_ok v (true);
4258 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4259 }
4260
4261 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4262 operand is in memory. In this case we need to use the predicated LD1
4263 and ST1 instead of LDR and STR, both for correctness on big-endian
4264 targets and because LD1 and ST1 support a wider range of addressing modes.
4265 PRED_MODE is the mode of the predicate.
4266
4267 See the comment at the head of aarch64-sve.md for details about the
4268 big-endian handling. */
4269
4270 void
4271 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4272 {
4273 machine_mode mode = GET_MODE (dest);
4274 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4275 if (!register_operand (src, mode)
4276 && !register_operand (dest, mode))
4277 {
4278 rtx tmp = gen_reg_rtx (mode);
4279 if (MEM_P (src))
4280 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4281 else
4282 emit_move_insn (tmp, src);
4283 src = tmp;
4284 }
4285 aarch64_emit_sve_pred_move (dest, ptrue, src);
4286 }
4287
4288 /* Called only on big-endian targets. See whether an SVE vector move
4289 from SRC to DEST is effectively a REV[BHW] instruction, because at
4290 least one operand is a subreg of an SVE vector that has wider or
4291 narrower elements. Return true and emit the instruction if so.
4292
4293 For example:
4294
4295 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4296
4297 represents a VIEW_CONVERT between the following vectors, viewed
4298 in memory order:
4299
4300 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4301 R1: { [0], [1], [2], [3], ... }
4302
4303 The high part of lane X in R2 should therefore correspond to lane X*2
4304 of R1, but the register representations are:
4305
4306 msb lsb
4307 R2: ...... [1].high [1].low [0].high [0].low
4308 R1: ...... [3] [2] [1] [0]
4309
4310 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4311 We therefore need a reverse operation to swap the high and low values
4312 around.
4313
4314 This is purely an optimization. Without it we would spill the
4315 subreg operand to the stack in one mode and reload it in the
4316 other mode, which has the same effect as the REV. */
4317
4318 bool
4319 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4320 {
4321 gcc_assert (BYTES_BIG_ENDIAN);
4322 if (GET_CODE (dest) == SUBREG)
4323 dest = SUBREG_REG (dest);
4324 if (GET_CODE (src) == SUBREG)
4325 src = SUBREG_REG (src);
4326
4327 /* The optimization handles two single SVE REGs with different element
4328 sizes. */
4329 if (!REG_P (dest)
4330 || !REG_P (src)
4331 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4332 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4333 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4334 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4335 return false;
4336
4337 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4338 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4339 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4340 UNSPEC_REV_SUBREG);
4341 emit_insn (gen_rtx_SET (dest, unspec));
4342 return true;
4343 }
4344
4345 /* Return a copy of X with mode MODE, without changing its other
4346 attributes. Unlike gen_lowpart, this doesn't care whether the
4347 mode change is valid. */
4348
4349 static rtx
4350 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4351 {
4352 if (GET_MODE (x) == mode)
4353 return x;
4354
4355 x = shallow_copy_rtx (x);
4356 set_mode_and_regno (x, mode, REGNO (x));
4357 return x;
4358 }
4359
4360 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4361 stored in wider integer containers. */
4362
4363 static unsigned int
4364 aarch64_sve_rev_unspec (machine_mode mode)
4365 {
4366 switch (GET_MODE_UNIT_SIZE (mode))
4367 {
4368 case 1: return UNSPEC_REVB;
4369 case 2: return UNSPEC_REVH;
4370 case 4: return UNSPEC_REVW;
4371 }
4372 gcc_unreachable ();
4373 }
4374
4375 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4376 operands. */
4377
4378 void
4379 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4380 {
4381 /* Decide which REV operation we need. The mode with wider elements
4382 determines the mode of the operands and the mode with the narrower
4383 elements determines the reverse width. */
4384 machine_mode mode_with_wider_elts = GET_MODE (dest);
4385 machine_mode mode_with_narrower_elts = GET_MODE (src);
4386 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4387 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4388 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4389
4390 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4391 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4392 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4393
4394 /* Get the operands in the appropriate modes and emit the instruction. */
4395 ptrue = gen_lowpart (pred_mode, ptrue);
4396 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4397 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4398 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4399 dest, ptrue, src));
4400 }
4401
4402 static bool
4403 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4404 tree exp ATTRIBUTE_UNUSED)
4405 {
4406 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4407 return false;
4408
4409 return true;
4410 }
4411
4412 /* Implement TARGET_PASS_BY_REFERENCE. */
4413
4414 static bool
4415 aarch64_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
4416 {
4417 HOST_WIDE_INT size;
4418 machine_mode dummymode;
4419 int nregs;
4420
4421 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4422 if (arg.mode == BLKmode && arg.type)
4423 size = int_size_in_bytes (arg.type);
4424 else
4425 /* No frontends can create types with variable-sized modes, so we
4426 shouldn't be asked to pass or return them. */
4427 size = GET_MODE_SIZE (arg.mode).to_constant ();
4428
4429 /* Aggregates are passed by reference based on their size. */
4430 if (arg.aggregate_type_p ())
4431 size = int_size_in_bytes (arg.type);
4432
4433 /* Variable-sized arguments are always passed by reference. */
4434 if (size < 0)
4435 return true;
4436
4437 /* Can this be a candidate to be passed in fp/simd register(s)? */
4438 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4439 &dummymode, &nregs,
4440 NULL))
4441 return false;
4442
4443 /* Arguments which are variable sized or larger than 2 registers are
4444 passed by reference unless they are a homogeneous floating-point
4445 aggregate. */
4446 return size > 2 * UNITS_PER_WORD;
4447 }
4448
4449 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4450 static bool
4451 aarch64_return_in_msb (const_tree valtype)
4452 {
4453 machine_mode dummy_mode;
4454 int dummy_int;
4455
4456 /* Never happens in little-endian mode. */
4457 if (!BYTES_BIG_ENDIAN)
4458 return false;
4459
4460 /* Only composite types smaller than or equal to 16 bytes can
4461 be potentially returned in registers. */
4462 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4463 || int_size_in_bytes (valtype) <= 0
4464 || int_size_in_bytes (valtype) > 16)
4465 return false;
4466
4467 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4468 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4469 is always passed/returned in the least significant bits of fp/simd
4470 register(s). */
4471 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4472 &dummy_mode, &dummy_int, NULL))
4473 return false;
4474
4475 return true;
4476 }
4477
4478 /* Implement TARGET_FUNCTION_VALUE.
4479 Define how to find the value returned by a function. */
4480
4481 static rtx
4482 aarch64_function_value (const_tree type, const_tree func,
4483 bool outgoing ATTRIBUTE_UNUSED)
4484 {
4485 machine_mode mode;
4486 int unsignedp;
4487 int count;
4488 machine_mode ag_mode;
4489
4490 mode = TYPE_MODE (type);
4491 if (INTEGRAL_TYPE_P (type))
4492 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4493
4494 if (aarch64_return_in_msb (type))
4495 {
4496 HOST_WIDE_INT size = int_size_in_bytes (type);
4497
4498 if (size % UNITS_PER_WORD != 0)
4499 {
4500 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4501 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4502 }
4503 }
4504
4505 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4506 &ag_mode, &count, NULL))
4507 {
4508 if (!aarch64_composite_type_p (type, mode))
4509 {
4510 gcc_assert (count == 1 && mode == ag_mode);
4511 return gen_rtx_REG (mode, V0_REGNUM);
4512 }
4513 else
4514 {
4515 int i;
4516 rtx par;
4517
4518 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4519 for (i = 0; i < count; i++)
4520 {
4521 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4522 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4523 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4524 XVECEXP (par, 0, i) = tmp;
4525 }
4526 return par;
4527 }
4528 }
4529 else
4530 return gen_rtx_REG (mode, R0_REGNUM);
4531 }
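/* For example, a homogeneous floating-point aggregate of four floats is
   returned in s0-s3: COUNT is 4, AG_MODE is SFmode, and the PARALLEL
   built above lists V0_REGNUM..V0_REGNUM + 3 with byte offsets 0, 4, 8
   and 12 within the value.  */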
4532
4533 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4534 Return true if REGNO is the number of a hard register in which the values
4535 of called function may come back. */
4536
4537 static bool
4538 aarch64_function_value_regno_p (const unsigned int regno)
4539 {
4540 /* Maximum of 16 bytes can be returned in the general registers. Examples
4541 of 16-byte return values are: 128-bit integers and 16-byte small
4542 structures (excluding homogeneous floating-point aggregates). */
4543 if (regno == R0_REGNUM || regno == R1_REGNUM)
4544 return true;
4545
4546 /* Up to four fp/simd registers can return a function value, e.g. a
4547 homogeneous floating-point aggregate having four members. */
4548 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4549 return TARGET_FLOAT;
4550
4551 return false;
4552 }
4553
4554 /* Implement TARGET_RETURN_IN_MEMORY.
4555
4556 If the type T of the result of a function is such that
4557 void func (T arg)
4558 would require that arg be passed as a value in a register (or set of
4559 registers) according to the parameter passing rules, then the result
4560 is returned in the same registers as would be used for such an
4561 argument. */
4562
4563 static bool
4564 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4565 {
4566 HOST_WIDE_INT size;
4567 machine_mode ag_mode;
4568 int count;
4569
4570 if (!AGGREGATE_TYPE_P (type)
4571 && TREE_CODE (type) != COMPLEX_TYPE
4572 && TREE_CODE (type) != VECTOR_TYPE)
4573 /* Simple scalar types are always returned in registers. */
4574 return false;
4575
4576 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4577 type,
4578 &ag_mode,
4579 &count,
4580 NULL))
4581 return false;
4582
4583 /* Types larger than 2 registers are returned in memory. */
4584 size = int_size_in_bytes (type);
4585 return (size < 0 || size > 2 * UNITS_PER_WORD);
4586 }
4587
4588 static bool
4589 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4590 const_tree type, int *nregs)
4591 {
4592 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4593 return aarch64_vfp_is_call_or_return_candidate (mode,
4594 type,
4595 &pcum->aapcs_vfp_rmode,
4596 nregs,
4597 NULL);
4598 }
4599
4600 /* Given MODE and TYPE of a function argument, return the alignment in
4601 bits. The idea is to suppress any stronger alignment requested by
4602 the user and opt for the natural alignment (specified in AAPCS64 \S
4603 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4604 calculated in versions of GCC prior to GCC-9. This is a helper
4605 function for local use only. */
4606
4607 static unsigned int
4608 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4609 bool *abi_break)
4610 {
4611 *abi_break = false;
4612 if (!type)
4613 return GET_MODE_ALIGNMENT (mode);
4614
4615 if (integer_zerop (TYPE_SIZE (type)))
4616 return 0;
4617
4618 gcc_assert (TYPE_MODE (type) == mode);
4619
4620 if (!AGGREGATE_TYPE_P (type))
4621 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4622
4623 if (TREE_CODE (type) == ARRAY_TYPE)
4624 return TYPE_ALIGN (TREE_TYPE (type));
4625
4626 unsigned int alignment = 0;
4627 unsigned int bitfield_alignment = 0;
4628 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4629 if (TREE_CODE (field) == FIELD_DECL)
4630 {
4631 alignment = std::max (alignment, DECL_ALIGN (field));
4632 if (DECL_BIT_FIELD_TYPE (field))
4633 bitfield_alignment
4634 = std::max (bitfield_alignment,
4635 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4636 }
4637
4638 if (bitfield_alignment > alignment)
4639 {
4640 *abi_break = true;
4641 return bitfield_alignment;
4642 }
4643
4644 return alignment;
4645 }
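/* Illustrative ABI_BREAK case: for a structure in which the highest
   alignment comes from a bit-field's declared type rather than from any
   field's own alignment, the loop above returns the bit-field type's
   alignment, whereas releases before GCC 9.1 used only the field
   alignments; ABI_BREAK flags exactly this situation so that callers
   can emit the -Wpsabi note seen elsewhere in this file.  */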
4646
4647 /* Layout a function argument according to the AAPCS64 rules. The rule
4648 numbers refer to the rule numbers in the AAPCS64. */
4649
4650 static void
4651 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4652 const_tree type,
4653 bool named ATTRIBUTE_UNUSED)
4654 {
4655 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4656 int ncrn, nvrn, nregs;
4657 bool allocate_ncrn, allocate_nvrn;
4658 HOST_WIDE_INT size;
4659 bool abi_break;
4660
4661 /* We need to do this once per argument. */
4662 if (pcum->aapcs_arg_processed)
4663 return;
4664
4665 pcum->aapcs_arg_processed = true;
4666
4667 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
4668 if (type)
4669 size = int_size_in_bytes (type);
4670 else
4671 /* No frontends can create types with variable-sized modes, so we
4672 shouldn't be asked to pass or return them. */
4673 size = GET_MODE_SIZE (mode).to_constant ();
4674 size = ROUND_UP (size, UNITS_PER_WORD);
4675
4676 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4677 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4678 mode,
4679 type,
4680 &nregs);
4681
4682 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
4683 The following code thus handles passing by SIMD/FP registers first. */
4684
4685 nvrn = pcum->aapcs_nvrn;
4686
4687 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4688 and homogeneous short-vector aggregates (HVA). */
4689 if (allocate_nvrn)
4690 {
4691 if (!TARGET_FLOAT)
4692 aarch64_err_no_fpadvsimd (mode);
4693
4694 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4695 {
4696 pcum->aapcs_nextnvrn = nvrn + nregs;
4697 if (!aarch64_composite_type_p (type, mode))
4698 {
4699 gcc_assert (nregs == 1);
4700 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4701 }
4702 else
4703 {
4704 rtx par;
4705 int i;
4706 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4707 for (i = 0; i < nregs; i++)
4708 {
4709 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4710 V0_REGNUM + nvrn + i);
4711 rtx offset = gen_int_mode
4712 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4713 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4714 XVECEXP (par, 0, i) = tmp;
4715 }
4716 pcum->aapcs_reg = par;
4717 }
4718 return;
4719 }
4720 else
4721 {
4722 /* C.3 NSRN is set to 8. */
4723 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4724 goto on_stack;
4725 }
4726 }
4727
4728 ncrn = pcum->aapcs_ncrn;
4729 nregs = size / UNITS_PER_WORD;
4730
4731 /* C6 - C9, though the sign and zero extension semantics are
4732 handled elsewhere. This is the case where the argument fits
4733 entirely in general registers. */
4734 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4735 {
4736 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4737
4738 /* C.8 if the argument has an alignment of 16 then the NGRN is
4739 rounded up to the next even number. */
4740 if (nregs == 2
4741 && ncrn % 2
4742 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4743 comparison is there because for > 16 * BITS_PER_UNIT
4744 alignment nregs should be > 2 and therefore the argument should be
4745 passed by reference rather than by value. */
4746 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4747 == 16 * BITS_PER_UNIT))
4748 {
4749 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4750 inform (input_location, "parameter passing for argument of type "
4751 "%qT changed in GCC 9.1", type);
4752 ++ncrn;
4753 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4754 }
4755
4756 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4757 A reg is still generated for it, but the caller should be smart
4758 enough not to use it. */
4759 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4760 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4761 else
4762 {
4763 rtx par;
4764 int i;
4765
4766 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4767 for (i = 0; i < nregs; i++)
4768 {
4769 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4770 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4771 GEN_INT (i * UNITS_PER_WORD));
4772 XVECEXP (par, 0, i) = tmp;
4773 }
4774 pcum->aapcs_reg = par;
4775 }
4776
4777 pcum->aapcs_nextncrn = ncrn + nregs;
4778 return;
4779 }
4780
4781 /* C.11 */
4782 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4783
4784 /* The argument is passed on stack; record the needed number of words for
4785 this argument and align the total size if necessary. */
4786 on_stack:
4787 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4788
4789 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4790 == 16 * BITS_PER_UNIT)
4791 {
4792 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4793 if (pcum->aapcs_stack_size != new_size)
4794 {
4795 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4796 inform (input_location, "parameter passing for argument of type "
4797 "%qT changed in GCC 9.1", type);
4798 pcum->aapcs_stack_size = new_size;
4799 }
4800 }
4801 return;
4802 }
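/* Worked example of rule C.8 above: an argument with 16-byte alignment
   that needs two registers (for instance a __int128) arriving when
   NCRN == 1 skips x1, is passed in x2 and x3, and leaves the next NCRN
   at 4; without the rounding it would straddle an odd/even register
   pair, which the AAPCS64 does not allow for such arguments.  */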
4803
4804 /* Implement TARGET_FUNCTION_ARG. */
4805
4806 static rtx
4807 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
4808 {
4809 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4810 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4811
4812 if (arg.end_marker_p ())
4813 return NULL_RTX;
4814
4815 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4816 return pcum->aapcs_reg;
4817 }
4818
4819 void
4820 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4821 const_tree fntype ATTRIBUTE_UNUSED,
4822 rtx libname ATTRIBUTE_UNUSED,
4823 const_tree fndecl ATTRIBUTE_UNUSED,
4824 unsigned n_named ATTRIBUTE_UNUSED)
4825 {
4826 pcum->aapcs_ncrn = 0;
4827 pcum->aapcs_nvrn = 0;
4828 pcum->aapcs_nextncrn = 0;
4829 pcum->aapcs_nextnvrn = 0;
4830 pcum->pcs_variant = ARM_PCS_AAPCS64;
4831 pcum->aapcs_reg = NULL_RTX;
4832 pcum->aapcs_arg_processed = false;
4833 pcum->aapcs_stack_words = 0;
4834 pcum->aapcs_stack_size = 0;
4835
4836 if (!TARGET_FLOAT
4837 && fndecl && TREE_PUBLIC (fndecl)
4838 && fntype && fntype != error_mark_node)
4839 {
4840 const_tree type = TREE_TYPE (fntype);
4841 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4842 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4843 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4844 &mode, &nregs, NULL))
4845 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4846 }
4847 return;
4848 }
4849
4850 static void
4851 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4852 const function_arg_info &arg)
4853 {
4854 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4855 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4856 {
4857 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4858 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4859 != (pcum->aapcs_stack_words != 0));
4860 pcum->aapcs_arg_processed = false;
4861 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4862 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4863 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4864 pcum->aapcs_stack_words = 0;
4865 pcum->aapcs_reg = NULL_RTX;
4866 }
4867 }
4868
4869 bool
4870 aarch64_function_arg_regno_p (unsigned regno)
4871 {
4872 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4873 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4874 }
4875
4876 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4877 PARM_BOUNDARY bits of alignment, but will be given anything up
4878 to STACK_BOUNDARY bits if the type requires it. This makes sure
4879 that both before and after the layout of each argument, the Next
4880 Stacked Argument Address (NSAA) will have a minimum alignment of
4881 8 bytes. */
4882
4883 static unsigned int
4884 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4885 {
4886 bool abi_break;
4887 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4888 &abi_break);
4889 if (abi_break && warn_psabi)
4890 inform (input_location, "parameter passing for argument of type "
4891 "%qT changed in GCC 9.1", type);
4892
4893 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4894 }
4895
4896 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4897
4898 static fixed_size_mode
4899 aarch64_get_reg_raw_mode (int regno)
4900 {
4901 if (TARGET_SVE && FP_REGNUM_P (regno))
4902 /* Don't use the SVE part of the register for __builtin_apply and
4903 __builtin_return. The SVE registers aren't used by the normal PCS,
4904 so using them there would be a waste of time. The PCS extensions
4905 for SVE types are fundamentally incompatible with the
4906 __builtin_return/__builtin_apply interface. */
4907 return as_a <fixed_size_mode> (V16QImode);
4908 return default_get_reg_raw_mode (regno);
4909 }
4910
4911 /* Implement TARGET_FUNCTION_ARG_PADDING.
4912
4913 Small aggregate types are placed in the lowest memory address.
4914
4915 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4916
4917 static pad_direction
4918 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4919 {
4920 /* On little-endian targets, the least significant byte of every stack
4921 argument is passed at the lowest byte address of the stack slot. */
4922 if (!BYTES_BIG_ENDIAN)
4923 return PAD_UPWARD;
4924
4925 /* Otherwise, integral, floating-point and pointer types are padded downward:
4926 the least significant byte of a stack argument is passed at the highest
4927 byte address of the stack slot. */
4928 if (type
4929 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4930 || POINTER_TYPE_P (type))
4931 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4932 return PAD_DOWNWARD;
4933
4934 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4935 return PAD_UPWARD;
4936 }
4937
4938 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4939
4940 It specifies padding for the last (may also be the only)
4941 element of a block move between registers and memory. Assuming
4942 the block is in memory, padding upward means that the last
4943 element is padded after its most significant byte, while with
4944 downward padding the last element is padded at its least
4945 significant byte side.
4946
4947 Small aggregates and small complex types are always padded
4948 upwards.
4949
4950 We don't need to worry about homogeneous floating-point or
4951 short-vector aggregates; their move is not affected by the
4952 padding direction determined here. Regardless of endianness,
4953 each element of such an aggregate is put in the least
4954 significant bits of a fp/simd register.
4955
4956 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4957 register has useful data, and return the opposite if the most
4958 significant byte does. */
4959
4960 bool
4961 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4962 bool first ATTRIBUTE_UNUSED)
4963 {
4964
4965 /* Small composite types are always padded upward. */
4966 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4967 {
4968 HOST_WIDE_INT size;
4969 if (type)
4970 size = int_size_in_bytes (type);
4971 else
4972 /* No frontends can create types with variable-sized modes, so we
4973 shouldn't be asked to pass or return them. */
4974 size = GET_MODE_SIZE (mode).to_constant ();
4975 if (size < 2 * UNITS_PER_WORD)
4976 return true;
4977 }
4978
4979 /* Otherwise, use the default padding. */
4980 return !BYTES_BIG_ENDIAN;
4981 }
4982
4983 static scalar_int_mode
4984 aarch64_libgcc_cmp_return_mode (void)
4985 {
4986 return SImode;
4987 }
4988
4989 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4990
4991 /* We use the 12-bit shifted immediate arithmetic instructions so values
4992 must be a multiple of (1 << 12), i.e. 4096. */
4993 #define ARITH_FACTOR 4096
4994
4995 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4996 #error Cannot use simple address calculation for stack probing
4997 #endif
4998
4999 /* The pair of scratch registers used for stack probing. */
5000 #define PROBE_STACK_FIRST_REG R9_REGNUM
5001 #define PROBE_STACK_SECOND_REG R10_REGNUM
5002
5003 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5004 inclusive. These are offsets from the current stack pointer. */
5005
5006 static void
5007 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5008 {
5009 HOST_WIDE_INT size;
5010 if (!poly_size.is_constant (&size))
5011 {
5012 sorry ("stack probes for SVE frames");
5013 return;
5014 }
5015
5016 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5017
5018 /* See the same assertion on PROBE_INTERVAL above. */
5019 gcc_assert ((first % ARITH_FACTOR) == 0);
5020
5021 /* See if we have a constant small number of probes to generate. If so,
5022 that's the easy case. */
5023 if (size <= PROBE_INTERVAL)
5024 {
5025 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5026
5027 emit_set_insn (reg1,
5028 plus_constant (Pmode,
5029 stack_pointer_rtx, -(first + base)));
5030 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5031 }
5032
5033 /* The run-time loop is made up of 8 insns in the generic case while the
5034 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
5035 else if (size <= 4 * PROBE_INTERVAL)
5036 {
5037 HOST_WIDE_INT i, rem;
5038
5039 emit_set_insn (reg1,
5040 plus_constant (Pmode,
5041 stack_pointer_rtx,
5042 -(first + PROBE_INTERVAL)));
5043 emit_stack_probe (reg1);
5044
5045 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5046 it exceeds SIZE. If only two probes are needed, this will not
5047 generate any code. Then probe at FIRST + SIZE. */
5048 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5049 {
5050 emit_set_insn (reg1,
5051 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5052 emit_stack_probe (reg1);
5053 }
5054
5055 rem = size - (i - PROBE_INTERVAL);
5056 if (rem > 256)
5057 {
5058 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5059
5060 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5061 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5062 }
5063 else
5064 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5065 }
5066
5067 /* Otherwise, do the same as above, but in a loop. Note that we must be
5068 extra careful with variables wrapping around because we might be at
5069 the very top (or the very bottom) of the address space and we have
5070 to be able to handle this case properly; in particular, we use an
5071 equality test for the loop condition. */
5072 else
5073 {
5074 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5075
5076 /* Step 1: round SIZE to the previous multiple of the interval. */
5077
5078 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5079
5080
5081 /* Step 2: compute initial and final value of the loop counter. */
5082
5083 /* TEST_ADDR = SP + FIRST. */
5084 emit_set_insn (reg1,
5085 plus_constant (Pmode, stack_pointer_rtx, -first));
5086
5087 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5088 HOST_WIDE_INT adjustment = - (first + rounded_size);
5089 if (! aarch64_uimm12_shift (adjustment))
5090 {
5091 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5092 true, Pmode);
5093 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5094 }
5095 else
5096 emit_set_insn (reg2,
5097 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5098
5099 /* Step 3: the loop
5100
5101 do
5102 {
5103 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5104 probe at TEST_ADDR
5105 }
5106 while (TEST_ADDR != LAST_ADDR)
5107
5108 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5109 until it is equal to ROUNDED_SIZE. */
5110
5111 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5112
5113
5114 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5115 that SIZE is equal to ROUNDED_SIZE. */
5116
5117 if (size != rounded_size)
5118 {
5119 HOST_WIDE_INT rem = size - rounded_size;
5120
5121 if (rem > 256)
5122 {
5123 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5124
5125 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5126 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5127 }
5128 else
5129 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5130 }
5131 }
5132
5133 /* Make sure nothing is scheduled before we are done. */
5134 emit_insn (gen_blockage ());
5135 }
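/* Worked example, taking PROBE_INTERVAL as 4096 for illustration: for
   FIRST == 0 and SIZE == 3 * 4096 the middle case above probes at
   offsets 4096 and 8192 below the incoming stack pointer and then, via
   the remainder handling, at 12288 (== SIZE), so no 4096-byte page in
   the range is skipped.  */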
5136
5137 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5138 absolute addresses. */
5139
5140 const char *
5141 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5142 {
5143 static int labelno = 0;
5144 char loop_lab[32];
5145 rtx xops[2];
5146
5147 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5148
5149 /* Loop. */
5150 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5151
5152 HOST_WIDE_INT stack_clash_probe_interval
5153 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5154
5155 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5156 xops[0] = reg1;
5157 HOST_WIDE_INT interval;
5158 if (flag_stack_clash_protection)
5159 interval = stack_clash_probe_interval;
5160 else
5161 interval = PROBE_INTERVAL;
5162
5163 gcc_assert (aarch64_uimm12_shift (interval));
5164 xops[1] = GEN_INT (interval);
5165
5166 output_asm_insn ("sub\t%0, %0, %1", xops);
5167
5168 /* If doing stack clash protection then we probe up by the ABI specified
5169 amount. We do this because we're dropping full pages at a time in the
5170 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5171 if (flag_stack_clash_protection)
5172 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5173 else
5174 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5175
5176 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5177 by this amount for each iteration. */
5178 output_asm_insn ("str\txzr, [%0, %1]", xops);
5179
5180 /* Test if TEST_ADDR == LAST_ADDR. */
5181 xops[1] = reg2;
5182 output_asm_insn ("cmp\t%0, %1", xops);
5183
5184 /* Branch. */
5185 fputs ("\tb.ne\t", asm_out_file);
5186 assemble_name_raw (asm_out_file, loop_lab);
5187 fputc ('\n', asm_out_file);
5188
5189 return "";
5190 }
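/* The emitted loop therefore has the shape (illustrative register
   names and label):

	.LPSRL0: sub  x9, x9, #interval
		 str  xzr, [x9, #offset]
		 cmp  x9, x10
		 b.ne .LPSRL0

   where the store offset is 0, or STACK_CLASH_CALLER_GUARD when stack
   clash protection is enabled, as set up above.  */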
5191
5192 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5193 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5194 of GUARD_SIZE. When a probe is emitted it is done at most
5195 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5196 at most MIN_PROBE_THRESHOLD. By the end of this function
5197 BASE = BASE - ADJUSTMENT. */
5198
5199 const char *
5200 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5201 rtx min_probe_threshold, rtx guard_size)
5202 {
5203 /* This function is not allowed to use any instruction generation function
5204 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5205 so instead emit the code you want using output_asm_insn. */
5206 gcc_assert (flag_stack_clash_protection);
5207 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5208 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5209
5210 /* The minimum required allocation before the residual requires probing. */
5211 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5212
5213 /* Clamp the value down to the nearest value that can be used with a cmp. */
5214 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5215 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5216
5217 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5218 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5219
5220 static int labelno = 0;
5221 char loop_start_lab[32];
5222 char loop_end_lab[32];
5223 rtx xops[2];
5224
5225 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5226 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5227
5228 /* Emit loop start label. */
5229 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5230
5231 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5232 xops[0] = adjustment;
5233 xops[1] = probe_offset_value_rtx;
5234 output_asm_insn ("cmp\t%0, %1", xops);
5235
5236 /* Branch to end if not enough adjustment to probe. */
5237 fputs ("\tb.lt\t", asm_out_file);
5238 assemble_name_raw (asm_out_file, loop_end_lab);
5239 fputc ('\n', asm_out_file);
5240
5241 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5242 xops[0] = base;
5243 xops[1] = probe_offset_value_rtx;
5244 output_asm_insn ("sub\t%0, %0, %1", xops);
5245
5246 /* Probe at BASE. */
5247 xops[1] = const0_rtx;
5248 output_asm_insn ("str\txzr, [%0, %1]", xops);
5249
5250 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5251 xops[0] = adjustment;
5252 xops[1] = probe_offset_value_rtx;
5253 output_asm_insn ("sub\t%0, %0, %1", xops);
5254
5255 /* Branch to start if still more bytes to allocate. */
5256 fputs ("\tb\t", asm_out_file);
5257 assemble_name_raw (asm_out_file, loop_start_lab);
5258 fputc ('\n', asm_out_file);
5259
5260 /* Loop exit: no further probe is needed. */
5261 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5262
5263 /* BASE = BASE - ADJUSTMENT. */
5264 xops[0] = base;
5265 xops[1] = adjustment;
5266 output_asm_insn ("sub\t%0, %0, %1", xops);
5267 return "";
5268 }
5269
5270 /* Determine whether a frame chain needs to be generated. */
5271 static bool
5272 aarch64_needs_frame_chain (void)
5273 {
5274 /* Force a frame chain for EH returns so the return address is at FP+8. */
5275 if (frame_pointer_needed || crtl->calls_eh_return)
5276 return true;
5277
5278 /* A leaf function cannot have calls or write LR. */
5279 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5280
5281 /* Don't use a frame chain in leaf functions if leaf frame pointers
5282 are disabled. */
5283 if (flag_omit_leaf_frame_pointer && is_leaf)
5284 return false;
5285
5286 return aarch64_use_frame_pointer;
5287 }
5288
5289 /* Mark the registers that need to be saved by the callee and calculate
5290 the size of the callee-saved registers area and frame record (both FP
5291 and LR may be omitted). */
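/* In outline, the layout computed below places the frame record (FP at
   offset 0, LR at offset 8) first when a frame chain is emitted, followed by
   the remaining general-purpose callee-saves and then the floating-point/SIMD
   callee-saves, with each group rounded up to a 16-byte boundary. */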
5292 static void
5293 aarch64_layout_frame (void)
5294 {
5295 HOST_WIDE_INT offset = 0;
5296 int regno, last_fp_reg = INVALID_REGNUM;
5297 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5298
5299 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5300
5301 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5302 the mid-end is doing. */
5303 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5304
5305 #define SLOT_NOT_REQUIRED (-2)
5306 #define SLOT_REQUIRED (-1)
5307
5308 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5309 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5310
5311 /* If this is a non-leaf simd function with calls we assume that
5312 at least one of those calls is to a non-simd function and thus
5313 we must save V8 to V23 in the prologue. */
5314
5315 if (simd_function && !crtl->is_leaf)
5316 {
5317 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5318 if (FP_SIMD_SAVED_REGNUM_P (regno))
5319 df_set_regs_ever_live (regno, true);
5320 }
5321
5322 /* First mark all the registers that really need to be saved... */
5323 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5324 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5325
5326 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5327 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5328
5329 /* ... that includes the eh data registers (if needed)... */
5330 if (crtl->calls_eh_return)
5331 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5332 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5333 = SLOT_REQUIRED;
5334
5335 /* ... and any callee saved register that dataflow says is live. */
5336 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5337 if (df_regs_ever_live_p (regno)
5338 && (regno == R30_REGNUM
5339 || !call_used_regs[regno]))
5340 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5341
5342 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5343 if (df_regs_ever_live_p (regno)
5344 && (!call_used_regs[regno]
5345 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5346 {
5347 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5348 last_fp_reg = regno;
5349 }
5350
5351 if (cfun->machine->frame.emit_frame_chain)
5352 {
5353 /* FP and LR are placed in the linkage record. */
5354 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5355 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5356 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5357 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5358 offset = 2 * UNITS_PER_WORD;
5359 }
5360
5361 /* With stack-clash, LR must be saved in non-leaf functions. */
5362 gcc_assert (crtl->is_leaf
5363 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5364 != SLOT_NOT_REQUIRED));
5365
5366 /* Now assign stack slots for them. */
5367 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5368 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5369 {
5370 cfun->machine->frame.reg_offset[regno] = offset;
5371 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5372 cfun->machine->frame.wb_candidate1 = regno;
5373 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5374 cfun->machine->frame.wb_candidate2 = regno;
5375 offset += UNITS_PER_WORD;
5376 }
5377
5378 HOST_WIDE_INT max_int_offset = offset;
5379 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5380 bool has_align_gap = offset != max_int_offset;
5381
5382 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5383 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5384 {
5385 /* If there is an alignment gap between integer and fp callee-saves,
5386 allocate the last fp register to it if possible. */
5387 if (regno == last_fp_reg
5388 && has_align_gap
5389 && !simd_function
5390 && (offset & 8) == 0)
5391 {
5392 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5393 break;
5394 }
5395
5396 cfun->machine->frame.reg_offset[regno] = offset;
5397 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5398 cfun->machine->frame.wb_candidate1 = regno;
5399 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5400 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5401 cfun->machine->frame.wb_candidate2 = regno;
5402 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5403 }
5404
5405 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5406
5407 cfun->machine->frame.saved_regs_size = offset;
5408
5409 HOST_WIDE_INT varargs_and_saved_regs_size
5410 = offset + cfun->machine->frame.saved_varargs_size;
5411
5412 cfun->machine->frame.hard_fp_offset
5413 = aligned_upper_bound (varargs_and_saved_regs_size
5414 + get_frame_size (),
5415 STACK_BOUNDARY / BITS_PER_UNIT);
5416
5417 /* Both these values are already aligned. */
5418 gcc_assert (multiple_p (crtl->outgoing_args_size,
5419 STACK_BOUNDARY / BITS_PER_UNIT));
5420 cfun->machine->frame.frame_size
5421 = (cfun->machine->frame.hard_fp_offset
5422 + crtl->outgoing_args_size);
5423
5424 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5425
5426 cfun->machine->frame.initial_adjust = 0;
5427 cfun->machine->frame.final_adjust = 0;
5428 cfun->machine->frame.callee_adjust = 0;
5429 cfun->machine->frame.callee_offset = 0;
5430
5431 HOST_WIDE_INT max_push_offset = 0;
5432 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5433 max_push_offset = 512;
5434 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5435 max_push_offset = 256;
5436
5437 HOST_WIDE_INT const_size, const_fp_offset;
5438 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5439 && const_size < max_push_offset
5440 && known_eq (crtl->outgoing_args_size, 0))
5441 {
5442 /* Simple, small frame with no outgoing arguments:
5443 stp reg1, reg2, [sp, -frame_size]!
5444 stp reg3, reg4, [sp, 16] */
5445 cfun->machine->frame.callee_adjust = const_size;
5446 }
5447 else if (known_lt (crtl->outgoing_args_size
5448 + cfun->machine->frame.saved_regs_size, 512)
5449 && !(cfun->calls_alloca
5450 && known_lt (cfun->machine->frame.hard_fp_offset,
5451 max_push_offset)))
5452 {
5453 /* Frame with small outgoing arguments:
5454 sub sp, sp, frame_size
5455 stp reg1, reg2, [sp, outgoing_args_size]
5456 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5457 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5458 cfun->machine->frame.callee_offset
5459 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5460 }
5461 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5462 && const_fp_offset < max_push_offset)
5463 {
5464 /* Frame with large outgoing arguments but a small local area:
5465 stp reg1, reg2, [sp, -hard_fp_offset]!
5466 stp reg3, reg4, [sp, 16]
5467 sub sp, sp, outgoing_args_size */
5468 cfun->machine->frame.callee_adjust = const_fp_offset;
5469 cfun->machine->frame.final_adjust
5470 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5471 }
5472 else
5473 {
5474 /* Frame with large local area and outgoing arguments using frame pointer:
5475 sub sp, sp, hard_fp_offset
5476 stp x29, x30, [sp, 0]
5477 add x29, sp, 0
5478 stp reg3, reg4, [sp, 16]
5479 sub sp, sp, outgoing_args_size */
5480 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5481 cfun->machine->frame.final_adjust
5482 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5483 }
5484
5485 cfun->machine->frame.laid_out = true;
5486 }
5487
5488 /* Return true if the register REGNO is saved on entry to
5489 the current function. */
5490
5491 static bool
5492 aarch64_register_saved_on_entry (int regno)
5493 {
5494 return cfun->machine->frame.reg_offset[regno] >= 0;
5495 }
5496
5497 /* Return the next register, from REGNO up to LIMIT, that the callee needs
5498 to save. */
5499
5500 static unsigned
5501 aarch64_next_callee_save (unsigned regno, unsigned limit)
5502 {
5503 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5504 regno ++;
5505 return regno;
5506 }
5507
5508 /* Push the register number REGNO of mode MODE to the stack with write-back
5509 adjusting the stack by ADJUSTMENT. */
5510
5511 static void
5512 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5513 HOST_WIDE_INT adjustment)
5514 {
5515 rtx base_rtx = stack_pointer_rtx;
5516 rtx insn, reg, mem;
5517
5518 reg = gen_rtx_REG (mode, regno);
5519 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5520 plus_constant (Pmode, base_rtx, -adjustment));
5521 mem = gen_frame_mem (mode, mem);
5522
5523 insn = emit_move_insn (mem, reg);
5524 RTX_FRAME_RELATED_P (insn) = 1;
5525 }
5526
5527 /* Generate and return an instruction to store the pair of registers
5528 REG and REG2 of mode MODE to location BASE with write-back adjusting
5529 the stack location BASE by ADJUSTMENT. */
5530
5531 static rtx
5532 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5533 HOST_WIDE_INT adjustment)
5534 {
5535 switch (mode)
5536 {
5537 case E_DImode:
5538 return gen_storewb_pairdi_di (base, base, reg, reg2,
5539 GEN_INT (-adjustment),
5540 GEN_INT (UNITS_PER_WORD - adjustment));
5541 case E_DFmode:
5542 return gen_storewb_pairdf_di (base, base, reg, reg2,
5543 GEN_INT (-adjustment),
5544 GEN_INT (UNITS_PER_WORD - adjustment));
5545 case E_TFmode:
5546 return gen_storewb_pairtf_di (base, base, reg, reg2,
5547 GEN_INT (-adjustment),
5548 GEN_INT (UNITS_PER_VREG - adjustment));
5549 default:
5550 gcc_unreachable ();
5551 }
5552 }
5553
5554 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5555 stack pointer by ADJUSTMENT. */
5556
5557 static void
5558 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5559 {
5560 rtx_insn *insn;
5561 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5562
5563 if (regno2 == INVALID_REGNUM)
5564 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5565
5566 rtx reg1 = gen_rtx_REG (mode, regno1);
5567 rtx reg2 = gen_rtx_REG (mode, regno2);
5568
5569 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5570 reg2, adjustment));
5571 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5572 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5573 RTX_FRAME_RELATED_P (insn) = 1;
5574 }
5575
5576 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
5577 adjusting it by ADJUSTMENT afterwards. */
5578
5579 static rtx
5580 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5581 HOST_WIDE_INT adjustment)
5582 {
5583 switch (mode)
5584 {
5585 case E_DImode:
5586 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5587 GEN_INT (UNITS_PER_WORD));
5588 case E_DFmode:
5589 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5590 GEN_INT (UNITS_PER_WORD));
5591 case E_TFmode:
5592 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5593 GEN_INT (UNITS_PER_VREG));
5594 default:
5595 gcc_unreachable ();
5596 }
5597 }
5598
5599 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5600 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5601 into CFI_OPS. */
5602
5603 static void
5604 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5605 rtx *cfi_ops)
5606 {
5607 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5608 rtx reg1 = gen_rtx_REG (mode, regno1);
5609
5610 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5611
5612 if (regno2 == INVALID_REGNUM)
5613 {
5614 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5615 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5616 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5617 }
5618 else
5619 {
5620 rtx reg2 = gen_rtx_REG (mode, regno2);
5621 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5622 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5623 reg2, adjustment));
5624 }
5625 }
5626
5627 /* Generate and return a store pair instruction of mode MODE to store
5628 register REG1 to MEM1 and register REG2 to MEM2. */
5629
5630 static rtx
5631 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5632 rtx reg2)
5633 {
5634 switch (mode)
5635 {
5636 case E_DImode:
5637 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5638
5639 case E_DFmode:
5640 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5641
5642 case E_TFmode:
5643 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5644
5645 default:
5646 gcc_unreachable ();
5647 }
5648 }
5649
5650 /* Generate and return a load pair instruction of mode MODE to load register
5651 REG1 from MEM1 and register REG2 from MEM2. */
5652
5653 static rtx
5654 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5655 rtx mem2)
5656 {
5657 switch (mode)
5658 {
5659 case E_DImode:
5660 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5661
5662 case E_DFmode:
5663 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5664
5665 case E_TFmode:
5666 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5667
5668 default:
5669 gcc_unreachable ();
5670 }
5671 }
5672
5673 /* Return TRUE if return address signing should be enabled for the current
5674 function, otherwise return FALSE. */
5675
5676 bool
5677 aarch64_return_address_signing_enabled (void)
5678 {
5679 /* This function should only be called after the frame has been laid out. */
5680 gcc_assert (cfun->machine->frame.laid_out);
5681
5682 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5683 if its LR is pushed onto stack. */
5684 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5685 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5686 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5687 }
5688
5689 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5690 bool
5691 aarch64_bti_enabled (void)
5692 {
5693 return (aarch64_enable_bti == 1);
5694 }
5695
5696 /* Emit code to save the callee-saved registers from register number START
5697 to LIMIT to the stack at the location starting at offset START_OFFSET,
5698 skipping any write-back candidates if SKIP_WB is true. */
5699
5700 static void
5701 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5702 unsigned start, unsigned limit, bool skip_wb)
5703 {
5704 rtx_insn *insn;
5705 unsigned regno;
5706 unsigned regno2;
5707
5708 for (regno = aarch64_next_callee_save (start, limit);
5709 regno <= limit;
5710 regno = aarch64_next_callee_save (regno + 1, limit))
5711 {
5712 rtx reg, mem;
5713 poly_int64 offset;
5714 int offset_diff;
5715
5716 if (skip_wb
5717 && (regno == cfun->machine->frame.wb_candidate1
5718 || regno == cfun->machine->frame.wb_candidate2))
5719 continue;
5720
5721 if (cfun->machine->reg_is_wrapped_separately[regno])
5722 continue;
5723
5724 reg = gen_rtx_REG (mode, regno);
5725 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5726 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5727 offset));
5728
5729 regno2 = aarch64_next_callee_save (regno + 1, limit);
5730 offset_diff = cfun->machine->frame.reg_offset[regno2]
5731 - cfun->machine->frame.reg_offset[regno];
5732
5733 if (regno2 <= limit
5734 && !cfun->machine->reg_is_wrapped_separately[regno2]
5735 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5736 {
5737 rtx reg2 = gen_rtx_REG (mode, regno2);
5738 rtx mem2;
5739
5740 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5741 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5742 offset));
5743 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5744 reg2));
5745
5746 /* The first part of a frame-related parallel insn is
5747 always assumed to be relevant to the frame
5748 calculations; subsequent parts are only
5749 frame-related if explicitly marked. */
5750 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5751 regno = regno2;
5752 }
5753 else
5754 insn = emit_move_insn (mem, reg);
5755
5756 RTX_FRAME_RELATED_P (insn) = 1;
5757 }
5758 }
5759
5760 /* Emit code to restore the callee registers of mode MODE from register
5761 number START up to and including LIMIT. Restore from the stack offset
5762 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5763 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5764
5765 static void
5766 aarch64_restore_callee_saves (machine_mode mode,
5767 poly_int64 start_offset, unsigned start,
5768 unsigned limit, bool skip_wb, rtx *cfi_ops)
5769 {
5770 rtx base_rtx = stack_pointer_rtx;
5771 unsigned regno;
5772 unsigned regno2;
5773 poly_int64 offset;
5774
5775 for (regno = aarch64_next_callee_save (start, limit);
5776 regno <= limit;
5777 regno = aarch64_next_callee_save (regno + 1, limit))
5778 {
5779 if (cfun->machine->reg_is_wrapped_separately[regno])
5780 continue;
5781
5782 rtx reg, mem;
5783 int offset_diff;
5784
5785 if (skip_wb
5786 && (regno == cfun->machine->frame.wb_candidate1
5787 || regno == cfun->machine->frame.wb_candidate2))
5788 continue;
5789
5790 reg = gen_rtx_REG (mode, regno);
5791 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5792 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5793
5794 regno2 = aarch64_next_callee_save (regno + 1, limit);
5795 offset_diff = cfun->machine->frame.reg_offset[regno2]
5796 - cfun->machine->frame.reg_offset[regno];
5797
5798 if (regno2 <= limit
5799 && !cfun->machine->reg_is_wrapped_separately[regno2]
5800 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5801 {
5802 rtx reg2 = gen_rtx_REG (mode, regno2);
5803 rtx mem2;
5804
5805 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5806 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5807 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5808
5809 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5810 regno = regno2;
5811 }
5812 else
5813 emit_move_insn (reg, mem);
5814 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5815 }
5816 }
5817
5818 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5819 of MODE. */
5820
5821 static inline bool
5822 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5823 {
5824 HOST_WIDE_INT multiple;
5825 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5826 && IN_RANGE (multiple, -8, 7));
5827 }
5828
5829 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5830 of MODE. */
5831
5832 static inline bool
5833 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5834 {
5835 HOST_WIDE_INT multiple;
5836 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5837 && IN_RANGE (multiple, 0, 63));
5838 }
5839
5840 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5841 of MODE. */
5842
5843 bool
5844 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5845 {
5846 HOST_WIDE_INT multiple;
5847 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5848 && IN_RANGE (multiple, -64, 63));
5849 }
5850
5851 /* Return true if OFFSET is a signed 9-bit value. */
5852
5853 bool
5854 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5855 poly_int64 offset)
5856 {
5857 HOST_WIDE_INT const_offset;
5858 return (offset.is_constant (&const_offset)
5859 && IN_RANGE (const_offset, -256, 255));
5860 }
5861
5862 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5863 of MODE. */
5864
5865 static inline bool
5866 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5867 {
5868 HOST_WIDE_INT multiple;
5869 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5870 && IN_RANGE (multiple, -256, 255));
5871 }
5872
5873 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5874 of MODE. */
5875
5876 static inline bool
5877 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5878 {
5879 HOST_WIDE_INT multiple;
5880 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5881 && IN_RANGE (multiple, 0, 4095));
5882 }
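/* For example, with 8-byte DImode accesses, offset_12bit_unsigned_scaled_p
   accepts byte offsets 0, 8, ..., 32760, matching the unsigned scaled 12-bit
   immediate form of LDR/STR. */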
5883
5884 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5885
5886 static sbitmap
5887 aarch64_get_separate_components (void)
5888 {
5889 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5890 bitmap_clear (components);
5891
5892 /* The registers we need saved to the frame. */
5893 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5894 if (aarch64_register_saved_on_entry (regno))
5895 {
5896 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5897 if (!frame_pointer_needed)
5898 offset += cfun->machine->frame.frame_size
5899 - cfun->machine->frame.hard_fp_offset;
5900 /* Check that we can access the stack slot of the register with one
5901 direct load with no adjustments needed. */
5902 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5903 bitmap_set_bit (components, regno);
5904 }
5905
5906 /* Don't mess with the hard frame pointer. */
5907 if (frame_pointer_needed)
5908 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5909
5910 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5911 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5912 /* If registers have been chosen to be stored/restored with
5913 writeback, don't interfere with them to avoid having to output explicit
5914 stack adjustment instructions. */
5915 if (reg2 != INVALID_REGNUM)
5916 bitmap_clear_bit (components, reg2);
5917 if (reg1 != INVALID_REGNUM)
5918 bitmap_clear_bit (components, reg1);
5919
5920 bitmap_clear_bit (components, LR_REGNUM);
5921 bitmap_clear_bit (components, SP_REGNUM);
5922
5923 return components;
5924 }
5925
5926 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5927
5928 static sbitmap
5929 aarch64_components_for_bb (basic_block bb)
5930 {
5931 bitmap in = DF_LIVE_IN (bb);
5932 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5933 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5934 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5935
5936 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5937 bitmap_clear (components);
5938
5939 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5940 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5941 if ((!call_used_regs[regno]
5942 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5943 && (bitmap_bit_p (in, regno)
5944 || bitmap_bit_p (gen, regno)
5945 || bitmap_bit_p (kill, regno)))
5946 {
5947 unsigned regno2, offset, offset2;
5948 bitmap_set_bit (components, regno);
5949
5950 /* If there is a callee-save at an adjacent offset, add it too
5951 to increase the use of LDP/STP. */
5952 offset = cfun->machine->frame.reg_offset[regno];
5953 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
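/* I.e. the register whose 8-byte save slot is expected to share the same
   16-byte chunk: a slot at a 16-byte-aligned offset pairs with the next
   register, otherwise with the previous one. */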
5954
5955 if (regno2 <= LAST_SAVED_REGNUM)
5956 {
5957 offset2 = cfun->machine->frame.reg_offset[regno2];
5958 if ((offset & ~8) == (offset2 & ~8))
5959 bitmap_set_bit (components, regno2);
5960 }
5961 }
5962
5963 return components;
5964 }
5965
5966 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5967 Nothing to do for aarch64. */
5968
5969 static void
5970 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5971 {
5972 }
5973
5974 /* Return the next set bit in BMP from START onwards. Return the total number
5975 of bits in BMP if no set bit is found at or after START. */
5976
5977 static unsigned int
5978 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5979 {
5980 unsigned int nbits = SBITMAP_SIZE (bmp);
5981 if (start == nbits)
5982 return start;
5983
5984 gcc_assert (start < nbits);
5985 for (unsigned int i = start; i < nbits; i++)
5986 if (bitmap_bit_p (bmp, i))
5987 return i;
5988
5989 return nbits;
5990 }
5991
5992 /* Do the work for aarch64_emit_prologue_components and
5993 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5994 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5995 for these components or the epilogue sequence. That is, it determines
5996 whether we should emit stores or loads and what kind of CFA notes to attach
5997 to the insns. Otherwise the logic for the two sequences is very
5998 similar. */
5999
6000 static void
6001 aarch64_process_components (sbitmap components, bool prologue_p)
6002 {
6003 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6004 ? HARD_FRAME_POINTER_REGNUM
6005 : STACK_POINTER_REGNUM);
6006
6007 unsigned last_regno = SBITMAP_SIZE (components);
6008 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6009 rtx_insn *insn = NULL;
6010
6011 while (regno != last_regno)
6012 {
6013 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
6014 so DFmode for the vector registers is enough. For simd functions
6015 we want to save the low 128 bits. */
6016 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
6017
6018 rtx reg = gen_rtx_REG (mode, regno);
6019 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6020 if (!frame_pointer_needed)
6021 offset += cfun->machine->frame.frame_size
6022 - cfun->machine->frame.hard_fp_offset;
6023 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6024 rtx mem = gen_frame_mem (mode, addr);
6025
6026 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6027 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6028 /* No more registers to handle after REGNO.
6029 Emit a single save/restore and exit. */
6030 if (regno2 == last_regno)
6031 {
6032 insn = emit_insn (set);
6033 RTX_FRAME_RELATED_P (insn) = 1;
6034 if (prologue_p)
6035 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6036 else
6037 add_reg_note (insn, REG_CFA_RESTORE, reg);
6038 break;
6039 }
6040
6041 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6042 /* The next register is not of the same class or its offset is not
6043 mergeable with the current one into a pair. */
6044 if (!satisfies_constraint_Ump (mem)
6045 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6046 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
6047 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6048 GET_MODE_SIZE (mode)))
6049 {
6050 insn = emit_insn (set);
6051 RTX_FRAME_RELATED_P (insn) = 1;
6052 if (prologue_p)
6053 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6054 else
6055 add_reg_note (insn, REG_CFA_RESTORE, reg);
6056
6057 regno = regno2;
6058 continue;
6059 }
6060
6061 /* REGNO2 can be saved/restored in a pair with REGNO. */
6062 rtx reg2 = gen_rtx_REG (mode, regno2);
6063 if (!frame_pointer_needed)
6064 offset2 += cfun->machine->frame.frame_size
6065 - cfun->machine->frame.hard_fp_offset;
6066 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6067 rtx mem2 = gen_frame_mem (mode, addr2);
6068 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6069 : gen_rtx_SET (reg2, mem2);
6070
6071 if (prologue_p)
6072 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6073 else
6074 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6075
6076 RTX_FRAME_RELATED_P (insn) = 1;
6077 if (prologue_p)
6078 {
6079 add_reg_note (insn, REG_CFA_OFFSET, set);
6080 add_reg_note (insn, REG_CFA_OFFSET, set2);
6081 }
6082 else
6083 {
6084 add_reg_note (insn, REG_CFA_RESTORE, reg);
6085 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6086 }
6087
6088 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6089 }
6090 }
6091
6092 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6093
6094 static void
6095 aarch64_emit_prologue_components (sbitmap components)
6096 {
6097 aarch64_process_components (components, true);
6098 }
6099
6100 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6101
6102 static void
6103 aarch64_emit_epilogue_components (sbitmap components)
6104 {
6105 aarch64_process_components (components, false);
6106 }
6107
6108 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6109
6110 static void
6111 aarch64_set_handled_components (sbitmap components)
6112 {
6113 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6114 if (bitmap_bit_p (components, regno))
6115 cfun->machine->reg_is_wrapped_separately[regno] = true;
6116 }
6117
6118 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6119 determine the probe offset for alloca. */
6120
6121 static HOST_WIDE_INT
6122 aarch64_stack_clash_protection_alloca_probe_range (void)
6123 {
6124 return STACK_CLASH_CALLER_GUARD;
6125 }
6126
6127
6128 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6129 registers. If POLY_SIZE is not large enough to require a probe this function
6130 will only adjust the stack. When allocating the stack space
6131 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6132 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6133 arguments. If we are, then we ensure that any allocation larger than the ABI
6134 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6135 maintained.
6136
6137 We emit barriers after each stack adjustment to prevent optimizations from
6138 breaking the invariant that we never drop the stack more than a page. This
6139 invariant is needed to make it easier to correctly handle asynchronous
6140 events: e.g. if we were to drop the stack by more than a page and only
6141 emit the probes afterwards, a signal taken somewhere in between would
6142 leave the signal handler unable to know the state of the stack or make
6143 any assumptions about which pages have been probed. */
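/* As an illustration of the thresholds computed below (assuming the default
   64KB guard size and the 1KB STACK_CLASH_CALLER_GUARD buffer): the initial
   adjustment only requires probing once it reaches 64KB - 1KB = 63KB, while
   the final adjustment for the outgoing arguments requires probing once it
   reaches 1KB minus the offset at which LR was saved. */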
6144
6145 static void
6146 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6147 poly_int64 poly_size,
6148 bool frame_related_p,
6149 bool final_adjustment_p)
6150 {
6151 HOST_WIDE_INT guard_size
6152 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6153 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6154 /* When doing the final adjustment for the outgoing argument size we can't
6155 assume that LR was saved at position 0, so subtract its offset from the
6156 ABI safe buffer so that we don't accidentally allow an adjustment that
6157 would result in an allocation larger than the ABI buffer without
6158 probing. */
6159 HOST_WIDE_INT min_probe_threshold
6160 = final_adjustment_p
6161 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6162 : guard_size - guard_used_by_caller;
6163
6164 poly_int64 frame_size = cfun->machine->frame.frame_size;
6165
6166 /* We should always have a positive probe threshold. */
6167 gcc_assert (min_probe_threshold > 0);
6168
6169 if (flag_stack_clash_protection && !final_adjustment_p)
6170 {
6171 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6172 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6173
6174 if (known_eq (frame_size, 0))
6175 {
6176 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6177 }
6178 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6179 && known_lt (final_adjust, guard_used_by_caller))
6180 {
6181 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6182 }
6183 }
6184
6185 /* If SIZE is not large enough to require probing, just adjust the stack and
6186 exit. */
6187 if (known_lt (poly_size, min_probe_threshold)
6188 || !flag_stack_clash_protection)
6189 {
6190 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6191 return;
6192 }
6193
6194 HOST_WIDE_INT size;
6195 /* Handle the SVE non-constant case first. */
6196 if (!poly_size.is_constant (&size))
6197 {
6198 if (dump_file)
6199 {
6200 fprintf (dump_file, "Stack clash SVE prologue: ");
6201 print_dec (poly_size, dump_file);
6202 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6203 }
6204
6205 /* First calculate the amount of bytes we're actually spilling. */
6206 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6207 poly_size, temp1, temp2, false, true);
6208
6209 rtx_insn *insn = get_last_insn ();
6210
6211 if (frame_related_p)
6212 {
6213 /* This is done to provide unwinding information for the stack
6214 adjustments we're about to do; however, to prevent the optimizers
6215 from removing the R11 move and leaving the CFA note (which would be
6216 very wrong) we tie the old and new stack pointer together.
6217 The tie will expand to nothing but the optimizers will not touch
6218 the instruction. */
6219 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6220 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6221 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6222
6223 /* We want the CFA independent of the stack pointer for the
6224 duration of the loop. */
6225 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6226 RTX_FRAME_RELATED_P (insn) = 1;
6227 }
6228
6229 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6230 rtx guard_const = gen_int_mode (guard_size, Pmode);
6231
6232 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6233 stack_pointer_rtx, temp1,
6234 probe_const, guard_const));
6235
6236 /* Now reset the CFA register if needed. */
6237 if (frame_related_p)
6238 {
6239 add_reg_note (insn, REG_CFA_DEF_CFA,
6240 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6241 gen_int_mode (poly_size, Pmode)));
6242 RTX_FRAME_RELATED_P (insn) = 1;
6243 }
6244
6245 return;
6246 }
6247
6248 if (dump_file)
6249 fprintf (dump_file,
6250 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6251 " bytes, probing will be required.\n", size);
6252
6253 /* Round size to the nearest multiple of guard_size, and calculate the
6254 residual as the difference between the original size and the rounded
6255 size. */
6256 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6257 HOST_WIDE_INT residual = size - rounded_size;
6258
6259 /* We can handle a small number of allocations/probes inline. Otherwise
6260 punt to a loop. */
6261 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6262 {
6263 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6264 {
6265 aarch64_sub_sp (NULL, temp2, guard_size, true);
6266 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6267 guard_used_by_caller));
6268 emit_insn (gen_blockage ());
6269 }
6270 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6271 }
6272 else
6273 {
6274 /* Compute the ending address. */
6275 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6276 temp1, NULL, false, true);
6277 rtx_insn *insn = get_last_insn ();
6278
6279 /* For the initial allocation, we don't have a frame pointer
6280 set up, so we always need CFI notes. If we're doing the
6281 final allocation, then we may have a frame pointer, in which
6282 case it is the CFA, otherwise we need CFI notes.
6283
6284 We can determine which allocation we are doing by looking at
6285 the value of FRAME_RELATED_P since the final allocations are not
6286 frame related. */
6287 if (frame_related_p)
6288 {
6289 /* We want the CFA independent of the stack pointer for the
6290 duration of the loop. */
6291 add_reg_note (insn, REG_CFA_DEF_CFA,
6292 plus_constant (Pmode, temp1, rounded_size));
6293 RTX_FRAME_RELATED_P (insn) = 1;
6294 }
6295
6296 /* This allocates and probes the stack. Note that this re-uses some of
6297 the existing Ada stack protection code. However we are guaranteed not
6298 to enter the non-loop or residual branches of that code.
6299
6300 The non-loop part won't be entered because if our allocation amount
6301 doesn't require a loop, the case above would handle it.
6302
6303 The residual amount won't be entered because TEMP1 is a multiple of
6304 the allocation size. The residual will always be 0. As such, the only
6305 part we are actually using from that code is the loop setup. The
6306 actual probing is done in aarch64_output_probe_stack_range. */
6307 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6308 stack_pointer_rtx, temp1));
6309
6310 /* Now reset the CFA register if needed. */
6311 if (frame_related_p)
6312 {
6313 add_reg_note (insn, REG_CFA_DEF_CFA,
6314 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6315 RTX_FRAME_RELATED_P (insn) = 1;
6316 }
6317
6318 emit_insn (gen_blockage ());
6319 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6320 }
6321
6322 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6323 be probed. This maintains the requirement that each page is probed at
6324 least once. For initial probing we probe only if the allocation is
6325 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6326 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6327 GUARD_SIZE. This means that for any allocation large enough to
6328 trigger a probe here, we'll emit at least one, and for allocations too
6329 small for this code to emit anything, the page will already have been
6330 probed by the saving of FP/LR, either by this function or any callees. If
6331 we don't have any callees then we won't have more stack adjustments and so
6332 are still safe. */
6333 if (residual)
6334 {
6335 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6336 /* If we're doing final adjustments, and we've done any full page
6337 allocations then any residual needs to be probed. */
6338 if (final_adjustment_p && rounded_size != 0)
6339 min_probe_threshold = 0;
6340 /* If doing a small final adjustment, we always probe at offset 0.
6341 This is done to avoid issues when LR is not at position 0 or when
6342 the final adjustment is smaller than the probing offset. */
6343 else if (final_adjustment_p && rounded_size == 0)
6344 residual_probe_offset = 0;
6345
6346 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6347 if (residual >= min_probe_threshold)
6348 {
6349 if (dump_file)
6350 fprintf (dump_file,
6351 "Stack clash AArch64 prologue residuals: "
6352 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6353 "\n", residual);
6354
6355 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6356 residual_probe_offset));
6357 emit_insn (gen_blockage ());
6358 }
6359 }
6360 }
6361
6362 /* Return 1 if the register is used by the epilogue. We need to say the
6363 return register is used, but only after epilogue generation is complete.
6364 Note that in the case of sibcalls, the values "used by the epilogue" are
6365 considered live at the start of the called function.
6366
6367 For SIMD functions we need to return 1 for FP registers that are saved and
6368 restored by a function but are nonzero in call_used_regs. If we do not do
6369 this, optimizations may remove the restore of the register. */
6370
6371 int
6372 aarch64_epilogue_uses (int regno)
6373 {
6374 if (epilogue_completed)
6375 {
6376 if (regno == LR_REGNUM)
6377 return 1;
6378 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6379 return 1;
6380 }
6381 return 0;
6382 }
6383
6384 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6385 is saved at BASE + OFFSET. */
6386
6387 static void
6388 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6389 rtx base, poly_int64 offset)
6390 {
6391 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6392 add_reg_note (insn, REG_CFA_EXPRESSION,
6393 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6394 }
6395
6396 /* AArch64 stack frames generated by this compiler look like:
6397
6398 +-------------------------------+
6399 | |
6400 | incoming stack arguments |
6401 | |
6402 +-------------------------------+
6403 | | <-- incoming stack pointer (aligned)
6404 | callee-allocated save area |
6405 | for register varargs |
6406 | |
6407 +-------------------------------+
6408 | local variables | <-- frame_pointer_rtx
6409 | |
6410 +-------------------------------+
6411 | padding | \
6412 +-------------------------------+ |
6413 | callee-saved registers | | frame.saved_regs_size
6414 +-------------------------------+ |
6415 | LR' | |
6416 +-------------------------------+ |
6417 | FP' | / <- hard_frame_pointer_rtx (aligned)
6418 +-------------------------------+
6419 | dynamic allocation |
6420 +-------------------------------+
6421 | padding |
6422 +-------------------------------+
6423 | outgoing stack arguments | <-- arg_pointer
6424 | |
6425 +-------------------------------+
6426 | | <-- stack_pointer_rtx (aligned)
6427
6428 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6429 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6430 unchanged.
6431
6432 By default for stack-clash we assume the guard is at least 64KB, but this
6433 value is configurable to either 4KB or 64KB. We also force the guard size to
6434 be the same as the probing interval and both values are kept in sync.
6435
6436 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6437 on the guard size) of stack space without probing.
6438
6439 When probing is needed, we emit a probe at the start of the prologue
6440 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6441
6442 We have to track how much space has been allocated and the only stores
6443 to the stack we track as implicit probes are the FP/LR stores.
6444
6445 For outgoing arguments we probe if the size is larger than 1KB, such that
6446 the ABI specified buffer is maintained for the next callee.
6447
6448 The following registers are reserved during frame layout and should not be
6449 used for any other purpose:
6450
6451 - r11: Used by stack clash protection when SVE is enabled.
6452 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6453 - r14 and r15: Used for speculation tracking.
6454 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6455 - r30(LR), r29(FP): Used by standard frame layout.
6456
6457 These registers must be avoided in frame layout related code unless the
6458 explicit intention is to interact with one of the features listed above. */
6459
6460 /* Generate the prologue instructions for entry into a function.
6461 Establish the stack frame by decreasing the stack pointer with a
6462 properly calculated size and, if necessary, create a frame record
6463 filled with the values of LR and previous frame pointer. The
6464 current FP is also set up if it is in use. */
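/* In outline, the sequence emitted below is:

   1) sign the return address, if enabled;
   2) emit any -fstack-check probes;
   3) allocate (and possibly probe) the initial adjustment;
   4) push the write-back candidate registers, if any;
   5) set up the frame chain (FP/LR), when one is needed;
   6) save the remaining callee-saved registers;
   7) allocate (and possibly probe) the final adjustment for the outgoing
      arguments. */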
6465
6466 void
6467 aarch64_expand_prologue (void)
6468 {
6469 poly_int64 frame_size = cfun->machine->frame.frame_size;
6470 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6471 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6472 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6473 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6474 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6475 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6476 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6477 rtx_insn *insn;
6478
6479 /* Sign return address for functions. */
6480 if (aarch64_return_address_signing_enabled ())
6481 {
6482 switch (aarch64_ra_sign_key)
6483 {
6484 case AARCH64_KEY_A:
6485 insn = emit_insn (gen_paciasp ());
6486 break;
6487 case AARCH64_KEY_B:
6488 insn = emit_insn (gen_pacibsp ());
6489 break;
6490 default:
6491 gcc_unreachable ();
6492 }
6493 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6494 RTX_FRAME_RELATED_P (insn) = 1;
6495 }
6496
6497 if (flag_stack_usage_info)
6498 current_function_static_stack_size = constant_lower_bound (frame_size);
6499
6500 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6501 {
6502 if (crtl->is_leaf && !cfun->calls_alloca)
6503 {
6504 if (maybe_gt (frame_size, PROBE_INTERVAL)
6505 && maybe_gt (frame_size, get_stack_check_protect ()))
6506 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6507 (frame_size
6508 - get_stack_check_protect ()));
6509 }
6510 else if (maybe_gt (frame_size, 0))
6511 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6512 }
6513
6514 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6515 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6516
6517 /* In theory we should never have both an initial adjustment
6518 and a callee save adjustment. Verify that is the case since the
6519 code below does not handle it for -fstack-clash-protection. */
6520 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6521
6522 /* Will only probe if the initial adjustment is larger than the guard
6523 less the amount of the guard reserved for use by the caller's
6524 outgoing args. */
6525 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6526 true, false);
6527
6528 if (callee_adjust != 0)
6529 aarch64_push_regs (reg1, reg2, callee_adjust);
6530
6531 if (emit_frame_chain)
6532 {
6533 poly_int64 reg_offset = callee_adjust;
6534 if (callee_adjust == 0)
6535 {
6536 reg1 = R29_REGNUM;
6537 reg2 = R30_REGNUM;
6538 reg_offset = callee_offset;
6539 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6540 }
6541 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6542 stack_pointer_rtx, callee_offset,
6543 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6544 if (frame_pointer_needed && !frame_size.is_constant ())
6545 {
6546 /* Variable-sized frames need to describe the save slot
6547 address using DW_CFA_expression rather than DW_CFA_offset.
6548 This means that, without taking further action, the
6549 locations of the registers that we've already saved would
6550 remain based on the stack pointer even after we redefine
6551 the CFA based on the frame pointer. We therefore need new
6552 DW_CFA_expressions to re-express the save slots with addresses
6553 based on the frame pointer. */
6554 rtx_insn *insn = get_last_insn ();
6555 gcc_assert (RTX_FRAME_RELATED_P (insn));
6556
6557 /* Add an explicit CFA definition if this was previously
6558 implicit. */
6559 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6560 {
6561 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6562 callee_offset);
6563 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6564 gen_rtx_SET (hard_frame_pointer_rtx, src));
6565 }
6566
6567 /* Change the save slot expressions for the registers that
6568 we've already saved. */
6569 reg_offset -= callee_offset;
6570 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6571 reg_offset + UNITS_PER_WORD);
6572 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6573 reg_offset);
6574 }
6575 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6576 }
6577
6578 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6579 callee_adjust != 0 || emit_frame_chain);
6580 if (aarch64_simd_decl_p (cfun->decl))
6581 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6582 callee_adjust != 0 || emit_frame_chain);
6583 else
6584 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6585 callee_adjust != 0 || emit_frame_chain);
6586
6587 /* We may need to probe the final adjustment if it is larger than the guard
6588 that is assumed by the callee. */
6589 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6590 !frame_pointer_needed, true);
6591 }
6592
6593 /* Return TRUE if we can use a simple_return insn.
6594
6595 This function checks whether the callee-saved stack is empty, which
6596 means no restore actions are needed. The pro_and_epilogue pass will use
6597 this to check whether the shrink-wrapping optimization is feasible. */
6598
6599 bool
6600 aarch64_use_return_insn_p (void)
6601 {
6602 if (!reload_completed)
6603 return false;
6604
6605 if (crtl->profile)
6606 return false;
6607
6608 return known_eq (cfun->machine->frame.frame_size, 0);
6609 }
6610
6611 /* Return false for non-leaf SIMD functions in order to avoid
6612 shrink-wrapping them, which would lose the necessary
6613 save/restore of FP registers. */
6614
6615 bool
6616 aarch64_use_simple_return_insn_p (void)
6617 {
6618 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6619 return false;
6620
6621 return true;
6622 }
6623
6624 /* Generate the epilogue instructions for returning from a function.
6625 This is almost exactly the reverse of the prolog sequence, except
6626 that we need to insert barriers to avoid scheduling loads that read
6627 from a deallocated stack, and we optimize the unwind records by
6628 emitting them all together if possible. */
6629 void
6630 aarch64_expand_epilogue (bool for_sibcall)
6631 {
6632 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6633 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6634 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6635 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6636 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6637 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6638 rtx cfi_ops = NULL;
6639 rtx_insn *insn;
6640 /* A stack clash protection prologue may not have left EP0_REGNUM or
6641 EP1_REGNUM in a usable state. The same is true for allocations
6642 with an SVE component, since we then need both temporary registers
6643 for each allocation. For stack clash we are in a usable state if
6644 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6645 HOST_WIDE_INT guard_size
6646 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6647 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6648
6649 /* We can re-use the registers when the allocation amount is smaller than
6650 guard_size - guard_used_by_caller because we won't be doing any probes
6651 then. In such situations the register should remain live with the correct
6652 value. */
6653 bool can_inherit_p = (initial_adjust.is_constant ()
6654 && final_adjust.is_constant ())
6655 && (!flag_stack_clash_protection
6656 || known_lt (initial_adjust,
6657 guard_size - guard_used_by_caller));
6658
6659 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6660 bool need_barrier_p
6661 = maybe_ne (get_frame_size ()
6662 + cfun->machine->frame.saved_varargs_size, 0);
6663
6664 /* Emit a barrier to prevent loads from a deallocated stack. */
6665 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6666 || cfun->calls_alloca
6667 || crtl->calls_eh_return)
6668 {
6669 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6670 need_barrier_p = false;
6671 }
6672
6673 /* Restore the stack pointer from the frame pointer if it may not
6674 be the same as the stack pointer. */
6675 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6676 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6677 if (frame_pointer_needed
6678 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6679 /* If writeback is used when restoring callee-saves, the CFA
6680 is restored on the instruction doing the writeback. */
6681 aarch64_add_offset (Pmode, stack_pointer_rtx,
6682 hard_frame_pointer_rtx, -callee_offset,
6683 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6684 else
6685 /* The case where we need to re-use the register here is very rare, so
6686 avoid the complicated condition and just always emit a move if the
6687 immediate doesn't fit. */
6688 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6689
6690 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6691 callee_adjust != 0, &cfi_ops);
6692 if (aarch64_simd_decl_p (cfun->decl))
6693 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6694 callee_adjust != 0, &cfi_ops);
6695 else
6696 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6697 callee_adjust != 0, &cfi_ops);
6698
6699 if (need_barrier_p)
6700 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6701
6702 if (callee_adjust != 0)
6703 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6704
6705 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6706 {
6707 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6708 insn = get_last_insn ();
6709 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6710 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6711 RTX_FRAME_RELATED_P (insn) = 1;
6712 cfi_ops = NULL;
6713 }
6714
6715 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6716 restrict the emit_move optimization to leaf functions. */
6717 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6718 (!can_inherit_p || !crtl->is_leaf
6719 || df_regs_ever_live_p (EP0_REGNUM)));
6720
6721 if (cfi_ops)
6722 {
6723 /* Emit delayed restores and reset the CFA to be SP. */
6724 insn = get_last_insn ();
6725 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6726 REG_NOTES (insn) = cfi_ops;
6727 RTX_FRAME_RELATED_P (insn) = 1;
6728 }
6729
6730 /* We prefer to emit the combined return/authenticate instruction RETAA;
6731 however, there are three cases in which we must instead emit an explicit
6732 authentication instruction.
6733
6734 1) Sibcalls don't return in a normal way, so if we're about to call one
6735 we must authenticate.
6736
6737 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6738 generating code for !TARGET_ARMV8_3 we can't use it and must
6739 explicitly authenticate.
6740
6741 3) On an eh_return path we make extra stack adjustments to update the
6742 canonical frame address to be the exception handler's CFA. We want
6743 to authenticate using the CFA of the function which calls eh_return.
6744 */
6745 if (aarch64_return_address_signing_enabled ()
6746 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6747 {
6748 switch (aarch64_ra_sign_key)
6749 {
6750 case AARCH64_KEY_A:
6751 insn = emit_insn (gen_autiasp ());
6752 break;
6753 case AARCH64_KEY_B:
6754 insn = emit_insn (gen_autibsp ());
6755 break;
6756 default:
6757 gcc_unreachable ();
6758 }
6759 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6760 RTX_FRAME_RELATED_P (insn) = 1;
6761 }
6762
6763 /* Stack adjustment for exception handler. */
6764 if (crtl->calls_eh_return && !for_sibcall)
6765 {
6766 /* We need to unwind the stack by the offset computed by
6767 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6768 to be SP; letting the CFA move during this adjustment
6769 is just as correct as retaining the CFA from the body
6770 of the function. Therefore, do nothing special. */
6771 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6772 }
6773
6774 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6775 if (!for_sibcall)
6776 emit_jump_insn (ret_rtx);
6777 }
6778
6779 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6780 normally or return to a previous frame after unwinding.
6781
6782 An EH return uses a single shared return sequence. The epilogue is
6783 exactly like a normal epilogue except that it has an extra input
6784 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6785 that must be applied after the frame has been destroyed. An extra label
6786 is inserted before the epilogue which initializes this register to zero,
6787 and this is the entry point for a normal return.
6788
6789 An actual EH return updates the return address, initializes the stack
6790 adjustment and jumps directly into the epilogue (bypassing the zeroing
6791 of the adjustment). Since the return address is typically saved on the
6792 stack when a function makes a call, the saved LR must be updated outside
6793 the epilogue.
6794
6795 This poses problems as the store is generated well before the epilogue,
6796 so the offset of LR is not known yet. Also, optimizations will remove the
6797 store as it appears dead, even after the epilogue is generated (as the
6798 base or offset for loading LR is different in many cases).
6799
6800 To avoid these problems this implementation forces the frame pointer
6801 in eh_return functions so that the location of LR is fixed and known early.
6802 It also marks the store volatile, so no optimization is permitted to
6803 remove the store. */
6804 rtx
6805 aarch64_eh_return_handler_rtx (void)
6806 {
6807 rtx tmp = gen_frame_mem (Pmode,
6808 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6809
6810 /* Mark the store volatile, so no optimization is permitted to remove it. */
6811 MEM_VOLATILE_P (tmp) = true;
6812 return tmp;
6813 }
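
/* Illustrative note: with the frame pointer forced, the saved LR sits one
   word above the saved FP, so the RTX built above is roughly
     (mem/v:DI (plus:DI (reg:DI x29) (const_int 8)))
   given that UNITS_PER_WORD is 8 on AArch64.  */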
6814
6815 /* Output code to add DELTA to the first argument, and then jump
6816 to FUNCTION. Used for C++ multiple inheritance. */
6817 static void
6818 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6819 HOST_WIDE_INT delta,
6820 HOST_WIDE_INT vcall_offset,
6821 tree function)
6822 {
6823 /* The this pointer is always in x0. Note that this differs from
6824 Arm where the this pointer may be bumped to r1 if r0 is required
6825 to return a pointer to an aggregate. On AArch64 a result value
6826 pointer will be in x8. */
6827 int this_regno = R0_REGNUM;
6828 rtx this_rtx, temp0, temp1, addr, funexp;
6829 rtx_insn *insn;
6830 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6831
6832 if (aarch64_bti_enabled ())
6833 emit_insn (gen_bti_c());
6834
6835 reload_completed = 1;
6836 emit_note (NOTE_INSN_PROLOGUE_END);
6837
6838 this_rtx = gen_rtx_REG (Pmode, this_regno);
6839 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6840 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6841
6842 if (vcall_offset == 0)
6843 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6844 else
6845 {
6846 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6847
6848 addr = this_rtx;
6849 if (delta != 0)
6850 {
6851 if (delta >= -256 && delta < 256)
6852 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6853 plus_constant (Pmode, this_rtx, delta));
6854 else
6855 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6856 temp1, temp0, false);
6857 }
6858
6859 if (Pmode == ptr_mode)
6860 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6861 else
6862 aarch64_emit_move (temp0,
6863 gen_rtx_ZERO_EXTEND (Pmode,
6864 gen_rtx_MEM (ptr_mode, addr)));
6865
6866 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6867 addr = plus_constant (Pmode, temp0, vcall_offset);
6868 else
6869 {
6870 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6871 Pmode);
6872 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6873 }
6874
6875 if (Pmode == ptr_mode)
6876 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6877 else
6878 aarch64_emit_move (temp1,
6879 gen_rtx_SIGN_EXTEND (Pmode,
6880 gen_rtx_MEM (ptr_mode, addr)));
6881
6882 emit_insn (gen_add2_insn (this_rtx, temp1));
6883 }
6884
6885 /* Generate a tail call to the target function. */
6886 if (!TREE_USED (function))
6887 {
6888 assemble_external (function);
6889 TREE_USED (function) = 1;
6890 }
6891 funexp = XEXP (DECL_RTL (function), 0);
6892 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6893 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6894 SIBLING_CALL_P (insn) = 1;
6895
6896 insn = get_insns ();
6897 shorten_branches (insn);
6898
6899 assemble_start_function (thunk, fnname);
6900 final_start_function (insn, file, 1);
6901 final (insn, file, 1);
6902 final_end_function ();
6903 assemble_end_function (thunk, fnname);
6904
6905 /* Stop pretending to be a post-reload pass. */
6906 reload_completed = 0;
6907 }
6908
6909 static bool
6910 aarch64_tls_referenced_p (rtx x)
6911 {
6912 if (!TARGET_HAVE_TLS)
6913 return false;
6914 subrtx_iterator::array_type array;
6915 FOR_EACH_SUBRTX (iter, array, x, ALL)
6916 {
6917 const_rtx x = *iter;
6918 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6919 return true;
6920 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6921 TLS offsets, not real symbol references. */
6922 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6923 iter.skip_subrtxes ();
6924 }
6925 return false;
6926 }
6927
6928
6929 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6930 a left shift of 0 or 12 bits. */
6931 bool
6932 aarch64_uimm12_shift (HOST_WIDE_INT val)
6933 {
6934 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6935 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6936 );
6937 }
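
/* For example (illustrative values):
     aarch64_uimm12_shift (0xabc)    -> true   (fits in the low 12 bits)
     aarch64_uimm12_shift (0xabc000) -> true   (a 12-bit value shifted left by 12)
     aarch64_uimm12_shift (0x1001)   -> false  (needs bits from both halves)  */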
6938
6939 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6940 that can be created with a left shift of 0 or 12. */
6941 static HOST_WIDE_INT
6942 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6943 {
6944 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6945 handle correctly. */
6946 gcc_assert ((val & 0xffffff) == val);
6947
6948 if (((val & 0xfff) << 0) == val)
6949 return val;
6950
6951 return val & (0xfff << 12);
6952 }
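
/* Illustrative example: 0x123456 does not fit in the low 12 bits, so the
   function above returns 0x123456 & (0xfff << 12) == 0x123000, which a single
   add/sub immediate can handle; the remaining 0x456 must be added separately.  */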
6953
6954 /* Return true if val is an immediate that can be loaded into a
6955 register by a MOVZ instruction. */
6956 static bool
6957 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6958 {
6959 if (GET_MODE_SIZE (mode) > 4)
6960 {
6961 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6962 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6963 return true;
6964 }
6965 else
6966 {
6967 /* Ignore sign extension. */
6968 val &= (HOST_WIDE_INT) 0xffffffff;
6969 }
6970 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6971 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6972 }
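
/* For example (illustrative DImode values):
     0x12340000         -> true   (MOVZ xN, #0x1234, lsl #16)
     0x0000123400000000 -> true   (MOVZ xN, #0x1234, lsl #32)
     0x12345678         -> false  (would also need a MOVK)
   Only the MOVZ forms are tested here; the MOVN case is handled by callers
   passing ~val, as in aarch64_move_imm below.  */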
6973
6974 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6975 64-bit (DImode) integer. */
6976
6977 static unsigned HOST_WIDE_INT
6978 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6979 {
6980 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6981 while (size < 64)
6982 {
6983 val &= (HOST_WIDE_INT_1U << size) - 1;
6984 val |= val << size;
6985 size *= 2;
6986 }
6987 return val;
6988 }
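
/* For example (illustrative): replicating the QImode value 0xa5 gives
   0xa5a5a5a5a5a5a5a5, and replicating the HImode value 0x1234 gives
   0x1234123412341234.  */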
6989
6990 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6991
6992 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6993 {
6994 0x0000000100000001ull,
6995 0x0001000100010001ull,
6996 0x0101010101010101ull,
6997 0x1111111111111111ull,
6998 0x5555555555555555ull,
6999 };
7000
7001
7002 /* Return true if val is a valid bitmask immediate. */
7003
7004 bool
7005 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7006 {
7007 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7008 int bits;
7009
7010 /* Check for a single sequence of one bits and return quickly if so.
7011 The special cases of all ones and all zeroes return false. */
7012 val = aarch64_replicate_bitmask_imm (val_in, mode);
7013 tmp = val + (val & -val);
7014
7015 if (tmp == (tmp & -tmp))
7016 return (val + 1) > 1;
7017
7018 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7019 if (mode == SImode)
7020 val = (val << 32) | (val & 0xffffffff);
7021
7022 /* Invert if the immediate doesn't start with a zero bit - this means we
7023 only need to search for sequences of one bits. */
7024 if (val & 1)
7025 val = ~val;
7026
7027 /* Find the first set bit and set tmp to val with the first sequence of one
7028 bits removed. Return success if there is a single sequence of ones. */
7029 first_one = val & -val;
7030 tmp = val & (val + first_one);
7031
7032 if (tmp == 0)
7033 return true;
7034
7035 /* Find the next set bit and compute the difference in bit position. */
7036 next_one = tmp & -tmp;
7037 bits = clz_hwi (first_one) - clz_hwi (next_one);
7038 mask = val ^ tmp;
7039
7040 /* Check the bit position difference is a power of 2, and that the first
7041 sequence of one bits fits within 'bits' bits. */
7042 if ((mask >> bits) != 0 || bits != (bits & -bits))
7043 return false;
7044
7045 /* Check the sequence of one bits is repeated 64/bits times. */
7046 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7047 }
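
/* Worked example (illustrative, DImode): 0x00ff00ff00ff00ff is not a single
   run of ones, and since bit 0 is set it is inverted to 0xff00ff00ff00ff00.
   The first run of ones is 8 bits wide and the next run starts 16 bits later,
   so bits == 16 and mask == 0xff00; the final check
   0xff00 * 0x0001000100010001 == 0xff00ff00ff00ff00 succeeds, so the value is
   accepted as a bitmask immediate.  */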
7048
7049 /* Create a mask of ones covering the range from the lowest to the highest
7050 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
7051
7052 unsigned HOST_WIDE_INT
7053 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7054 {
7055 int lowest_bit_set = ctz_hwi (val_in);
7056 int highest_bit_set = floor_log2 (val_in);
7057 gcc_assert (val_in != 0);
7058
7059 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7060 (HOST_WIDE_INT_1U << lowest_bit_set));
7061 }
7062
7063 /* Create a constant in which all bits outside the range from the lowest set
7064 bit to the highest set bit of VAL_IN are set to 1. */
7065
7066 unsigned HOST_WIDE_INT
7067 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7068 {
7069 return val_in | ~aarch64_and_split_imm1 (val_in);
7070 }
7071
7072 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7073
7074 bool
7075 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7076 {
7077 scalar_int_mode int_mode;
7078 if (!is_a <scalar_int_mode> (mode, &int_mode))
7079 return false;
7080
7081 if (aarch64_bitmask_imm (val_in, int_mode))
7082 return false;
7083
7084 if (aarch64_move_imm (val_in, int_mode))
7085 return false;
7086
7087 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7088
7089 return aarch64_bitmask_imm (imm2, int_mode);
7090 }
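
/* Illustrative example: 0x000ff00000003000 is neither a bitmask nor a MOV
   immediate, but aarch64_and_split_imm1 gives 0x000ffffffffff000 (ones from
   bit 12 to bit 51) and aarch64_and_split_imm2 gives 0xfffff00000003fff (a
   rotated run of ones), both valid bitmask immediates whose intersection is
   the original value, so "x & 0x000ff00000003000" can be expanded as two
   AND-immediate instructions.  */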
7091
7092 /* Return true if val is an immediate that can be loaded into a
7093 register in a single instruction. */
7094 bool
7095 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7096 {
7097 scalar_int_mode int_mode;
7098 if (!is_a <scalar_int_mode> (mode, &int_mode))
7099 return false;
7100
7101 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7102 return true;
7103 return aarch64_bitmask_imm (val, int_mode);
7104 }
7105
7106 static bool
7107 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7108 {
7109 rtx base, offset;
7110
7111 if (GET_CODE (x) == HIGH)
7112 return true;
7113
7114 /* There's no way to calculate VL-based values using relocations. */
7115 subrtx_iterator::array_type array;
7116 FOR_EACH_SUBRTX (iter, array, x, ALL)
7117 if (GET_CODE (*iter) == CONST_POLY_INT)
7118 return true;
7119
7120 split_const (x, &base, &offset);
7121 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7122 {
7123 if (aarch64_classify_symbol (base, INTVAL (offset))
7124 != SYMBOL_FORCE_TO_MEM)
7125 return true;
7126 else
7127 /* Avoid generating a 64-bit relocation in ILP32; leave
7128 to aarch64_expand_mov_immediate to handle it properly. */
7129 return mode != ptr_mode;
7130 }
7131
7132 return aarch64_tls_referenced_p (x);
7133 }
7134
7135 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7136 The expansion for a table switch is quite expensive due to the number
7137 of instructions, the table lookup and the hard-to-predict indirect jump.
7138 When optimizing for speed with -O3 enabled, use the per-core tuning if
7139 set; otherwise use tables for more than 16 cases as a tradeoff between
7140 size and performance. When optimizing for size, use the default setting. */
7141
7142 static unsigned int
7143 aarch64_case_values_threshold (void)
7144 {
7145 /* Use the specified limit for the number of cases before using jump
7146 tables at higher optimization levels. */
7147 if (optimize > 2
7148 && selected_cpu->tune->max_case_values != 0)
7149 return selected_cpu->tune->max_case_values;
7150 else
7151 return optimize_size ? default_case_values_threshold () : 17;
7152 }
7153
7154 /* Return true if register REGNO is a valid index register.
7155 STRICT_P is true if REG_OK_STRICT is in effect. */
7156
7157 bool
7158 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7159 {
7160 if (!HARD_REGISTER_NUM_P (regno))
7161 {
7162 if (!strict_p)
7163 return true;
7164
7165 if (!reg_renumber)
7166 return false;
7167
7168 regno = reg_renumber[regno];
7169 }
7170 return GP_REGNUM_P (regno);
7171 }
7172
7173 /* Return true if register REGNO is a valid base register.
7174 STRICT_P is true if REG_OK_STRICT is in effect. */
7175
7176 bool
7177 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7178 {
7179 if (!HARD_REGISTER_NUM_P (regno))
7180 {
7181 if (!strict_p)
7182 return true;
7183
7184 if (!reg_renumber)
7185 return false;
7186
7187 regno = reg_renumber[regno];
7188 }
7189
7190 /* The fake registers will be eliminated to either the stack or
7191 hard frame pointer, both of which are usually valid base registers.
7192 Reload deals with the cases where the eliminated form isn't valid. */
7193 return (GP_REGNUM_P (regno)
7194 || regno == SP_REGNUM
7195 || regno == FRAME_POINTER_REGNUM
7196 || regno == ARG_POINTER_REGNUM);
7197 }
7198
7199 /* Return true if X is a valid base register.
7200 STRICT_P is true if REG_OK_STRICT is in effect. */
7201
7202 static bool
7203 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7204 {
7205 if (!strict_p
7206 && GET_CODE (x) == SUBREG
7207 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7208 x = SUBREG_REG (x);
7209
7210 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7211 }
7212
7213 /* Return true if address offset is a valid index. If it is, fill in INFO
7214 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7215
7216 static bool
7217 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7218 machine_mode mode, bool strict_p)
7219 {
7220 enum aarch64_address_type type;
7221 rtx index;
7222 int shift;
7223
7224 /* (reg:P) */
7225 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7226 && GET_MODE (x) == Pmode)
7227 {
7228 type = ADDRESS_REG_REG;
7229 index = x;
7230 shift = 0;
7231 }
7232 /* (sign_extend:DI (reg:SI)) */
7233 else if ((GET_CODE (x) == SIGN_EXTEND
7234 || GET_CODE (x) == ZERO_EXTEND)
7235 && GET_MODE (x) == DImode
7236 && GET_MODE (XEXP (x, 0)) == SImode)
7237 {
7238 type = (GET_CODE (x) == SIGN_EXTEND)
7239 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7240 index = XEXP (x, 0);
7241 shift = 0;
7242 }
7243 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7244 else if (GET_CODE (x) == MULT
7245 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7246 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7247 && GET_MODE (XEXP (x, 0)) == DImode
7248 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7249 && CONST_INT_P (XEXP (x, 1)))
7250 {
7251 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7252 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7253 index = XEXP (XEXP (x, 0), 0);
7254 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7255 }
7256 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7257 else if (GET_CODE (x) == ASHIFT
7258 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7259 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7260 && GET_MODE (XEXP (x, 0)) == DImode
7261 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7262 && CONST_INT_P (XEXP (x, 1)))
7263 {
7264 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7265 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7266 index = XEXP (XEXP (x, 0), 0);
7267 shift = INTVAL (XEXP (x, 1));
7268 }
7269 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7270 else if ((GET_CODE (x) == SIGN_EXTRACT
7271 || GET_CODE (x) == ZERO_EXTRACT)
7272 && GET_MODE (x) == DImode
7273 && GET_CODE (XEXP (x, 0)) == MULT
7274 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7275 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7276 {
7277 type = (GET_CODE (x) == SIGN_EXTRACT)
7278 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7279 index = XEXP (XEXP (x, 0), 0);
7280 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7281 if (INTVAL (XEXP (x, 1)) != 32 + shift
7282 || INTVAL (XEXP (x, 2)) != 0)
7283 shift = -1;
7284 }
7285 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7286 (const_int 0xffffffff<<shift)) */
7287 else if (GET_CODE (x) == AND
7288 && GET_MODE (x) == DImode
7289 && GET_CODE (XEXP (x, 0)) == MULT
7290 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7291 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7292 && CONST_INT_P (XEXP (x, 1)))
7293 {
7294 type = ADDRESS_REG_UXTW;
7295 index = XEXP (XEXP (x, 0), 0);
7296 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7297 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7298 shift = -1;
7299 }
7300 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7301 else if ((GET_CODE (x) == SIGN_EXTRACT
7302 || GET_CODE (x) == ZERO_EXTRACT)
7303 && GET_MODE (x) == DImode
7304 && GET_CODE (XEXP (x, 0)) == ASHIFT
7305 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7306 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7307 {
7308 type = (GET_CODE (x) == SIGN_EXTRACT)
7309 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7310 index = XEXP (XEXP (x, 0), 0);
7311 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7312 if (INTVAL (XEXP (x, 1)) != 32 + shift
7313 || INTVAL (XEXP (x, 2)) != 0)
7314 shift = -1;
7315 }
7316 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7317 (const_int 0xffffffff<<shift)) */
7318 else if (GET_CODE (x) == AND
7319 && GET_MODE (x) == DImode
7320 && GET_CODE (XEXP (x, 0)) == ASHIFT
7321 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7322 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7323 && CONST_INT_P (XEXP (x, 1)))
7324 {
7325 type = ADDRESS_REG_UXTW;
7326 index = XEXP (XEXP (x, 0), 0);
7327 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7328 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7329 shift = -1;
7330 }
7331 /* (mult:P (reg:P) (const_int scale)) */
7332 else if (GET_CODE (x) == MULT
7333 && GET_MODE (x) == Pmode
7334 && GET_MODE (XEXP (x, 0)) == Pmode
7335 && CONST_INT_P (XEXP (x, 1)))
7336 {
7337 type = ADDRESS_REG_REG;
7338 index = XEXP (x, 0);
7339 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7340 }
7341 /* (ashift:P (reg:P) (const_int shift)) */
7342 else if (GET_CODE (x) == ASHIFT
7343 && GET_MODE (x) == Pmode
7344 && GET_MODE (XEXP (x, 0)) == Pmode
7345 && CONST_INT_P (XEXP (x, 1)))
7346 {
7347 type = ADDRESS_REG_REG;
7348 index = XEXP (x, 0);
7349 shift = INTVAL (XEXP (x, 1));
7350 }
7351 else
7352 return false;
7353
7354 if (!strict_p
7355 && GET_CODE (index) == SUBREG
7356 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7357 index = SUBREG_REG (index);
7358
7359 if (aarch64_sve_data_mode_p (mode))
7360 {
7361 if (type != ADDRESS_REG_REG
7362 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7363 return false;
7364 }
7365 else
7366 {
7367 if (shift != 0
7368 && !(IN_RANGE (shift, 1, 3)
7369 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7370 return false;
7371 }
7372
7373 if (REG_P (index)
7374 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7375 {
7376 info->type = type;
7377 info->offset = index;
7378 info->shift = shift;
7379 return true;
7380 }
7381
7382 return false;
7383 }
7384
7385 /* Return true if MODE is one of the modes for which we
7386 support LDP/STP operations. */
7387
7388 static bool
7389 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7390 {
7391 return mode == SImode || mode == DImode
7392 || mode == SFmode || mode == DFmode
7393 || (aarch64_vector_mode_supported_p (mode)
7394 && (known_eq (GET_MODE_SIZE (mode), 8)
7395 || (known_eq (GET_MODE_SIZE (mode), 16)
7396 && (aarch64_tune_params.extra_tuning_flags
7397 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7398 }
7399
7400 /* Return true if REGNO is a virtual pointer register, or an eliminable
7401 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7402 include stack_pointer or hard_frame_pointer. */
7403 static bool
7404 virt_or_elim_regno_p (unsigned regno)
7405 {
7406 return ((regno >= FIRST_VIRTUAL_REGISTER
7407 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7408 || regno == FRAME_POINTER_REGNUM
7409 || regno == ARG_POINTER_REGNUM);
7410 }
7411
7412 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7413 If it is, fill in INFO appropriately. STRICT_P is true if
7414 REG_OK_STRICT is in effect. */
7415
7416 bool
7417 aarch64_classify_address (struct aarch64_address_info *info,
7418 rtx x, machine_mode mode, bool strict_p,
7419 aarch64_addr_query_type type)
7420 {
7421 enum rtx_code code = GET_CODE (x);
7422 rtx op0, op1;
7423 poly_int64 offset;
7424
7425 HOST_WIDE_INT const_size;
7426
7427 /* On BE, we use load/store pair for all large int mode load/stores.
7428 TI/TFmode may also use a load/store pair. */
7429 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7430 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7431 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7432 || type == ADDR_QUERY_LDP_STP_N
7433 || mode == TImode
7434 || mode == TFmode
7435 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7436
7437 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
7438 to the actual size of the memory being loaded/stored and the mode used
7439 when classifying the address is half that size. */
7440 if (type == ADDR_QUERY_LDP_STP_N
7441 && known_eq (GET_MODE_SIZE (mode), 16))
7442 mode = DFmode;
7443
7444 bool allow_reg_index_p = (!load_store_pair_p
7445 && (known_lt (GET_MODE_SIZE (mode), 16)
7446 || vec_flags == VEC_ADVSIMD
7447 || vec_flags & VEC_SVE_DATA));
7448
7449 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7450 [Rn, #offset, MUL VL]. */
7451 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7452 && (code != REG && code != PLUS))
7453 return false;
7454
7455 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7456 REG addressing. */
7457 if (advsimd_struct_p
7458 && !BYTES_BIG_ENDIAN
7459 && (code != POST_INC && code != REG))
7460 return false;
7461
7462 gcc_checking_assert (GET_MODE (x) == VOIDmode
7463 || SCALAR_INT_MODE_P (GET_MODE (x)));
7464
7465 switch (code)
7466 {
7467 case REG:
7468 case SUBREG:
7469 info->type = ADDRESS_REG_IMM;
7470 info->base = x;
7471 info->offset = const0_rtx;
7472 info->const_offset = 0;
7473 return aarch64_base_register_rtx_p (x, strict_p);
7474
7475 case PLUS:
7476 op0 = XEXP (x, 0);
7477 op1 = XEXP (x, 1);
7478
7479 if (! strict_p
7480 && REG_P (op0)
7481 && virt_or_elim_regno_p (REGNO (op0))
7482 && poly_int_rtx_p (op1, &offset))
7483 {
7484 info->type = ADDRESS_REG_IMM;
7485 info->base = op0;
7486 info->offset = op1;
7487 info->const_offset = offset;
7488
7489 return true;
7490 }
7491
7492 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7493 && aarch64_base_register_rtx_p (op0, strict_p)
7494 && poly_int_rtx_p (op1, &offset))
7495 {
7496 info->type = ADDRESS_REG_IMM;
7497 info->base = op0;
7498 info->offset = op1;
7499 info->const_offset = offset;
7500
7501 /* TImode and TFmode values are allowed in both pairs of X
7502 registers and individual Q registers. The available
7503 address modes are:
7504 X,X: 7-bit signed scaled offset
7505 Q: 9-bit signed offset
7506 We conservatively require an offset representable in either mode.
7507 When performing the check for pairs of X registers i.e. LDP/STP
7508 pass down DImode since that is the natural size of the LDP/STP
7509 instruction memory accesses. */
7510 if (mode == TImode || mode == TFmode)
7511 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7512 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7513 || offset_12bit_unsigned_scaled_p (mode, offset)));
7514
7515 /* A 7-bit offset check because OImode will emit an ldp/stp
7516 instruction (only big endian will get here).
7517 For ldp/stp instructions, the offset is scaled for the size of a
7518 single element of the pair. */
7519 if (mode == OImode)
7520 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7521
7522 /* Three 9/12-bit offset checks because CImode will emit three
7523 ldr/str instructions (only big endian will get here). */
7524 if (mode == CImode)
7525 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7526 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7527 offset + 32)
7528 || offset_12bit_unsigned_scaled_p (V16QImode,
7529 offset + 32)));
7530
7531 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7532 instructions (only big endian will get here). */
7533 if (mode == XImode)
7534 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7535 && aarch64_offset_7bit_signed_scaled_p (TImode,
7536 offset + 32));
7537
7538 /* Make "m" use the LD1 offset range for SVE data modes, so
7539 that pre-RTL optimizers like ivopts will work to that
7540 instead of the wider LDR/STR range. */
7541 if (vec_flags == VEC_SVE_DATA)
7542 return (type == ADDR_QUERY_M
7543 ? offset_4bit_signed_scaled_p (mode, offset)
7544 : offset_9bit_signed_scaled_p (mode, offset));
7545
7546 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7547 {
7548 poly_int64 end_offset = (offset
7549 + GET_MODE_SIZE (mode)
7550 - BYTES_PER_SVE_VECTOR);
7551 return (type == ADDR_QUERY_M
7552 ? offset_4bit_signed_scaled_p (mode, offset)
7553 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7554 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7555 end_offset)));
7556 }
7557
7558 if (vec_flags == VEC_SVE_PRED)
7559 return offset_9bit_signed_scaled_p (mode, offset);
7560
7561 if (load_store_pair_p)
7562 return ((known_eq (GET_MODE_SIZE (mode), 4)
7563 || known_eq (GET_MODE_SIZE (mode), 8)
7564 || known_eq (GET_MODE_SIZE (mode), 16))
7565 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7566 else
7567 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7568 || offset_12bit_unsigned_scaled_p (mode, offset));
7569 }
7570
7571 if (allow_reg_index_p)
7572 {
7573 /* Look for base + (scaled/extended) index register. */
7574 if (aarch64_base_register_rtx_p (op0, strict_p)
7575 && aarch64_classify_index (info, op1, mode, strict_p))
7576 {
7577 info->base = op0;
7578 return true;
7579 }
7580 if (aarch64_base_register_rtx_p (op1, strict_p)
7581 && aarch64_classify_index (info, op0, mode, strict_p))
7582 {
7583 info->base = op1;
7584 return true;
7585 }
7586 }
7587
7588 return false;
7589
7590 case POST_INC:
7591 case POST_DEC:
7592 case PRE_INC:
7593 case PRE_DEC:
7594 info->type = ADDRESS_REG_WB;
7595 info->base = XEXP (x, 0);
7596 info->offset = NULL_RTX;
7597 return aarch64_base_register_rtx_p (info->base, strict_p);
7598
7599 case POST_MODIFY:
7600 case PRE_MODIFY:
7601 info->type = ADDRESS_REG_WB;
7602 info->base = XEXP (x, 0);
7603 if (GET_CODE (XEXP (x, 1)) == PLUS
7604 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7605 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7606 && aarch64_base_register_rtx_p (info->base, strict_p))
7607 {
7608 info->offset = XEXP (XEXP (x, 1), 1);
7609 info->const_offset = offset;
7610
7611 /* TImode and TFmode values are allowed in both pairs of X
7612 registers and individual Q registers. The available
7613 address modes are:
7614 X,X: 7-bit signed scaled offset
7615 Q: 9-bit signed offset
7616 We conservatively require an offset representable in either mode.
7617 */
7618 if (mode == TImode || mode == TFmode)
7619 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7620 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7621
7622 if (load_store_pair_p)
7623 return ((known_eq (GET_MODE_SIZE (mode), 4)
7624 || known_eq (GET_MODE_SIZE (mode), 8)
7625 || known_eq (GET_MODE_SIZE (mode), 16))
7626 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7627 else
7628 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7629 }
7630 return false;
7631
7632 case CONST:
7633 case SYMBOL_REF:
7634 case LABEL_REF:
7635 /* load literal: pc-relative constant pool entry. Only supported
7636 for SI mode or larger. */
7637 info->type = ADDRESS_SYMBOLIC;
7638
7639 if (!load_store_pair_p
7640 && GET_MODE_SIZE (mode).is_constant (&const_size)
7641 && const_size >= 4)
7642 {
7643 rtx sym, addend;
7644
7645 split_const (x, &sym, &addend);
7646 return ((GET_CODE (sym) == LABEL_REF
7647 || (GET_CODE (sym) == SYMBOL_REF
7648 && CONSTANT_POOL_ADDRESS_P (sym)
7649 && aarch64_pcrelative_literal_loads)));
7650 }
7651 return false;
7652
7653 case LO_SUM:
7654 info->type = ADDRESS_LO_SUM;
7655 info->base = XEXP (x, 0);
7656 info->offset = XEXP (x, 1);
7657 if (allow_reg_index_p
7658 && aarch64_base_register_rtx_p (info->base, strict_p))
7659 {
7660 rtx sym, offs;
7661 split_const (info->offset, &sym, &offs);
7662 if (GET_CODE (sym) == SYMBOL_REF
7663 && (aarch64_classify_symbol (sym, INTVAL (offs))
7664 == SYMBOL_SMALL_ABSOLUTE))
7665 {
7666 /* The symbol and offset must be aligned to the access size. */
7667 unsigned int align;
7668
7669 if (CONSTANT_POOL_ADDRESS_P (sym))
7670 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7671 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7672 {
7673 tree exp = SYMBOL_REF_DECL (sym);
7674 align = TYPE_ALIGN (TREE_TYPE (exp));
7675 align = aarch64_constant_alignment (exp, align);
7676 }
7677 else if (SYMBOL_REF_DECL (sym))
7678 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7679 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7680 && SYMBOL_REF_BLOCK (sym) != NULL)
7681 align = SYMBOL_REF_BLOCK (sym)->alignment;
7682 else
7683 align = BITS_PER_UNIT;
7684
7685 poly_int64 ref_size = GET_MODE_SIZE (mode);
7686 if (known_eq (ref_size, 0))
7687 ref_size = GET_MODE_SIZE (DImode);
7688
7689 return (multiple_p (INTVAL (offs), ref_size)
7690 && multiple_p (align / BITS_PER_UNIT, ref_size));
7691 }
7692 }
7693 return false;
7694
7695 default:
7696 return false;
7697 }
7698 }
7699
7700 /* Return true if the address X is valid for a PRFM instruction.
7701 STRICT_P is true if we should do strict checking with
7702 aarch64_classify_address. */
7703
7704 bool
7705 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7706 {
7707 struct aarch64_address_info addr;
7708
7709 /* PRFM accepts the same addresses as DImode... */
7710 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7711 if (!res)
7712 return false;
7713
7714 /* ... except writeback forms. */
7715 return addr.type != ADDRESS_REG_WB;
7716 }
7717
7718 bool
7719 aarch64_symbolic_address_p (rtx x)
7720 {
7721 rtx offset;
7722
7723 split_const (x, &x, &offset);
7724 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7725 }
7726
7727 /* Classify the base of symbolic expression X. */
7728
7729 enum aarch64_symbol_type
7730 aarch64_classify_symbolic_expression (rtx x)
7731 {
7732 rtx offset;
7733
7734 split_const (x, &x, &offset);
7735 return aarch64_classify_symbol (x, INTVAL (offset));
7736 }
7737
7738
7739 /* Return TRUE if X is a legitimate address for accessing memory in
7740 mode MODE. */
7741 static bool
7742 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7743 {
7744 struct aarch64_address_info addr;
7745
7746 return aarch64_classify_address (&addr, x, mode, strict_p);
7747 }
7748
7749 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7750 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7751 bool
7752 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7753 aarch64_addr_query_type type)
7754 {
7755 struct aarch64_address_info addr;
7756
7757 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7758 }
7759
7760 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7761
7762 static bool
7763 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7764 poly_int64 orig_offset,
7765 machine_mode mode)
7766 {
7767 HOST_WIDE_INT size;
7768 if (GET_MODE_SIZE (mode).is_constant (&size))
7769 {
7770 HOST_WIDE_INT const_offset, second_offset;
7771
7772 /* A general SVE offset is A * VQ + B. Remove the A component from
7773 coefficient 0 in order to get the constant B. */
7774 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7775
7776 /* Split an out-of-range address displacement into a base and
7777 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
7778 range otherwise, to increase opportunities for sharing the base
7779 address between accesses of different sizes. Unaligned accesses
7780 use the signed 9-bit range; TImode/TFmode use the intersection of
7781 the signed scaled 7-bit and signed 9-bit offset ranges. */
7782 if (mode == TImode || mode == TFmode)
7783 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7784 else if ((const_offset & (size - 1)) != 0)
7785 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7786 else
7787 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7788
7789 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7790 return false;
7791
7792 /* Split the offset into second_offset and the rest. */
7793 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7794 *offset2 = gen_int_mode (second_offset, Pmode);
7795 return true;
7796 }
7797 else
7798 {
7799 /* Get the mode we should use as the basis of the range. For structure
7800 modes this is the mode of one vector. */
7801 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7802 machine_mode step_mode
7803 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7804
7805 /* Get the "mul vl" multiplier we'd like to use. */
7806 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7807 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7808 if (vec_flags & VEC_SVE_DATA)
7809 /* LDR supports a 9-bit range, but the move patterns for
7810 structure modes require all vectors to be in range of the
7811 same base. The simplest way of accommodating that while still
7812 promoting reuse of anchor points between different modes is
7813 to use an 8-bit range unconditionally. */
7814 vnum = ((vnum + 128) & 255) - 128;
7815 else
7816 /* Predicates are only handled singly, so we might as well use
7817 the full range. */
7818 vnum = ((vnum + 256) & 511) - 256;
7819 if (vnum == 0)
7820 return false;
7821
7822 /* Convert the "mul vl" multiplier into a byte offset. */
7823 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7824 if (known_eq (second_offset, orig_offset))
7825 return false;
7826
7827 /* Split the offset into second_offset and the rest. */
7828 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7829 *offset2 = gen_int_mode (second_offset, Pmode);
7830 return true;
7831 }
7832 }
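
/* Illustrative example for the constant-size path above: for an aligned
   SImode access at offset 0x10010, second_offset becomes
   0x10010 & 0x3ffc == 0x10, so the displacement is split into an anchor of
   0x10000 (*OFFSET1) plus an in-range offset of 0x10 (*OFFSET2), letting the
   anchor be shared with neighbouring accesses.  */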
7833
7834 /* Return the binary representation of floating point constant VALUE in INTVAL.
7835 If the value cannot be converted, return false without setting INTVAL.
7836 The conversion is done in the mode of VALUE. */
7837 bool
7838 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7839 {
7840
7841 /* We make a general exception for 0. */
7842 if (aarch64_float_const_zero_rtx_p (value))
7843 {
7844 *intval = 0;
7845 return true;
7846 }
7847
7848 scalar_float_mode mode;
7849 if (GET_CODE (value) != CONST_DOUBLE
7850 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7851 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7852 /* Only support up to DF mode. */
7853 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7854 return false;
7855
7856 unsigned HOST_WIDE_INT ival = 0;
7857
7858 long res[2];
7859 real_to_target (res,
7860 CONST_DOUBLE_REAL_VALUE (value),
7861 REAL_MODE_FORMAT (mode));
7862
7863 if (mode == DFmode)
7864 {
7865 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7866 ival = zext_hwi (res[order], 32);
7867 ival |= (zext_hwi (res[1 - order], 32) << 32);
7868 }
7869 else
7870 ival = zext_hwi (res[0], 32);
7871
7872 *intval = ival;
7873 return true;
7874 }
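
/* For example (illustrative): the DFmode constant 1.0 is returned as
   0x3ff0000000000000 and the SFmode constant 1.0 as 0x3f800000.  */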
7875
7876 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7877 single MOV(+MOVK) followed by an FMOV. */
7878 bool
7879 aarch64_float_const_rtx_p (rtx x)
7880 {
7881 machine_mode mode = GET_MODE (x);
7882 if (mode == VOIDmode)
7883 return false;
7884
7885 /* Determine whether it's cheaper to write float constants as
7886 mov/movk pairs rather than ldr/adrp pairs. */
7887 unsigned HOST_WIDE_INT ival;
7888
7889 if (GET_CODE (x) == CONST_DOUBLE
7890 && SCALAR_FLOAT_MODE_P (mode)
7891 && aarch64_reinterpret_float_as_int (x, &ival))
7892 {
7893 scalar_int_mode imode = (mode == HFmode
7894 ? SImode
7895 : int_mode_for_mode (mode).require ());
7896 int num_instr = aarch64_internal_mov_immediate
7897 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7898 return num_instr < 3;
7899 }
7900
7901 return false;
7902 }
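
/* Illustrative example: DFmode 1.0 has the bit pattern 0x3ff0000000000000,
   which a single MOVZ can build, so the function above returns true and the
   constant can be materialized as (register choice arbitrary)
     movz x0, #0x3ff0, lsl #48
     fmov d0, x0
   rather than an adrp/ldr pair loading it from the constant pool.  */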
7903
7904 /* Return TRUE if rtx X is the immediate constant 0.0. */
7905 bool
7906 aarch64_float_const_zero_rtx_p (rtx x)
7907 {
7908 if (GET_MODE (x) == VOIDmode)
7909 return false;
7910
7911 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7912 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7913 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7914 }
7915
7916 /* Return TRUE if rtx X is an immediate constant that fits in a single
7917 MOVI immediate operation. */
7918 bool
7919 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7920 {
7921 if (!TARGET_SIMD)
7922 return false;
7923
7924 machine_mode vmode;
7925 scalar_int_mode imode;
7926 unsigned HOST_WIDE_INT ival;
7927
7928 if (GET_CODE (x) == CONST_DOUBLE
7929 && SCALAR_FLOAT_MODE_P (mode))
7930 {
7931 if (!aarch64_reinterpret_float_as_int (x, &ival))
7932 return false;
7933
7934 /* We make a general exception for 0. */
7935 if (aarch64_float_const_zero_rtx_p (x))
7936 return true;
7937
7938 imode = int_mode_for_mode (mode).require ();
7939 }
7940 else if (GET_CODE (x) == CONST_INT
7941 && is_a <scalar_int_mode> (mode, &imode))
7942 ival = INTVAL (x);
7943 else
7944 return false;
7945
7946 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
7947 a 128-bit vector mode. */
7948 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7949
7950 vmode = aarch64_simd_container_mode (imode, width);
7951 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7952
7953 return aarch64_simd_valid_immediate (v_op, NULL);
7954 }
7955
7956
7957 /* Return the fixed registers used for condition codes. */
7958
7959 static bool
7960 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7961 {
7962 *p1 = CC_REGNUM;
7963 *p2 = INVALID_REGNUM;
7964 return true;
7965 }
7966
7967 /* This function is used by the call expanders of the machine description.
7968 RESULT is the register in which the result is returned. It's NULL for
7969 "call" and "sibcall".
7970 MEM is the location of the function call.
7971 SIBCALL indicates whether this is a normal call or a sibling call;
7972 a different pattern is generated accordingly. */
7973
7974 void
7975 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7976 {
7977 rtx call, callee, tmp;
7978 rtvec vec;
7979 machine_mode mode;
7980
7981 gcc_assert (MEM_P (mem));
7982 callee = XEXP (mem, 0);
7983 mode = GET_MODE (callee);
7984 gcc_assert (mode == Pmode);
7985
7986 /* Decide if we should generate indirect calls by loading the
7987 address of the callee into a register before performing
7988 the branch-and-link. */
7989 if (SYMBOL_REF_P (callee)
7990 ? (aarch64_is_long_call_p (callee)
7991 || aarch64_is_noplt_call_p (callee))
7992 : !REG_P (callee))
7993 XEXP (mem, 0) = force_reg (mode, callee);
7994
7995 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7996
7997 if (result != NULL_RTX)
7998 call = gen_rtx_SET (result, call);
7999
8000 if (sibcall)
8001 tmp = ret_rtx;
8002 else
8003 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8004
8005 vec = gen_rtvec (2, call, tmp);
8006 call = gen_rtx_PARALLEL (VOIDmode, vec);
8007
8008 aarch64_emit_call_insn (call);
8009 }
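
/* Illustrative RTL (operands elided): a plain call produces
     (parallel [(call (mem:DI <callee>) (const_int 0))
                (clobber (reg:DI LR_REGNUM))])
   a sibling call replaces the clobber of LR with (return), and a
   value-returning call wraps the call in (set (reg) ...).  */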
8010
8011 /* Emit call insn with PAT and do aarch64-specific handling. */
8012
8013 void
8014 aarch64_emit_call_insn (rtx pat)
8015 {
8016 rtx insn = emit_call_insn (pat);
8017
8018 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8019 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8020 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8021 }
8022
8023 machine_mode
8024 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8025 {
8026 machine_mode mode_x = GET_MODE (x);
8027 rtx_code code_x = GET_CODE (x);
8028
8029 /* All floating point compares return CCFP if it is an equality
8030 comparison, and CCFPE otherwise. */
8031 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8032 {
8033 switch (code)
8034 {
8035 case EQ:
8036 case NE:
8037 case UNORDERED:
8038 case ORDERED:
8039 case UNLT:
8040 case UNLE:
8041 case UNGT:
8042 case UNGE:
8043 case UNEQ:
8044 return CCFPmode;
8045
8046 case LT:
8047 case LE:
8048 case GT:
8049 case GE:
8050 case LTGT:
8051 return CCFPEmode;
8052
8053 default:
8054 gcc_unreachable ();
8055 }
8056 }
8057
8058 /* Equality comparisons of short modes against zero can be performed
8059 using the TST instruction with the appropriate bitmask. */
8060 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8061 && (code == EQ || code == NE)
8062 && (mode_x == HImode || mode_x == QImode))
8063 return CC_NZmode;
8064
8065 /* Similarly, comparisons of zero_extends from shorter modes can
8066 be performed using an ANDS with an immediate mask. */
8067 if (y == const0_rtx && code_x == ZERO_EXTEND
8068 && (mode_x == SImode || mode_x == DImode)
8069 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8070 && (code == EQ || code == NE))
8071 return CC_NZmode;
8072
8073 if ((mode_x == SImode || mode_x == DImode)
8074 && y == const0_rtx
8075 && (code == EQ || code == NE || code == LT || code == GE)
8076 && (code_x == PLUS || code_x == MINUS || code_x == AND
8077 || code_x == NEG
8078 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8079 && CONST_INT_P (XEXP (x, 2)))))
8080 return CC_NZmode;
8081
8082 /* A compare with a shifted operand. Because of canonicalization,
8083 the comparison will have to be swapped when we emit the assembly
8084 code. */
8085 if ((mode_x == SImode || mode_x == DImode)
8086 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8087 && (code_x == ASHIFT || code_x == ASHIFTRT
8088 || code_x == LSHIFTRT
8089 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8090 return CC_SWPmode;
8091
8092 /* Similarly for a negated operand, but we can only do this for
8093 equalities. */
8094 if ((mode_x == SImode || mode_x == DImode)
8095 && (REG_P (y) || GET_CODE (y) == SUBREG)
8096 && (code == EQ || code == NE)
8097 && code_x == NEG)
8098 return CC_Zmode;
8099
8100 /* A test for unsigned overflow from an addition. */
8101 if ((mode_x == DImode || mode_x == TImode)
8102 && (code == LTU || code == GEU)
8103 && code_x == PLUS
8104 && rtx_equal_p (XEXP (x, 0), y))
8105 return CC_Cmode;
8106
8107 /* A test for unsigned overflow from an add with carry. */
8108 if ((mode_x == DImode || mode_x == TImode)
8109 && (code == LTU || code == GEU)
8110 && code_x == PLUS
8111 && CONST_SCALAR_INT_P (y)
8112 && (rtx_mode_t (y, mode_x)
8113 == (wi::shwi (1, mode_x)
8114 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8115 return CC_ADCmode;
8116
8117 /* A test for signed overflow. */
8118 if ((mode_x == DImode || mode_x == TImode)
8119 && code == NE
8120 && code_x == PLUS
8121 && GET_CODE (y) == SIGN_EXTEND)
8122 return CC_Vmode;
8123
8124 /* For everything else, return CCmode. */
8125 return CCmode;
8126 }
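
/* Illustrative example: for (compare (ashift x 3) y) the function above
   returns CC_SWPmode, since the shifted operand must be the second source of
   the eventual "cmp y, x, lsl 3" and the condition therefore has to be
   swapped when the assembly is emitted (see the CC_SWPmode mappings in
   aarch64_get_condition_code_1 below).  */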
8127
8128 static int
8129 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8130
8131 int
8132 aarch64_get_condition_code (rtx x)
8133 {
8134 machine_mode mode = GET_MODE (XEXP (x, 0));
8135 enum rtx_code comp_code = GET_CODE (x);
8136
8137 if (GET_MODE_CLASS (mode) != MODE_CC)
8138 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8139 return aarch64_get_condition_code_1 (mode, comp_code);
8140 }
8141
8142 static int
8143 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8144 {
8145 switch (mode)
8146 {
8147 case E_CCFPmode:
8148 case E_CCFPEmode:
8149 switch (comp_code)
8150 {
8151 case GE: return AARCH64_GE;
8152 case GT: return AARCH64_GT;
8153 case LE: return AARCH64_LS;
8154 case LT: return AARCH64_MI;
8155 case NE: return AARCH64_NE;
8156 case EQ: return AARCH64_EQ;
8157 case ORDERED: return AARCH64_VC;
8158 case UNORDERED: return AARCH64_VS;
8159 case UNLT: return AARCH64_LT;
8160 case UNLE: return AARCH64_LE;
8161 case UNGT: return AARCH64_HI;
8162 case UNGE: return AARCH64_PL;
8163 default: return -1;
8164 }
8165 break;
8166
8167 case E_CCmode:
8168 switch (comp_code)
8169 {
8170 case NE: return AARCH64_NE;
8171 case EQ: return AARCH64_EQ;
8172 case GE: return AARCH64_GE;
8173 case GT: return AARCH64_GT;
8174 case LE: return AARCH64_LE;
8175 case LT: return AARCH64_LT;
8176 case GEU: return AARCH64_CS;
8177 case GTU: return AARCH64_HI;
8178 case LEU: return AARCH64_LS;
8179 case LTU: return AARCH64_CC;
8180 default: return -1;
8181 }
8182 break;
8183
8184 case E_CC_SWPmode:
8185 switch (comp_code)
8186 {
8187 case NE: return AARCH64_NE;
8188 case EQ: return AARCH64_EQ;
8189 case GE: return AARCH64_LE;
8190 case GT: return AARCH64_LT;
8191 case LE: return AARCH64_GE;
8192 case LT: return AARCH64_GT;
8193 case GEU: return AARCH64_LS;
8194 case GTU: return AARCH64_CC;
8195 case LEU: return AARCH64_CS;
8196 case LTU: return AARCH64_HI;
8197 default: return -1;
8198 }
8199 break;
8200
8201 case E_CC_NZCmode:
8202 switch (comp_code)
8203 {
8204 case NE: return AARCH64_NE; /* = any */
8205 case EQ: return AARCH64_EQ; /* = none */
8206 case GE: return AARCH64_PL; /* = nfrst */
8207 case LT: return AARCH64_MI; /* = first */
8208 case GEU: return AARCH64_CS; /* = nlast */
8209 case GTU: return AARCH64_HI; /* = pmore */
8210 case LEU: return AARCH64_LS; /* = plast */
8211 case LTU: return AARCH64_CC; /* = last */
8212 default: return -1;
8213 }
8214 break;
8215
8216 case E_CC_NZmode:
8217 switch (comp_code)
8218 {
8219 case NE: return AARCH64_NE;
8220 case EQ: return AARCH64_EQ;
8221 case GE: return AARCH64_PL;
8222 case LT: return AARCH64_MI;
8223 default: return -1;
8224 }
8225 break;
8226
8227 case E_CC_Zmode:
8228 switch (comp_code)
8229 {
8230 case NE: return AARCH64_NE;
8231 case EQ: return AARCH64_EQ;
8232 default: return -1;
8233 }
8234 break;
8235
8236 case E_CC_Cmode:
8237 switch (comp_code)
8238 {
8239 case LTU: return AARCH64_CS;
8240 case GEU: return AARCH64_CC;
8241 default: return -1;
8242 }
8243 break;
8244
8245 case E_CC_ADCmode:
8246 switch (comp_code)
8247 {
8248 case GEU: return AARCH64_CS;
8249 case LTU: return AARCH64_CC;
8250 default: return -1;
8251 }
8252 break;
8253
8254 case E_CC_Vmode:
8255 switch (comp_code)
8256 {
8257 case NE: return AARCH64_VS;
8258 case EQ: return AARCH64_VC;
8259 default: return -1;
8260 }
8261 break;
8262
8263 default:
8264 return -1;
8265 }
8266
8267 return -1;
8268 }
8269
8270 bool
8271 aarch64_const_vec_all_same_in_range_p (rtx x,
8272 HOST_WIDE_INT minval,
8273 HOST_WIDE_INT maxval)
8274 {
8275 rtx elt;
8276 return (const_vec_duplicate_p (x, &elt)
8277 && CONST_INT_P (elt)
8278 && IN_RANGE (INTVAL (elt), minval, maxval));
8279 }
8280
8281 bool
8282 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8283 {
8284 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8285 }
8286
8287 /* Return true if VEC is a constant in which every element is in the range
8288 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8289
8290 static bool
8291 aarch64_const_vec_all_in_range_p (rtx vec,
8292 HOST_WIDE_INT minval,
8293 HOST_WIDE_INT maxval)
8294 {
8295 if (GET_CODE (vec) != CONST_VECTOR
8296 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8297 return false;
8298
8299 int nunits;
8300 if (!CONST_VECTOR_STEPPED_P (vec))
8301 nunits = const_vector_encoded_nelts (vec);
8302 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8303 return false;
8304
8305 for (int i = 0; i < nunits; i++)
8306 {
8307 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8308 if (!CONST_INT_P (vec_elem)
8309 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8310 return false;
8311 }
8312 return true;
8313 }
8314
8315 /* N Z C V. */
8316 #define AARCH64_CC_V 1
8317 #define AARCH64_CC_C (1 << 1)
8318 #define AARCH64_CC_Z (1 << 2)
8319 #define AARCH64_CC_N (1 << 3)
8320
8321 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8322 static const int aarch64_nzcv_codes[] =
8323 {
8324 0, /* EQ, Z == 1. */
8325 AARCH64_CC_Z, /* NE, Z == 0. */
8326 0, /* CS, C == 1. */
8327 AARCH64_CC_C, /* CC, C == 0. */
8328 0, /* MI, N == 1. */
8329 AARCH64_CC_N, /* PL, N == 0. */
8330 0, /* VS, V == 1. */
8331 AARCH64_CC_V, /* VC, V == 0. */
8332 0, /* HI, C == 1 && Z == 0. */
8333 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8334 AARCH64_CC_V, /* GE, N == V. */
8335 0, /* LT, N != V. */
8336 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8337 0, /* LE, !(Z == 0 && N == V). */
8338 0, /* AL, Any. */
8339 0 /* NV, Any. */
8340 };
8341
8342 /* Print floating-point vector immediate operand X to F, negating it
8343 first if NEGATE is true. Return true on success, false if it isn't
8344 a constant we can handle. */
8345
8346 static bool
8347 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8348 {
8349 rtx elt;
8350
8351 if (!const_vec_duplicate_p (x, &elt))
8352 return false;
8353
8354 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8355 if (negate)
8356 r = real_value_negate (&r);
8357
8358 /* Handle the SVE single-bit immediates specially, since they have a
8359 fixed form in the assembly syntax. */
8360 if (real_equal (&r, &dconst0))
8361 asm_fprintf (f, "0.0");
8362 else if (real_equal (&r, &dconst2))
8363 asm_fprintf (f, "2.0");
8364 else if (real_equal (&r, &dconst1))
8365 asm_fprintf (f, "1.0");
8366 else if (real_equal (&r, &dconsthalf))
8367 asm_fprintf (f, "0.5");
8368 else
8369 {
8370 const int buf_size = 20;
8371 char float_buf[buf_size] = {'\0'};
8372 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8373 1, GET_MODE (elt));
8374 asm_fprintf (f, "%s", float_buf);
8375 }
8376
8377 return true;
8378 }
8379
8380 /* Return the equivalent letter for size. */
8381 static char
8382 sizetochar (int size)
8383 {
8384 switch (size)
8385 {
8386 case 64: return 'd';
8387 case 32: return 's';
8388 case 16: return 'h';
8389 case 8 : return 'b';
8390 default: gcc_unreachable ();
8391 }
8392 }
8393
8394 /* Print operand X to file F in a target specific manner according to CODE.
8395 The acceptable formatting commands given by CODE are:
8396 'c': An integer or symbol address without a preceding #
8397 sign.
8398 'C': Take the duplicated element in a vector constant
8399 and print it in hex.
8400 'D': Take the duplicated element in a vector constant
8401 and print it as an unsigned integer, in decimal.
8402 'e': Print the sign/zero-extend size as a character 8->b,
8403 16->h, 32->w. Can also be used for masks:
8404 0xff->b, 0xffff->h, 0xffffffff->w.
8405 'I': If the operand is a duplicated vector constant,
8406 replace it with the duplicated scalar. If the
8407 operand is then a floating-point constant, replace
8408 it with the integer bit representation. Print the
8409 transformed constant as a signed decimal number.
8410 'p': Prints N such that 2^N == X (X must be a power of 2 and
8411 a const_int).
8412 'P': Print the number of non-zero bits in X (a const_int).
8413 'H': Print the higher numbered register of a pair (TImode)
8414 of regs.
8415 'm': Print a condition (eq, ne, etc).
8416 'M': Same as 'm', but invert condition.
8417 'N': Take the duplicated element in a vector constant
8418 and print the negative of it in decimal.
8419 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8420 'S/T/U/V': Print a FP/SIMD register name for a register list.
8421 The register printed is the FP/SIMD register name
8422 of X + 0/1/2/3 for S/T/U/V.
8423 'R': Print a scalar FP/SIMD register name + 1.
8424 'X': Print bottom 16 bits of integer constant in hex.
8425 'w/x': Print a general register name or the zero register
8426 (32-bit or 64-bit).
8427 '0': Print a normal operand; if it's a general register,
8428 then we assume DImode.
8429 'k': Print NZCV for conditional compare instructions.
8430 'A': Output address constant representing the first
8431 argument of X, specifying a relocation offset
8432 if appropriate.
8433 'L': Output constant address specified by X
8434 with a relocation offset if appropriate.
8435 'G': Prints address of X, specifying a PC relative
8436 relocation mode if appropriate.
8437 'y': Output address of LDP or STP - this is used for
8438 some LDP/STPs which don't use a PARALLEL in their
8439 pattern (so the mode needs to be adjusted).
8440 'z': Output address of a typical LDP or STP. */
8441
8442 static void
8443 aarch64_print_operand (FILE *f, rtx x, int code)
8444 {
8445 rtx elt;
8446 switch (code)
8447 {
8448 case 'c':
8449 switch (GET_CODE (x))
8450 {
8451 case CONST_INT:
8452 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8453 break;
8454
8455 case SYMBOL_REF:
8456 output_addr_const (f, x);
8457 break;
8458
8459 case CONST:
8460 if (GET_CODE (XEXP (x, 0)) == PLUS
8461 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8462 {
8463 output_addr_const (f, x);
8464 break;
8465 }
8466 /* Fall through. */
8467
8468 default:
8469 output_operand_lossage ("unsupported operand for code '%c'", code);
8470 }
8471 break;
8472
8473 case 'e':
8474 {
8475 x = unwrap_const_vec_duplicate (x);
8476 if (!CONST_INT_P (x))
8477 {
8478 output_operand_lossage ("invalid operand for '%%%c'", code);
8479 return;
8480 }
8481
8482 HOST_WIDE_INT val = INTVAL (x);
8483 if ((val & ~7) == 8 || val == 0xff)
8484 fputc ('b', f);
8485 else if ((val & ~7) == 16 || val == 0xffff)
8486 fputc ('h', f);
8487 else if ((val & ~7) == 32 || val == 0xffffffff)
8488 fputc ('w', f);
8489 else
8490 {
8491 output_operand_lossage ("invalid operand for '%%%c'", code);
8492 return;
8493 }
8494 }
8495 break;
8496
8497 case 'p':
8498 {
8499 int n;
8500
8501 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8502 {
8503 output_operand_lossage ("invalid operand for '%%%c'", code);
8504 return;
8505 }
8506
8507 asm_fprintf (f, "%d", n);
8508 }
8509 break;
8510
8511 case 'P':
8512 if (!CONST_INT_P (x))
8513 {
8514 output_operand_lossage ("invalid operand for '%%%c'", code);
8515 return;
8516 }
8517
8518 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8519 break;
8520
8521 case 'H':
8522 if (x == const0_rtx)
8523 {
8524 asm_fprintf (f, "xzr");
8525 break;
8526 }
8527
8528 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8529 {
8530 output_operand_lossage ("invalid operand for '%%%c'", code);
8531 return;
8532 }
8533
8534 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8535 break;
8536
8537 case 'I':
8538 {
8539 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8540 if (CONST_INT_P (x))
8541 asm_fprintf (f, "%wd", INTVAL (x));
8542 else
8543 {
8544 output_operand_lossage ("invalid operand for '%%%c'", code);
8545 return;
8546 }
8547 break;
8548 }
8549
8550 case 'M':
8551 case 'm':
8552 {
8553 int cond_code;
8554 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8555 if (x == const_true_rtx)
8556 {
8557 if (code == 'M')
8558 fputs ("nv", f);
8559 return;
8560 }
8561
8562 if (!COMPARISON_P (x))
8563 {
8564 output_operand_lossage ("invalid operand for '%%%c'", code);
8565 return;
8566 }
8567
8568 cond_code = aarch64_get_condition_code (x);
8569 gcc_assert (cond_code >= 0);
8570 if (code == 'M')
8571 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8572 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8573 fputs (aarch64_sve_condition_codes[cond_code], f);
8574 else
8575 fputs (aarch64_condition_codes[cond_code], f);
8576 }
8577 break;
8578
8579 case 'N':
8580 if (!const_vec_duplicate_p (x, &elt))
8581 {
8582 output_operand_lossage ("invalid vector constant");
8583 return;
8584 }
8585
8586 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8587 asm_fprintf (f, "%wd", -INTVAL (elt));
8588 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8589 && aarch64_print_vector_float_operand (f, x, true))
8590 ;
8591 else
8592 {
8593 output_operand_lossage ("invalid vector constant");
8594 return;
8595 }
8596 break;
8597
8598 case 'b':
8599 case 'h':
8600 case 's':
8601 case 'd':
8602 case 'q':
8603 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8604 {
8605 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8606 return;
8607 }
8608 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8609 break;
8610
8611 case 'S':
8612 case 'T':
8613 case 'U':
8614 case 'V':
8615 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8616 {
8617 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8618 return;
8619 }
8620 asm_fprintf (f, "%c%d",
8621 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8622 REGNO (x) - V0_REGNUM + (code - 'S'));
8623 break;
8624
8625 case 'R':
8626 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8627 {
8628 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8629 return;
8630 }
8631 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8632 break;
8633
8634 case 'X':
8635 if (!CONST_INT_P (x))
8636 {
8637 output_operand_lossage ("invalid operand for '%%%c'", code);
8638 return;
8639 }
8640 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8641 break;
8642
8643 case 'C':
8644 {
8645 /* Print a replicated constant in hex. */
8646 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8647 {
8648 output_operand_lossage ("invalid operand for '%%%c'", code);
8649 return;
8650 }
8651 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8652 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8653 }
8654 break;
8655
8656 case 'D':
8657 {
8658 /* Print a replicated constant in decimal, treating it as
8659 unsigned. */
8660 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8661 {
8662 output_operand_lossage ("invalid operand for '%%%c'", code);
8663 return;
8664 }
8665 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8666 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8667 }
8668 break;
8669
8670 case 'w':
8671 case 'x':
8672 if (x == const0_rtx
8673 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8674 {
8675 asm_fprintf (f, "%czr", code);
8676 break;
8677 }
8678
8679 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8680 {
8681 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8682 break;
8683 }
8684
8685 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8686 {
8687 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8688 break;
8689 }
8690
8691 /* Fall through */
8692
8693 case 0:
8694 if (x == NULL)
8695 {
8696 output_operand_lossage ("missing operand");
8697 return;
8698 }
8699
8700 switch (GET_CODE (x))
8701 {
8702 case REG:
8703 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8704 {
8705 if (REG_NREGS (x) == 1)
8706 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8707 else
8708 {
8709 char suffix
8710 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8711 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8712 REGNO (x) - V0_REGNUM, suffix,
8713 END_REGNO (x) - V0_REGNUM - 1, suffix);
8714 }
8715 }
8716 else
8717 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8718 break;
8719
8720 case MEM:
8721 output_address (GET_MODE (x), XEXP (x, 0));
8722 break;
8723
8724 case LABEL_REF:
8725 case SYMBOL_REF:
8726 output_addr_const (asm_out_file, x);
8727 break;
8728
8729 case CONST_INT:
8730 asm_fprintf (f, "%wd", INTVAL (x));
8731 break;
8732
8733 case CONST:
8734 if (!VECTOR_MODE_P (GET_MODE (x)))
8735 {
8736 output_addr_const (asm_out_file, x);
8737 break;
8738 }
8739 /* fall through */
8740
8741 case CONST_VECTOR:
8742 if (!const_vec_duplicate_p (x, &elt))
8743 {
8744 output_operand_lossage ("invalid vector constant");
8745 return;
8746 }
8747
8748 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8749 asm_fprintf (f, "%wd", INTVAL (elt));
8750 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8751 && aarch64_print_vector_float_operand (f, x, false))
8752 ;
8753 else
8754 {
8755 output_operand_lossage ("invalid vector constant");
8756 return;
8757 }
8758 break;
8759
8760 case CONST_DOUBLE:
8761 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8762 be getting CONST_DOUBLEs holding integers. */
8763 gcc_assert (GET_MODE (x) != VOIDmode);
8764 if (aarch64_float_const_zero_rtx_p (x))
8765 {
8766 fputc ('0', f);
8767 break;
8768 }
8769 else if (aarch64_float_const_representable_p (x))
8770 {
8771 #define buf_size 20
8772 char float_buf[buf_size] = {'\0'};
8773 real_to_decimal_for_mode (float_buf,
8774 CONST_DOUBLE_REAL_VALUE (x),
8775 buf_size, buf_size,
8776 1, GET_MODE (x));
8777 asm_fprintf (asm_out_file, "%s", float_buf);
8778 break;
8779 #undef buf_size
8780 }
8781 output_operand_lossage ("invalid constant");
8782 return;
8783 default:
8784 output_operand_lossage ("invalid operand");
8785 return;
8786 }
8787 break;
8788
8789 case 'A':
8790 if (GET_CODE (x) == HIGH)
8791 x = XEXP (x, 0);
8792
8793 switch (aarch64_classify_symbolic_expression (x))
8794 {
8795 case SYMBOL_SMALL_GOT_4G:
8796 asm_fprintf (asm_out_file, ":got:");
8797 break;
8798
8799 case SYMBOL_SMALL_TLSGD:
8800 asm_fprintf (asm_out_file, ":tlsgd:");
8801 break;
8802
8803 case SYMBOL_SMALL_TLSDESC:
8804 asm_fprintf (asm_out_file, ":tlsdesc:");
8805 break;
8806
8807 case SYMBOL_SMALL_TLSIE:
8808 asm_fprintf (asm_out_file, ":gottprel:");
8809 break;
8810
8811 case SYMBOL_TLSLE24:
8812 asm_fprintf (asm_out_file, ":tprel:");
8813 break;
8814
8815 case SYMBOL_TINY_GOT:
8816 gcc_unreachable ();
8817 break;
8818
8819 default:
8820 break;
8821 }
8822 output_addr_const (asm_out_file, x);
8823 break;
8824
8825 case 'L':
8826 switch (aarch64_classify_symbolic_expression (x))
8827 {
8828 case SYMBOL_SMALL_GOT_4G:
8829 asm_fprintf (asm_out_file, ":lo12:");
8830 break;
8831
8832 case SYMBOL_SMALL_TLSGD:
8833 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8834 break;
8835
8836 case SYMBOL_SMALL_TLSDESC:
8837 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8838 break;
8839
8840 case SYMBOL_SMALL_TLSIE:
8841 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8842 break;
8843
8844 case SYMBOL_TLSLE12:
8845 asm_fprintf (asm_out_file, ":tprel_lo12:");
8846 break;
8847
8848 case SYMBOL_TLSLE24:
8849 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8850 break;
8851
8852 case SYMBOL_TINY_GOT:
8853 asm_fprintf (asm_out_file, ":got:");
8854 break;
8855
8856 case SYMBOL_TINY_TLSIE:
8857 asm_fprintf (asm_out_file, ":gottprel:");
8858 break;
8859
8860 default:
8861 break;
8862 }
8863 output_addr_const (asm_out_file, x);
8864 break;
8865
8866 case 'G':
8867 switch (aarch64_classify_symbolic_expression (x))
8868 {
8869 case SYMBOL_TLSLE24:
8870 asm_fprintf (asm_out_file, ":tprel_hi12:");
8871 break;
8872 default:
8873 break;
8874 }
8875 output_addr_const (asm_out_file, x);
8876 break;
8877
8878 case 'k':
8879 {
8880 HOST_WIDE_INT cond_code;
8881
8882 if (!CONST_INT_P (x))
8883 {
8884 output_operand_lossage ("invalid operand for '%%%c'", code);
8885 return;
8886 }
8887
8888 cond_code = INTVAL (x);
8889 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8890 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8891 }
8892 break;
8893
8894 case 'y':
8895 case 'z':
8896 {
8897 machine_mode mode = GET_MODE (x);
8898
8899 if (GET_CODE (x) != MEM
8900 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8901 {
8902 output_operand_lossage ("invalid operand for '%%%c'", code);
8903 return;
8904 }
8905
8906 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8907 code == 'y'
8908 ? ADDR_QUERY_LDP_STP_N
8909 : ADDR_QUERY_LDP_STP))
8910 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8911 }
8912 break;
8913
8914 default:
8915 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8916 return;
8917 }
8918 }
8919
8920 /* Print address 'x' of a memory access with mode 'mode'.
8921 TYPE is the aarch64_addr_query_type context required by
8922 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP operand. */
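/* For illustration only, the recognised address forms below print as, e.g.:
     [x0]                  register
     [x0, 16]              register + immediate
     [x0, #2, mul vl]      register + SVE vector/predicate multiple
     [x0, x1, lsl 3]       register + scaled register
     [x0, w1, sxtw 2]      register + sign/zero-extended register
     [x0, 16]! / [x0], 16  pre/post-indexed writeback
     [x0, #:lo12:sym]      LO_SUM low-part relocation
   (the register numbers here are only examples).  */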
8923 static bool
8924 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8925 aarch64_addr_query_type type)
8926 {
8927 struct aarch64_address_info addr;
8928 unsigned int size;
8929
8930 /* Check all addresses are Pmode - including ILP32. */
8931 if (GET_MODE (x) != Pmode
8932 && (!CONST_INT_P (x)
8933 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8934 {
8935 output_operand_lossage ("invalid address mode");
8936 return false;
8937 }
8938
8939 if (aarch64_classify_address (&addr, x, mode, true, type))
8940 switch (addr.type)
8941 {
8942 case ADDRESS_REG_IMM:
8943 if (known_eq (addr.const_offset, 0))
8944 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8945 else if (aarch64_sve_data_mode_p (mode))
8946 {
8947 HOST_WIDE_INT vnum
8948 = exact_div (addr.const_offset,
8949 BYTES_PER_SVE_VECTOR).to_constant ();
8950 asm_fprintf (f, "[%s, #%wd, mul vl]",
8951 reg_names[REGNO (addr.base)], vnum);
8952 }
8953 else if (aarch64_sve_pred_mode_p (mode))
8954 {
8955 HOST_WIDE_INT vnum
8956 = exact_div (addr.const_offset,
8957 BYTES_PER_SVE_PRED).to_constant ();
8958 asm_fprintf (f, "[%s, #%wd, mul vl]",
8959 reg_names[REGNO (addr.base)], vnum);
8960 }
8961 else
8962 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8963 INTVAL (addr.offset));
8964 return true;
8965
8966 case ADDRESS_REG_REG:
8967 if (addr.shift == 0)
8968 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8969 reg_names [REGNO (addr.offset)]);
8970 else
8971 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8972 reg_names [REGNO (addr.offset)], addr.shift);
8973 return true;
8974
8975 case ADDRESS_REG_UXTW:
8976 if (addr.shift == 0)
8977 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8978 REGNO (addr.offset) - R0_REGNUM);
8979 else
8980 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8981 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8982 return true;
8983
8984 case ADDRESS_REG_SXTW:
8985 if (addr.shift == 0)
8986 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8987 REGNO (addr.offset) - R0_REGNUM);
8988 else
8989 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8990 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8991 return true;
8992
8993 case ADDRESS_REG_WB:
8994 /* Writeback is only supported for fixed-width modes. */
8995 size = GET_MODE_SIZE (mode).to_constant ();
8996 switch (GET_CODE (x))
8997 {
8998 case PRE_INC:
8999 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9000 return true;
9001 case POST_INC:
9002 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9003 return true;
9004 case PRE_DEC:
9005 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9006 return true;
9007 case POST_DEC:
9008 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9009 return true;
9010 case PRE_MODIFY:
9011 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9012 INTVAL (addr.offset));
9013 return true;
9014 case POST_MODIFY:
9015 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9016 INTVAL (addr.offset));
9017 return true;
9018 default:
9019 break;
9020 }
9021 break;
9022
9023 case ADDRESS_LO_SUM:
9024 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9025 output_addr_const (f, addr.offset);
9026 asm_fprintf (f, "]");
9027 return true;
9028
9029 case ADDRESS_SYMBOLIC:
9030 output_addr_const (f, x);
9031 return true;
9032 }
9033
9034 return false;
9035 }
9036
9037 /* Print address 'x' of a memory access with mode 'mode'. */
9038 static void
9039 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9040 {
9041 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9042 output_addr_const (f, x);
9043 }
9044
9045 bool
9046 aarch64_label_mentioned_p (rtx x)
9047 {
9048 const char *fmt;
9049 int i;
9050
9051 if (GET_CODE (x) == LABEL_REF)
9052 return true;
9053
9054 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9055 referencing instruction, but they are constant offsets, not
9056 symbols. */
9057 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9058 return false;
9059
9060 fmt = GET_RTX_FORMAT (GET_CODE (x));
9061 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9062 {
9063 if (fmt[i] == 'E')
9064 {
9065 int j;
9066
9067 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9068 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9069 return 1;
9070 }
9071 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9072 return 1;
9073 }
9074
9075 return 0;
9076 }
9077
9078 /* Implement REGNO_REG_CLASS. */
9079
9080 enum reg_class
9081 aarch64_regno_regclass (unsigned regno)
9082 {
9083 if (GP_REGNUM_P (regno))
9084 return GENERAL_REGS;
9085
9086 if (regno == SP_REGNUM)
9087 return STACK_REG;
9088
9089 if (regno == FRAME_POINTER_REGNUM
9090 || regno == ARG_POINTER_REGNUM)
9091 return POINTER_REGS;
9092
9093 if (FP_REGNUM_P (regno))
9094 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9095 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9096
9097 if (PR_REGNUM_P (regno))
9098 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9099
9100 return NO_REGS;
9101 }
9102
9103 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9104 If OFFSET is out of range, return an offset of an anchor point
9105 that is in range. Return 0 otherwise. */
9106
9107 static HOST_WIDE_INT
9108 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9109 machine_mode mode)
9110 {
9111 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9112 if (size > 16)
9113 return (offset + 0x400) & ~0x7f0;
9114
9115 /* For offsets that aren't a multiple of the access size, the limit is
9116 -256...255. */
9117 if (offset & (size - 1))
9118 {
9119 /* BLKmode typically uses LDP of X-registers. */
9120 if (mode == BLKmode)
9121 return (offset + 512) & ~0x3ff;
9122 return (offset + 0x100) & ~0x1ff;
9123 }
9124
9125 /* Small negative offsets are supported. */
9126 if (IN_RANGE (offset, -256, 0))
9127 return 0;
9128
9129 if (mode == TImode || mode == TFmode)
9130 return (offset + 0x100) & ~0x1ff;
9131
9132 /* Use a 12-bit unsigned offset, scaled by the access size. */
9133 return offset & (~0xfff * size);
9134 }
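/* Some illustrative values for the function above: a 4-byte (SImode)
   access at offset 0x12340 anchors at 0x12340 & ~0x3fff == 0x10000,
   leaving an in-range offset of 0x2340; a misaligned 4-byte access at
   offset 0x301 anchors at (0x301 + 0x100) & ~0x1ff == 0x400, leaving
   -0xff, which fits in the -256..255 range.  */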
9135
9136 static rtx
9137 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9138 {
9139 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9140 where mask is selected by alignment and size of the offset.
9141 We try to pick as large a range for the offset as possible to
9142 maximize the chance of a CSE. However, for aligned addresses
9143 we limit the range to 4k so that structures with different sized
9144 elements are likely to use the same base. We need to be careful
9145 not to split a CONST for some forms of address expression, otherwise
9146 it will generate sub-optimal code. */
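/* For example (illustrative only): for a 4-byte access to X + 0x12340
   we emit Y = X + 0x10000 and address the memory as Y + 0x2340, so
   nearby accesses can CSE the shared base Y.  */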
9147
9148 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9149 {
9150 rtx base = XEXP (x, 0);
9151 rtx offset_rtx = XEXP (x, 1);
9152 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9153
9154 if (GET_CODE (base) == PLUS)
9155 {
9156 rtx op0 = XEXP (base, 0);
9157 rtx op1 = XEXP (base, 1);
9158
9159 /* Force any scaling into a temp for CSE. */
9160 op0 = force_reg (Pmode, op0);
9161 op1 = force_reg (Pmode, op1);
9162
9163 /* Let the pointer register be in op0. */
9164 if (REG_POINTER (op1))
9165 std::swap (op0, op1);
9166
9167 /* If the pointer is virtual or frame related, then we know that
9168 virtual register instantiation or register elimination is going
9169 to apply a second constant. We want the two constants folded
9170 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9171 if (virt_or_elim_regno_p (REGNO (op0)))
9172 {
9173 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9174 NULL_RTX, true, OPTAB_DIRECT);
9175 return gen_rtx_PLUS (Pmode, base, op1);
9176 }
9177
9178 /* Otherwise, in order to encourage CSE (and thence loop strength
9179 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9180 base = expand_binop (Pmode, add_optab, op0, op1,
9181 NULL_RTX, true, OPTAB_DIRECT);
9182 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9183 }
9184
9185 HOST_WIDE_INT size;
9186 if (GET_MODE_SIZE (mode).is_constant (&size))
9187 {
9188 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9189 mode);
9190 if (base_offset != 0)
9191 {
9192 base = plus_constant (Pmode, base, base_offset);
9193 base = force_operand (base, NULL_RTX);
9194 return plus_constant (Pmode, base, offset - base_offset);
9195 }
9196 }
9197 }
9198
9199 return x;
9200 }
9201
9202 static reg_class_t
9203 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9204 reg_class_t rclass,
9205 machine_mode mode,
9206 secondary_reload_info *sri)
9207 {
9208 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9209 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9210 comment at the head of aarch64-sve.md for more details about the
9211 big-endian handling. */
9212 if (BYTES_BIG_ENDIAN
9213 && reg_class_subset_p (rclass, FP_REGS)
9214 && !((REG_P (x) && HARD_REGISTER_P (x))
9215 || aarch64_simd_valid_immediate (x, NULL))
9216 && aarch64_sve_data_mode_p (mode))
9217 {
9218 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9219 return NO_REGS;
9220 }
9221
9222 /* If we have to disable direct literal pool loads and stores because the
9223 function is too big, then we need a scratch register. */
9224 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9225 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9226 || targetm.vector_mode_supported_p (GET_MODE (x)))
9227 && !aarch64_pcrelative_literal_loads)
9228 {
9229 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9230 return NO_REGS;
9231 }
9232
9233 /* Without the TARGET_SIMD instructions we cannot move a Q register
9234 to a Q register directly. We need a scratch. */
9235 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9236 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9237 && reg_class_subset_p (rclass, FP_REGS))
9238 {
9239 sri->icode = code_for_aarch64_reload_mov (mode);
9240 return NO_REGS;
9241 }
9242
9243 /* A TFmode or TImode memory access should be handled via an FP_REG
9244 because AArch64 has richer addressing modes for LDR/STR instructions
9245 than LDP/STP instructions. */
9246 if (TARGET_FLOAT && rclass == GENERAL_REGS
9247 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9248 return FP_REGS;
9249
9250 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9251 return GENERAL_REGS;
9252
9253 return NO_REGS;
9254 }
9255
9256 static bool
9257 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9258 {
9259 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9260
9261 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9262 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9263 if (frame_pointer_needed)
9264 return to == HARD_FRAME_POINTER_REGNUM;
9265 return true;
9266 }
9267
9268 poly_int64
9269 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9270 {
9271 if (to == HARD_FRAME_POINTER_REGNUM)
9272 {
9273 if (from == ARG_POINTER_REGNUM)
9274 return cfun->machine->frame.hard_fp_offset;
9275
9276 if (from == FRAME_POINTER_REGNUM)
9277 return cfun->machine->frame.hard_fp_offset
9278 - cfun->machine->frame.locals_offset;
9279 }
9280
9281 if (to == STACK_POINTER_REGNUM)
9282 {
9283 if (from == FRAME_POINTER_REGNUM)
9284 return cfun->machine->frame.frame_size
9285 - cfun->machine->frame.locals_offset;
9286 }
9287
9288 return cfun->machine->frame.frame_size;
9289 }
9290
9291 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9292 previous frame. */
9293
9294 rtx
9295 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9296 {
9297 if (count != 0)
9298 return const0_rtx;
9299 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9300 }
9301
9302
9303 static void
9304 aarch64_asm_trampoline_template (FILE *f)
9305 {
9306 int offset1 = 16;
9307 int offset2 = 20;
9308
9309 if (aarch64_bti_enabled ())
9310 {
9311 asm_fprintf (f, "\thint\t34 // bti c\n");
9312 offset1 -= 4;
9313 offset2 -= 4;
9314 }
9315
9316 if (TARGET_ILP32)
9317 {
9318 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9319 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9320 offset1);
9321 }
9322 else
9323 {
9324 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9325 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9326 offset2);
9327 }
9328 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9329
9330 /* The trampoline needs an extra padding instruction. If BTI is
9331 enabled, the padding instruction is replaced by the BTI instruction
9332 at the beginning. */
9333 if (!aarch64_bti_enabled ())
9334 assemble_aligned_integer (4, const0_rtx);
9335
9336 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9337 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9338 }
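/* As a rough sketch (LP64, no BTI), the template emitted above is:

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// padding
	.xword	0		// overwritten with the function address
	.xword	0		// overwritten with the static chain

   where x17 is IP1_REGNUM and x18 is STATIC_CHAIN_REGNUM; the trailing
   words are filled in per instance by aarch64_trampoline_init below.  */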
9339
9340 static void
9341 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9342 {
9343 rtx fnaddr, mem, a_tramp;
9344 const int tramp_code_sz = 16;
9345
9346 /* We don't need to copy the trailing D-words; we fill those in below. */
9347 emit_block_move (m_tramp, assemble_trampoline_template (),
9348 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9349 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9350 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9351 if (GET_MODE (fnaddr) != ptr_mode)
9352 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9353 emit_move_insn (mem, fnaddr);
9354
9355 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9356 emit_move_insn (mem, chain_value);
9357
9358 /* XXX We should really define a "clear_cache" pattern and use
9359 gen_clear_cache(). */
9360 a_tramp = XEXP (m_tramp, 0);
9361 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9362 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9363 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9364 ptr_mode);
9365 }
9366
9367 static unsigned char
9368 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9369 {
9370 /* ??? Logically we should only need to provide a value when
9371 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9372 can hold MODE, but at the moment we need to handle all modes.
9373 Just ignore any runtime parts for registers that can't store them. */
9374 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9375 unsigned int nregs;
9376 switch (regclass)
9377 {
9378 case TAILCALL_ADDR_REGS:
9379 case POINTER_REGS:
9380 case GENERAL_REGS:
9381 case ALL_REGS:
9382 case POINTER_AND_FP_REGS:
9383 case FP_REGS:
9384 case FP_LO_REGS:
9385 case FP_LO8_REGS:
9386 if (aarch64_sve_data_mode_p (mode)
9387 && constant_multiple_p (GET_MODE_SIZE (mode),
9388 BYTES_PER_SVE_VECTOR, &nregs))
9389 return nregs;
9390 return (aarch64_vector_data_mode_p (mode)
9391 ? CEIL (lowest_size, UNITS_PER_VREG)
9392 : CEIL (lowest_size, UNITS_PER_WORD));
9393 case STACK_REG:
9394 case PR_REGS:
9395 case PR_LO_REGS:
9396 case PR_HI_REGS:
9397 return 1;
9398
9399 case NO_REGS:
9400 return 0;
9401
9402 default:
9403 break;
9404 }
9405 gcc_unreachable ();
9406 }
9407
9408 static reg_class_t
9409 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9410 {
9411 if (regclass == POINTER_REGS)
9412 return GENERAL_REGS;
9413
9414 if (regclass == STACK_REG)
9415 {
9416 if (REG_P(x)
9417 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9418 return regclass;
9419
9420 return NO_REGS;
9421 }
9422
9423 /* Register elimination can result in a request for
9424 SP+constant->FP_REGS. We cannot support such operations, which
9425 use SP as the source and an FP_REG as the destination, so reject
9426 them right away. */
9427 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9428 {
9429 rtx lhs = XEXP (x, 0);
9430
9431 /* Look through a possible SUBREG introduced by ILP32. */
9432 if (GET_CODE (lhs) == SUBREG)
9433 lhs = SUBREG_REG (lhs);
9434
9435 gcc_assert (REG_P (lhs));
9436 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9437 POINTER_REGS));
9438 return NO_REGS;
9439 }
9440
9441 return regclass;
9442 }
9443
9444 void
9445 aarch64_asm_output_labelref (FILE* f, const char *name)
9446 {
9447 asm_fprintf (f, "%U%s", name);
9448 }
9449
9450 static void
9451 aarch64_elf_asm_constructor (rtx symbol, int priority)
9452 {
9453 if (priority == DEFAULT_INIT_PRIORITY)
9454 default_ctor_section_asm_out_constructor (symbol, priority);
9455 else
9456 {
9457 section *s;
9458 /* While priority is known to be in range [0, 65535], so 18 bytes
9459 would be enough, the compiler might not know that. To avoid
9460 -Wformat-truncation false positive, use a larger size. */
9461 char buf[23];
9462 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9463 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9464 switch_to_section (s);
9465 assemble_align (POINTER_SIZE);
9466 assemble_aligned_integer (POINTER_BYTES, symbol);
9467 }
9468 }
9469
9470 static void
9471 aarch64_elf_asm_destructor (rtx symbol, int priority)
9472 {
9473 if (priority == DEFAULT_INIT_PRIORITY)
9474 default_dtor_section_asm_out_destructor (symbol, priority);
9475 else
9476 {
9477 section *s;
9478 /* While priority is known to be in range [0, 65535], so 18 bytes
9479 would be enough, the compiler might not know that. To avoid
9480 -Wformat-truncation false positive, use a larger size. */
9481 char buf[23];
9482 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9483 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9484 switch_to_section (s);
9485 assemble_align (POINTER_SIZE);
9486 assemble_aligned_integer (POINTER_BYTES, symbol);
9487 }
9488 }
9489
9490 const char*
9491 aarch64_output_casesi (rtx *operands)
9492 {
9493 char buf[100];
9494 char label[100];
9495 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9496 int index;
9497 static const char *const patterns[4][2] =
9498 {
9499 {
9500 "ldrb\t%w3, [%0,%w1,uxtw]",
9501 "add\t%3, %4, %w3, sxtb #2"
9502 },
9503 {
9504 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9505 "add\t%3, %4, %w3, sxth #2"
9506 },
9507 {
9508 "ldr\t%w3, [%0,%w1,uxtw #2]",
9509 "add\t%3, %4, %w3, sxtw #2"
9510 },
9511 /* We assume that DImode is only generated when not optimizing and
9512 that we don't really need 64-bit address offsets. That would
9513 imply an object file with 8GB of code in a single function! */
9514 {
9515 "ldr\t%w3, [%0,%w1,uxtw #2]",
9516 "add\t%3, %4, %w3, sxtw #2"
9517 }
9518 };
9519
9520 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9521
9522 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9523 index = exact_log2 (GET_MODE_SIZE (mode));
9524
9525 gcc_assert (index >= 0 && index <= 3);
9526
9527 /* Need to implement table size reduction, by changing the code below. */
9528 output_asm_insn (patterns[index][0], operands);
9529 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9530 snprintf (buf, sizeof (buf),
9531 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9532 output_asm_insn (buf, operands);
9533 output_asm_insn (patterns[index][1], operands);
9534 output_asm_insn ("br\t%3", operands);
9535 assemble_label (asm_out_file, label);
9536 return "";
9537 }
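/* For illustration, with a HImode dispatch table the code above emits a
   sequence of the shape:

	ldrh	w3, [x0, w1, uxtw #1]	// load the table entry
	adr	x4, .Lrtx<N>		// table base (label emitted below)
	add	x3, x4, w3, sxth #2	// scale the entry and add the base
	br	x3
   .Lrtx<N>:

   with the actual registers taken from OPERANDS.  */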
9538
9539
9540 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9541 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9542 operator. */
9543
9544 int
9545 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9546 {
9547 if (shift >= 0 && shift <= 3)
9548 {
9549 int size;
9550 for (size = 8; size <= 32; size *= 2)
9551 {
9552 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9553 if (mask == bits << shift)
9554 return size;
9555 }
9556 }
9557 return 0;
9558 }
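/* Illustrative values: aarch64_uxt_size (0, 0xff) == 8 (UXTB),
   aarch64_uxt_size (1, 0x1fe) == 8 (UXTB with LSL #1),
   aarch64_uxt_size (0, 0xffff) == 16 (UXTH), and a mask that is not a
   shifted 8/16/32-bit block, such as 0x7f, yields 0.  */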
9559
9560 /* Constant pools are per-function only when PC-relative
9561 literal loads are enabled or we are in the large memory
9562 model. */
9563
9564 static inline bool
9565 aarch64_can_use_per_function_literal_pools_p (void)
9566 {
9567 return (aarch64_pcrelative_literal_loads
9568 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9569 }
9570
9571 static bool
9572 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9573 {
9574 /* We can't use blocks for constants when we're using a per-function
9575 constant pool. */
9576 return !aarch64_can_use_per_function_literal_pools_p ();
9577 }
9578
9579 /* Select appropriate section for constants depending
9580 on where we place literal pools. */
9581
9582 static section *
9583 aarch64_select_rtx_section (machine_mode mode,
9584 rtx x,
9585 unsigned HOST_WIDE_INT align)
9586 {
9587 if (aarch64_can_use_per_function_literal_pools_p ())
9588 return function_section (current_function_decl);
9589
9590 return default_elf_select_rtx_section (mode, x, align);
9591 }
9592
9593 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9594 void
9595 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9596 HOST_WIDE_INT offset)
9597 {
9598 /* When using per-function literal pools, we must ensure that any code
9599 section is aligned to the minimal instruction length, lest we get
9600 errors from the assembler re "unaligned instructions". */
9601 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9602 ASM_OUTPUT_ALIGN (f, 2);
9603 }
9604
9605 /* Costs. */
9606
9607 /* Helper function for rtx cost calculation. Strip a shift expression
9608 from X. Returns the inner operand if successful, or the original
9609 expression on failure. */
9610 static rtx
9611 aarch64_strip_shift (rtx x)
9612 {
9613 rtx op = x;
9614
9615 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9616 we can convert both to ROR during final output. */
9617 if ((GET_CODE (op) == ASHIFT
9618 || GET_CODE (op) == ASHIFTRT
9619 || GET_CODE (op) == LSHIFTRT
9620 || GET_CODE (op) == ROTATERT
9621 || GET_CODE (op) == ROTATE)
9622 && CONST_INT_P (XEXP (op, 1)))
9623 return XEXP (op, 0);
9624
9625 if (GET_CODE (op) == MULT
9626 && CONST_INT_P (XEXP (op, 1))
9627 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9628 return XEXP (op, 0);
9629
9630 return x;
9631 }
9632
9633 /* Helper function for rtx cost calculation. Strip an extend
9634 expression from X. Returns the inner operand if successful, or the
9635 original expression on failure. We deal with a number of possible
9636 canonicalization variations here. If STRIP_SHIFT is true, then
9637 we can strip off a shift also. */
9638 static rtx
9639 aarch64_strip_extend (rtx x, bool strip_shift)
9640 {
9641 scalar_int_mode mode;
9642 rtx op = x;
9643
9644 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9645 return op;
9646
9647 /* Zero and sign extraction of a widened value. */
9648 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9649 && XEXP (op, 2) == const0_rtx
9650 && GET_CODE (XEXP (op, 0)) == MULT
9651 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9652 XEXP (op, 1)))
9653 return XEXP (XEXP (op, 0), 0);
9654
9655 /* It can also be represented (for zero-extend) as an AND with an
9656 immediate. */
9657 if (GET_CODE (op) == AND
9658 && GET_CODE (XEXP (op, 0)) == MULT
9659 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9660 && CONST_INT_P (XEXP (op, 1))
9661 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9662 INTVAL (XEXP (op, 1))) != 0)
9663 return XEXP (XEXP (op, 0), 0);
9664
9665 /* Now handle extended register, as this may also have an optional
9666 left shift by 1..4. */
9667 if (strip_shift
9668 && GET_CODE (op) == ASHIFT
9669 && CONST_INT_P (XEXP (op, 1))
9670 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9671 op = XEXP (op, 0);
9672
9673 if (GET_CODE (op) == ZERO_EXTEND
9674 || GET_CODE (op) == SIGN_EXTEND)
9675 op = XEXP (op, 0);
9676
9677 if (op != x)
9678 return op;
9679
9680 return x;
9681 }
9682
9683 /* Return true iff CODE is a shift supported in combination
9684 with arithmetic instructions. */
9685
9686 static bool
9687 aarch64_shift_p (enum rtx_code code)
9688 {
9689 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9690 }
9691
9692
9693 /* Return true iff X is a cheap shift without a sign extend. */
9694
9695 static bool
9696 aarch64_cheap_mult_shift_p (rtx x)
9697 {
9698 rtx op0, op1;
9699
9700 op0 = XEXP (x, 0);
9701 op1 = XEXP (x, 1);
9702
9703 if (!(aarch64_tune_params.extra_tuning_flags
9704 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9705 return false;
9706
9707 if (GET_CODE (op0) == SIGN_EXTEND)
9708 return false;
9709
9710 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9711 && UINTVAL (op1) <= 4)
9712 return true;
9713
9714 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9715 return false;
9716
9717 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9718
9719 if (l2 > 0 && l2 <= 4)
9720 return true;
9721
9722 return false;
9723 }
9724
9725 /* Helper function for rtx cost calculation. Calculate the cost of
9726 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9727 Return the calculated cost of the expression, recursing manually in to
9728 operands where needed. */
9729
9730 static int
9731 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9732 {
9733 rtx op0, op1;
9734 const struct cpu_cost_table *extra_cost
9735 = aarch64_tune_params.insn_extra_cost;
9736 int cost = 0;
9737 bool compound_p = (outer == PLUS || outer == MINUS);
9738 machine_mode mode = GET_MODE (x);
9739
9740 gcc_checking_assert (code == MULT);
9741
9742 op0 = XEXP (x, 0);
9743 op1 = XEXP (x, 1);
9744
9745 if (VECTOR_MODE_P (mode))
9746 mode = GET_MODE_INNER (mode);
9747
9748 /* Integer multiply/fma. */
9749 if (GET_MODE_CLASS (mode) == MODE_INT)
9750 {
9751 /* The multiply will be canonicalized as a shift, cost it as such. */
9752 if (aarch64_shift_p (GET_CODE (x))
9753 || (CONST_INT_P (op1)
9754 && exact_log2 (INTVAL (op1)) > 0))
9755 {
9756 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9757 || GET_CODE (op0) == SIGN_EXTEND;
9758 if (speed)
9759 {
9760 if (compound_p)
9761 {
9762 /* If the shift is considered cheap,
9763 then don't add any cost. */
9764 if (aarch64_cheap_mult_shift_p (x))
9765 ;
9766 else if (REG_P (op1))
9767 /* ARITH + shift-by-register. */
9768 cost += extra_cost->alu.arith_shift_reg;
9769 else if (is_extend)
9770 /* ARITH + extended register. We don't have a cost field
9771 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9772 cost += extra_cost->alu.extend_arith;
9773 else
9774 /* ARITH + shift-by-immediate. */
9775 cost += extra_cost->alu.arith_shift;
9776 }
9777 else
9778 /* LSL (immediate). */
9779 cost += extra_cost->alu.shift;
9780
9781 }
9782 /* Strip extends as we will have costed them in the case above. */
9783 if (is_extend)
9784 op0 = aarch64_strip_extend (op0, true);
9785
9786 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9787
9788 return cost;
9789 }
9790
9791 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9792 compound and let the below cases handle it. After all, MNEG is a
9793 special-case alias of MSUB. */
9794 if (GET_CODE (op0) == NEG)
9795 {
9796 op0 = XEXP (op0, 0);
9797 compound_p = true;
9798 }
9799
9800 /* Integer multiplies or FMAs have zero/sign extending variants. */
9801 if ((GET_CODE (op0) == ZERO_EXTEND
9802 && GET_CODE (op1) == ZERO_EXTEND)
9803 || (GET_CODE (op0) == SIGN_EXTEND
9804 && GET_CODE (op1) == SIGN_EXTEND))
9805 {
9806 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9807 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9808
9809 if (speed)
9810 {
9811 if (compound_p)
9812 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9813 cost += extra_cost->mult[0].extend_add;
9814 else
9815 /* MUL/SMULL/UMULL. */
9816 cost += extra_cost->mult[0].extend;
9817 }
9818
9819 return cost;
9820 }
9821
9822 /* This is either an integer multiply or a MADD. In both cases
9823 we want to recurse and cost the operands. */
9824 cost += rtx_cost (op0, mode, MULT, 0, speed);
9825 cost += rtx_cost (op1, mode, MULT, 1, speed);
9826
9827 if (speed)
9828 {
9829 if (compound_p)
9830 /* MADD/MSUB. */
9831 cost += extra_cost->mult[mode == DImode].add;
9832 else
9833 /* MUL. */
9834 cost += extra_cost->mult[mode == DImode].simple;
9835 }
9836
9837 return cost;
9838 }
9839 else
9840 {
9841 if (speed)
9842 {
9843 /* Floating-point FMA/FMUL can also support negations of the
9844 operands, unless the rounding mode is upward or downward in
9845 which case FNMUL is different from FMUL with operand negation. */
9846 bool neg0 = GET_CODE (op0) == NEG;
9847 bool neg1 = GET_CODE (op1) == NEG;
9848 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9849 {
9850 if (neg0)
9851 op0 = XEXP (op0, 0);
9852 if (neg1)
9853 op1 = XEXP (op1, 0);
9854 }
9855
9856 if (compound_p)
9857 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9858 cost += extra_cost->fp[mode == DFmode].fma;
9859 else
9860 /* FMUL/FNMUL. */
9861 cost += extra_cost->fp[mode == DFmode].mult;
9862 }
9863
9864 cost += rtx_cost (op0, mode, MULT, 0, speed);
9865 cost += rtx_cost (op1, mode, MULT, 1, speed);
9866 return cost;
9867 }
9868 }
9869
9870 static int
9871 aarch64_address_cost (rtx x,
9872 machine_mode mode,
9873 addr_space_t as ATTRIBUTE_UNUSED,
9874 bool speed)
9875 {
9876 enum rtx_code c = GET_CODE (x);
9877 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9878 struct aarch64_address_info info;
9879 int cost = 0;
9880 info.shift = 0;
9881
9882 if (!aarch64_classify_address (&info, x, mode, false))
9883 {
9884 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9885 {
9886 /* This is a CONST or SYMBOL ref which will be split
9887 in a different way depending on the code model in use.
9888 Cost it through the generic infrastructure. */
9889 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9890 /* Divide through by the cost of one instruction to
9891 bring it to the same units as the address costs. */
9892 cost_symbol_ref /= COSTS_N_INSNS (1);
9893 /* The cost is then the cost of preparing the address,
9894 followed by an immediate (possibly 0) offset. */
9895 return cost_symbol_ref + addr_cost->imm_offset;
9896 }
9897 else
9898 {
9899 /* This is most likely a jump table from a case
9900 statement. */
9901 return addr_cost->register_offset;
9902 }
9903 }
9904
9905 switch (info.type)
9906 {
9907 case ADDRESS_LO_SUM:
9908 case ADDRESS_SYMBOLIC:
9909 case ADDRESS_REG_IMM:
9910 cost += addr_cost->imm_offset;
9911 break;
9912
9913 case ADDRESS_REG_WB:
9914 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9915 cost += addr_cost->pre_modify;
9916 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9917 cost += addr_cost->post_modify;
9918 else
9919 gcc_unreachable ();
9920
9921 break;
9922
9923 case ADDRESS_REG_REG:
9924 cost += addr_cost->register_offset;
9925 break;
9926
9927 case ADDRESS_REG_SXTW:
9928 cost += addr_cost->register_sextend;
9929 break;
9930
9931 case ADDRESS_REG_UXTW:
9932 cost += addr_cost->register_zextend;
9933 break;
9934
9935 default:
9936 gcc_unreachable ();
9937 }
9938
9939
9940 if (info.shift > 0)
9941 {
9942 /* For the sake of calculating the cost of the shifted register
9943 component, we can treat same sized modes in the same way. */
9944 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9945 cost += addr_cost->addr_scale_costs.hi;
9946 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9947 cost += addr_cost->addr_scale_costs.si;
9948 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9949 cost += addr_cost->addr_scale_costs.di;
9950 else
9951 /* We can't tell, or this is a 128-bit vector. */
9952 cost += addr_cost->addr_scale_costs.ti;
9953 }
9954
9955 return cost;
9956 }
9957
9958 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9959 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9960 to be taken. */
9961
9962 int
9963 aarch64_branch_cost (bool speed_p, bool predictable_p)
9964 {
9965 /* When optimizing for speed, use the cost of unpredictable branches. */
9966 const struct cpu_branch_cost *branch_costs =
9967 aarch64_tune_params.branch_costs;
9968
9969 if (!speed_p || predictable_p)
9970 return branch_costs->predictable;
9971 else
9972 return branch_costs->unpredictable;
9973 }
9974
9975 /* Return true if the RTX X in mode MODE is a zero or sign extract
9976 usable in an ADD or SUB (extended register) instruction. */
9977 static bool
9978 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9979 {
9980 /* Catch add with a sign extract.
9981 This is add_<optab><mode>_multp2. */
9982 if (GET_CODE (x) == SIGN_EXTRACT
9983 || GET_CODE (x) == ZERO_EXTRACT)
9984 {
9985 rtx op0 = XEXP (x, 0);
9986 rtx op1 = XEXP (x, 1);
9987 rtx op2 = XEXP (x, 2);
9988
9989 if (GET_CODE (op0) == MULT
9990 && CONST_INT_P (op1)
9991 && op2 == const0_rtx
9992 && CONST_INT_P (XEXP (op0, 1))
9993 && aarch64_is_extend_from_extract (mode,
9994 XEXP (op0, 1),
9995 op1))
9996 {
9997 return true;
9998 }
9999 }
10000 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10001 No shift. */
10002 else if (GET_CODE (x) == SIGN_EXTEND
10003 || GET_CODE (x) == ZERO_EXTEND)
10004 return REG_P (XEXP (x, 0));
10005
10006 return false;
10007 }
10008
10009 static bool
10010 aarch64_frint_unspec_p (unsigned int u)
10011 {
10012 switch (u)
10013 {
10014 case UNSPEC_FRINTZ:
10015 case UNSPEC_FRINTP:
10016 case UNSPEC_FRINTM:
10017 case UNSPEC_FRINTA:
10018 case UNSPEC_FRINTN:
10019 case UNSPEC_FRINTX:
10020 case UNSPEC_FRINTI:
10021 return true;
10022
10023 default:
10024 return false;
10025 }
10026 }
10027
10028 /* Return true iff X is an rtx that will match an extr instruction
10029 i.e. as described in the *extr<mode>5_insn family of patterns.
10030 OP0 and OP1 will be set to the operands of the shifts involved
10031 on success and will be NULL_RTX otherwise. */
10032
10033 static bool
10034 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10035 {
10036 rtx op0, op1;
10037 scalar_int_mode mode;
10038 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10039 return false;
10040
10041 *res_op0 = NULL_RTX;
10042 *res_op1 = NULL_RTX;
10043
10044 if (GET_CODE (x) != IOR)
10045 return false;
10046
10047 op0 = XEXP (x, 0);
10048 op1 = XEXP (x, 1);
10049
10050 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10051 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10052 {
10053 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10054 if (GET_CODE (op1) == ASHIFT)
10055 std::swap (op0, op1);
10056
10057 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10058 return false;
10059
10060 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10061 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10062
10063 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10064 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10065 {
10066 *res_op0 = XEXP (op0, 0);
10067 *res_op1 = XEXP (op1, 0);
10068 return true;
10069 }
10070 }
10071
10072 return false;
10073 }
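/* As an illustrative example: in DImode,
   (ior (ashift x (const_int 16)) (lshiftrt y (const_int 48)))
   satisfies the check above (16 + 48 == 64) and would typically be
   emitted as "extr xd, x, y, #48".  */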
10074
10075 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10076 storing it in *COST. Result is true if the total cost of the operation
10077 has now been calculated. */
10078 static bool
10079 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10080 {
10081 rtx inner;
10082 rtx comparator;
10083 enum rtx_code cmpcode;
10084
10085 if (COMPARISON_P (op0))
10086 {
10087 inner = XEXP (op0, 0);
10088 comparator = XEXP (op0, 1);
10089 cmpcode = GET_CODE (op0);
10090 }
10091 else
10092 {
10093 inner = op0;
10094 comparator = const0_rtx;
10095 cmpcode = NE;
10096 }
10097
10098 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10099 {
10100 /* Conditional branch. */
10101 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10102 return true;
10103 else
10104 {
10105 if (cmpcode == NE || cmpcode == EQ)
10106 {
10107 if (comparator == const0_rtx)
10108 {
10109 /* TBZ/TBNZ/CBZ/CBNZ. */
10110 if (GET_CODE (inner) == ZERO_EXTRACT)
10111 /* TBZ/TBNZ. */
10112 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10113 ZERO_EXTRACT, 0, speed);
10114 else
10115 /* CBZ/CBNZ. */
10116 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10117
10118 return true;
10119 }
10120 }
10121 else if (cmpcode == LT || cmpcode == GE)
10122 {
10123 /* TBZ/TBNZ. */
10124 if (comparator == const0_rtx)
10125 return true;
10126 }
10127 }
10128 }
10129 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10130 {
10131 /* CCMP. */
10132 if (GET_CODE (op1) == COMPARE)
10133 {
10134 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10135 if (XEXP (op1, 1) == const0_rtx)
10136 *cost += 1;
10137 if (speed)
10138 {
10139 machine_mode mode = GET_MODE (XEXP (op1, 0));
10140 const struct cpu_cost_table *extra_cost
10141 = aarch64_tune_params.insn_extra_cost;
10142
10143 if (GET_MODE_CLASS (mode) == MODE_INT)
10144 *cost += extra_cost->alu.arith;
10145 else
10146 *cost += extra_cost->fp[mode == DFmode].compare;
10147 }
10148 return true;
10149 }
10150
10151 /* It's a conditional operation based on the status flags,
10152 so it must be some flavor of CSEL. */
10153
10154 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10155 if (GET_CODE (op1) == NEG
10156 || GET_CODE (op1) == NOT
10157 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10158 op1 = XEXP (op1, 0);
10159 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10160 {
10161 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10162 op1 = XEXP (op1, 0);
10163 op2 = XEXP (op2, 0);
10164 }
10165
10166 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10167 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10168 return true;
10169 }
10170
10171 /* We don't know what this is; cost all operands. */
10172 return false;
10173 }
10174
10175 /* Check whether X is a bitfield operation of the form shift + extend that
10176 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10177 operand to which the bitfield operation is applied. Otherwise return
10178 NULL_RTX. */
10179
10180 static rtx
10181 aarch64_extend_bitfield_pattern_p (rtx x)
10182 {
10183 rtx_code outer_code = GET_CODE (x);
10184 machine_mode outer_mode = GET_MODE (x);
10185
10186 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10187 && outer_mode != SImode && outer_mode != DImode)
10188 return NULL_RTX;
10189
10190 rtx inner = XEXP (x, 0);
10191 rtx_code inner_code = GET_CODE (inner);
10192 machine_mode inner_mode = GET_MODE (inner);
10193 rtx op = NULL_RTX;
10194
10195 switch (inner_code)
10196 {
10197 case ASHIFT:
10198 if (CONST_INT_P (XEXP (inner, 1))
10199 && (inner_mode == QImode || inner_mode == HImode))
10200 op = XEXP (inner, 0);
10201 break;
10202 case LSHIFTRT:
10203 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10204 && (inner_mode == QImode || inner_mode == HImode))
10205 op = XEXP (inner, 0);
10206 break;
10207 case ASHIFTRT:
10208 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10209 && (inner_mode == QImode || inner_mode == HImode))
10210 op = XEXP (inner, 0);
10211 break;
10212 default:
10213 break;
10214 }
10215
10216 return op;
10217 }
10218
10219 /* Return true if the mask and a shift amount from an RTX of the form
10220 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10221 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10222
10223 bool
10224 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10225 rtx shft_amnt)
10226 {
10227 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10228 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10229 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10230 && (INTVAL (mask)
10231 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10232 }
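/* An illustrative accepted case: in SImode, mask == 0xff00 and
   shft_amnt == 8 pass the checks above ((0xff00 >> 8) + 1 is a power
   of two and the low 8 mask bits are clear), matching a UBFIZ of an
   8-bit field placed at bit 8.  */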
10233
10234 /* Return true if the masks and a shift amount from an RTX of the form
10235 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10236 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
10237
10238 bool
10239 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10240 unsigned HOST_WIDE_INT mask1,
10241 unsigned HOST_WIDE_INT shft_amnt,
10242 unsigned HOST_WIDE_INT mask2)
10243 {
10244 unsigned HOST_WIDE_INT t;
10245
10246 /* Verify that there is no overlap in what bits are set in the two masks. */
10247 if (mask1 != ~mask2)
10248 return false;
10249
10250 /* Verify that mask2 is not all zeros or ones. */
10251 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10252 return false;
10253
10254 /* The shift amount should always be less than the mode size. */
10255 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10256
10257 /* Verify that the mask being shifted is contiguous and would be in the
10258 least significant bits after shifting by shft_amnt. */
10259 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10260 return (t == (t & -t));
10261 }
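/* An illustrative accepted case: mask2 == 0xff00, shft_amnt == 8 and
   mask1 == ~(unsigned HOST_WIDE_INT) 0xff00.  Then
   t == 0xff00 + 0x100 == 0x10000, a power of two, so the combination
   corresponds to a BFI inserting an 8-bit field at bit 8.  */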
10262
10263 /* Calculate the cost of calculating X, storing it in *COST. Result
10264 is true if the total cost of the operation has now been calculated. */
10265 static bool
10266 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10267 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10268 {
10269 rtx op0, op1, op2;
10270 const struct cpu_cost_table *extra_cost
10271 = aarch64_tune_params.insn_extra_cost;
10272 int code = GET_CODE (x);
10273 scalar_int_mode int_mode;
10274
10275 /* By default, assume that everything has equivalent cost to the
10276 cheapest instruction. Any additional costs are applied as a delta
10277 above this default. */
10278 *cost = COSTS_N_INSNS (1);
10279
10280 switch (code)
10281 {
10282 case SET:
10283 /* The cost depends entirely on the operands to SET. */
10284 *cost = 0;
10285 op0 = SET_DEST (x);
10286 op1 = SET_SRC (x);
10287
10288 switch (GET_CODE (op0))
10289 {
10290 case MEM:
10291 if (speed)
10292 {
10293 rtx address = XEXP (op0, 0);
10294 if (VECTOR_MODE_P (mode))
10295 *cost += extra_cost->ldst.storev;
10296 else if (GET_MODE_CLASS (mode) == MODE_INT)
10297 *cost += extra_cost->ldst.store;
10298 else if (mode == SFmode)
10299 *cost += extra_cost->ldst.storef;
10300 else if (mode == DFmode)
10301 *cost += extra_cost->ldst.stored;
10302
10303 *cost +=
10304 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10305 0, speed));
10306 }
10307
10308 *cost += rtx_cost (op1, mode, SET, 1, speed);
10309 return true;
10310
10311 case SUBREG:
10312 if (! REG_P (SUBREG_REG (op0)))
10313 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10314
10315 /* Fall through. */
10316 case REG:
10317 /* The cost is one per vector-register copied. */
10318 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10319 {
10320 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10321 *cost = COSTS_N_INSNS (nregs);
10322 }
10323 /* const0_rtx is in general free, but we will use an
10324 instruction to set a register to 0. */
10325 else if (REG_P (op1) || op1 == const0_rtx)
10326 {
10327 /* The cost is 1 per register copied. */
10328 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10329 *cost = COSTS_N_INSNS (nregs);
10330 }
10331 else
10332 /* Cost is just the cost of the RHS of the set. */
10333 *cost += rtx_cost (op1, mode, SET, 1, speed);
10334 return true;
10335
10336 case ZERO_EXTRACT:
10337 case SIGN_EXTRACT:
10338 /* Bit-field insertion. Strip any redundant widening of
10339 the RHS to meet the width of the target. */
10340 if (GET_CODE (op1) == SUBREG)
10341 op1 = SUBREG_REG (op1);
10342 if ((GET_CODE (op1) == ZERO_EXTEND
10343 || GET_CODE (op1) == SIGN_EXTEND)
10344 && CONST_INT_P (XEXP (op0, 1))
10345 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10346 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10347 op1 = XEXP (op1, 0);
10348
10349 if (CONST_INT_P (op1))
10350 {
10351 /* MOV immediate is assumed to always be cheap. */
10352 *cost = COSTS_N_INSNS (1);
10353 }
10354 else
10355 {
10356 /* BFM. */
10357 if (speed)
10358 *cost += extra_cost->alu.bfi;
10359 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10360 }
10361
10362 return true;
10363
10364 default:
10365 /* We can't make sense of this, assume default cost. */
10366 *cost = COSTS_N_INSNS (1);
10367 return false;
10368 }
10369 return false;
10370
10371 case CONST_INT:
10372 /* If an instruction can incorporate a constant within the
10373 instruction, the instruction's expression avoids calling
10374 rtx_cost() on the constant. If rtx_cost() is called on a
10375 constant, then it is usually because the constant must be
10376 moved into a register by one or more instructions.
10377
10378 The exception is constant 0, which can be expressed
10379 as XZR/WZR and is therefore free. The exception to this is
10380 if we have (set (reg) (const0_rtx)) in which case we must cost
10381 the move. However, we can catch that when we cost the SET, so
10382 we don't need to consider that here. */
10383 if (x == const0_rtx)
10384 *cost = 0;
10385 else
10386 {
10387 /* To an approximation, building any other constant is
10388 proportionally expensive to the number of instructions
10389 required to build that constant. This is true whether we
10390 are compiling for SPEED or otherwise. */
10391 if (!is_a <scalar_int_mode> (mode, &int_mode))
10392 int_mode = word_mode;
10393 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10394 (NULL_RTX, x, false, int_mode));
10395 }
10396 return true;
10397
10398 case CONST_DOUBLE:
10399
10400 /* First determine number of instructions to do the move
10401 as an integer constant. */
10402 if (!aarch64_float_const_representable_p (x)
10403 && !aarch64_can_const_movi_rtx_p (x, mode)
10404 && aarch64_float_const_rtx_p (x))
10405 {
10406 unsigned HOST_WIDE_INT ival;
10407 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10408 gcc_assert (succeed);
10409
10410 scalar_int_mode imode = (mode == HFmode
10411 ? SImode
10412 : int_mode_for_mode (mode).require ());
10413 int ncost = aarch64_internal_mov_immediate
10414 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10415 *cost += COSTS_N_INSNS (ncost);
10416 return true;
10417 }
10418
10419 if (speed)
10420 {
10421 /* mov[df,sf]_aarch64. */
10422 if (aarch64_float_const_representable_p (x))
10423 /* FMOV (scalar immediate). */
10424 *cost += extra_cost->fp[mode == DFmode].fpconst;
10425 else if (!aarch64_float_const_zero_rtx_p (x))
10426 {
10427 /* This will be a load from memory. */
10428 if (mode == DFmode)
10429 *cost += extra_cost->ldst.loadd;
10430 else
10431 *cost += extra_cost->ldst.loadf;
10432 }
10433 else
10434 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10435 or MOV v0.s[0], wzr - neither of which is modeled by the
10436 cost tables. Just use the default cost. */
10437 {
10438 }
10439 }
10440
10441 return true;
10442
10443 case MEM:
10444 if (speed)
10445 {
10446 /* For loads we want the base cost of a load, plus an
10447 approximation for the additional cost of the addressing
10448 mode. */
10449 rtx address = XEXP (x, 0);
10450 if (VECTOR_MODE_P (mode))
10451 *cost += extra_cost->ldst.loadv;
10452 else if (GET_MODE_CLASS (mode) == MODE_INT)
10453 *cost += extra_cost->ldst.load;
10454 else if (mode == SFmode)
10455 *cost += extra_cost->ldst.loadf;
10456 else if (mode == DFmode)
10457 *cost += extra_cost->ldst.loadd;
10458
10459 *cost +=
10460 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10461 0, speed));
10462 }
10463
10464 return true;
10465
10466 case NEG:
10467 op0 = XEXP (x, 0);
10468
10469 if (VECTOR_MODE_P (mode))
10470 {
10471 if (speed)
10472 {
10473 /* FNEG. */
10474 *cost += extra_cost->vect.alu;
10475 }
10476 return false;
10477 }
10478
10479 if (GET_MODE_CLASS (mode) == MODE_INT)
10480 {
10481 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10482 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10483 {
10484 /* CSETM. */
10485 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10486 return true;
10487 }
10488
10489 /* Cost this as SUB wzr, X. */
10490 op0 = CONST0_RTX (mode);
10491 op1 = XEXP (x, 0);
10492 goto cost_minus;
10493 }
10494
10495 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10496 {
10497 /* Support (neg(fma...)) as a single instruction only if
10498 sign of zeros is unimportant. This matches the decision
10499 making in aarch64.md. */
10500 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10501 {
10502 /* FNMADD. */
10503 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10504 return true;
10505 }
10506 if (GET_CODE (op0) == MULT)
10507 {
10508 /* FNMUL. */
10509 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10510 return true;
10511 }
10512 if (speed)
10513 /* FNEG. */
10514 *cost += extra_cost->fp[mode == DFmode].neg;
10515 return false;
10516 }
10517
10518 return false;
10519
10520 case CLRSB:
10521 case CLZ:
10522 if (speed)
10523 {
10524 if (VECTOR_MODE_P (mode))
10525 *cost += extra_cost->vect.alu;
10526 else
10527 *cost += extra_cost->alu.clz;
10528 }
10529
10530 return false;
10531
10532 case COMPARE:
10533 op0 = XEXP (x, 0);
10534 op1 = XEXP (x, 1);
10535
10536 if (op1 == const0_rtx
10537 && GET_CODE (op0) == AND)
10538 {
10539 x = op0;
10540 mode = GET_MODE (op0);
10541 goto cost_logic;
10542 }
10543
10544 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10545 {
10546 /* TODO: A write to the CC flags possibly costs extra; this
10547 needs encoding in the cost tables. */
10548
10549 mode = GET_MODE (op0);
10550 /* ANDS. */
10551 if (GET_CODE (op0) == AND)
10552 {
10553 x = op0;
10554 goto cost_logic;
10555 }
10556
10557 if (GET_CODE (op0) == PLUS)
10558 {
10559 /* ADDS (and CMN alias). */
10560 x = op0;
10561 goto cost_plus;
10562 }
10563
10564 if (GET_CODE (op0) == MINUS)
10565 {
10566 /* SUBS. */
10567 x = op0;
10568 goto cost_minus;
10569 }
10570
10571 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10572 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10573 && CONST_INT_P (XEXP (op0, 2)))
10574 {
10575 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10576 Handle it here directly rather than going to cost_logic
10577 since we know the immediate generated for the TST is valid,
10578 so we can avoid creating an intermediate rtx for it only
10579 for costing purposes. */
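	    /* For example (register name purely illustrative), a COMPARE such as
		 (compare:CC_NZ (zero_extract:DI (reg:DI x0)
						 (const_int 8)
						 (const_int 0))
				(const_int 0))
	       is expected to be emitted as a single "tst x0, #0xff", hence
	       the one logical-instruction cost added below.  */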
10580 if (speed)
10581 *cost += extra_cost->alu.logical;
10582
10583 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10584 ZERO_EXTRACT, 0, speed);
10585 return true;
10586 }
10587
10588 if (GET_CODE (op1) == NEG)
10589 {
10590 /* CMN. */
10591 if (speed)
10592 *cost += extra_cost->alu.arith;
10593
10594 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10595 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10596 return true;
10597 }
10598
10599 /* CMP.
10600
10601 Compare can freely swap the order of operands, and
10602 canonicalization puts the more complex operation first.
10603 But the integer MINUS logic expects the shift/extend
10604 operation in op1. */
10605 if (! (REG_P (op0)
10606 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10607 {
10608 op0 = XEXP (x, 1);
10609 op1 = XEXP (x, 0);
10610 }
10611 goto cost_minus;
10612 }
10613
10614 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10615 {
10616 /* FCMP. */
10617 if (speed)
10618 *cost += extra_cost->fp[mode == DFmode].compare;
10619
10620 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10621 {
10622 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10623 /* FCMP supports constant 0.0 for no extra cost. */
10624 return true;
10625 }
10626 return false;
10627 }
10628
10629 if (VECTOR_MODE_P (mode))
10630 {
10631 /* Vector compare. */
10632 if (speed)
10633 *cost += extra_cost->vect.alu;
10634
10635 if (aarch64_float_const_zero_rtx_p (op1))
10636 {
10637 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10638 cost. */
10639 return true;
10640 }
10641 return false;
10642 }
10643 return false;
10644
10645 case MINUS:
10646 {
10647 op0 = XEXP (x, 0);
10648 op1 = XEXP (x, 1);
10649
10650 cost_minus:
10651 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10652
10653 /* Detect valid immediates. */
10654 if ((GET_MODE_CLASS (mode) == MODE_INT
10655 || (GET_MODE_CLASS (mode) == MODE_CC
10656 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10657 && CONST_INT_P (op1)
10658 && aarch64_uimm12_shift (INTVAL (op1)))
10659 {
10660 if (speed)
10661 /* SUB(S) (immediate). */
10662 *cost += extra_cost->alu.arith;
10663 return true;
10664 }
10665
10666 /* Look for SUB (extended register). */
10667 if (is_a <scalar_int_mode> (mode, &int_mode)
10668 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10669 {
10670 if (speed)
10671 *cost += extra_cost->alu.extend_arith;
10672
10673 op1 = aarch64_strip_extend (op1, true);
10674 *cost += rtx_cost (op1, VOIDmode,
10675 (enum rtx_code) GET_CODE (op1), 0, speed);
10676 return true;
10677 }
10678
10679 rtx new_op1 = aarch64_strip_extend (op1, false);
10680
10681 /* Cost this as an FMA-alike operation. */
10682 if ((GET_CODE (new_op1) == MULT
10683 || aarch64_shift_p (GET_CODE (new_op1)))
10684 && code != COMPARE)
10685 {
10686 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10687 (enum rtx_code) code,
10688 speed);
10689 return true;
10690 }
10691
10692 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10693
10694 if (speed)
10695 {
10696 if (VECTOR_MODE_P (mode))
10697 {
10698 /* Vector SUB. */
10699 *cost += extra_cost->vect.alu;
10700 }
10701 else if (GET_MODE_CLASS (mode) == MODE_INT)
10702 {
10703 /* SUB(S). */
10704 *cost += extra_cost->alu.arith;
10705 }
10706 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10707 {
10708 /* FSUB. */
10709 *cost += extra_cost->fp[mode == DFmode].addsub;
10710 }
10711 }
10712 return true;
10713 }
10714
10715 case PLUS:
10716 {
10717 rtx new_op0;
10718
10719 op0 = XEXP (x, 0);
10720 op1 = XEXP (x, 1);
10721
10722 cost_plus:
10723 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10724 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10725 {
10726 /* CSINC. */
10727 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10728 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10729 return true;
10730 }
10731
10732 if (GET_MODE_CLASS (mode) == MODE_INT
10733 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10734 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10735 {
10736 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10737
10738 if (speed)
10739 /* ADD (immediate). */
10740 *cost += extra_cost->alu.arith;
10741 return true;
10742 }
10743
10744 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10745
10746 /* Look for ADD (extended register). */
10747 if (is_a <scalar_int_mode> (mode, &int_mode)
10748 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10749 {
10750 if (speed)
10751 *cost += extra_cost->alu.extend_arith;
10752
10753 op0 = aarch64_strip_extend (op0, true);
10754 *cost += rtx_cost (op0, VOIDmode,
10755 (enum rtx_code) GET_CODE (op0), 0, speed);
10756 return true;
10757 }
10758
10759 /* Strip any extend, leave shifts behind as we will
10760 cost them through mult_cost. */
10761 new_op0 = aarch64_strip_extend (op0, false);
10762
10763 if (GET_CODE (new_op0) == MULT
10764 || aarch64_shift_p (GET_CODE (new_op0)))
10765 {
10766 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10767 speed);
10768 return true;
10769 }
10770
10771 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10772
10773 if (speed)
10774 {
10775 if (VECTOR_MODE_P (mode))
10776 {
10777 /* Vector ADD. */
10778 *cost += extra_cost->vect.alu;
10779 }
10780 else if (GET_MODE_CLASS (mode) == MODE_INT)
10781 {
10782 /* ADD. */
10783 *cost += extra_cost->alu.arith;
10784 }
10785 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10786 {
10787 /* FADD. */
10788 *cost += extra_cost->fp[mode == DFmode].addsub;
10789 }
10790 }
10791 return true;
10792 }
10793
10794 case BSWAP:
10795 *cost = COSTS_N_INSNS (1);
10796
10797 if (speed)
10798 {
10799 if (VECTOR_MODE_P (mode))
10800 *cost += extra_cost->vect.alu;
10801 else
10802 *cost += extra_cost->alu.rev;
10803 }
10804 return false;
10805
10806 case IOR:
10807 if (aarch_rev16_p (x))
10808 {
10809 *cost = COSTS_N_INSNS (1);
10810
10811 if (speed)
10812 {
10813 if (VECTOR_MODE_P (mode))
10814 *cost += extra_cost->vect.alu;
10815 else
10816 *cost += extra_cost->alu.rev;
10817 }
10818 return true;
10819 }
10820
10821 if (aarch64_extr_rtx_p (x, &op0, &op1))
10822 {
10823 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10824 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10825 if (speed)
10826 *cost += extra_cost->alu.shift;
10827
10828 return true;
10829 }
10830 /* Fall through. */
10831 case XOR:
10832 case AND:
10833 cost_logic:
10834 op0 = XEXP (x, 0);
10835 op1 = XEXP (x, 1);
10836
10837 if (VECTOR_MODE_P (mode))
10838 {
10839 if (speed)
10840 *cost += extra_cost->vect.alu;
10841 return true;
10842 }
10843
10844 if (code == AND
10845 && GET_CODE (op0) == MULT
10846 && CONST_INT_P (XEXP (op0, 1))
10847 && CONST_INT_P (op1)
10848 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10849 INTVAL (op1)) != 0)
10850 {
10851 /* This is a UBFM/SBFM. */
10852 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10853 if (speed)
10854 *cost += extra_cost->alu.bfx;
10855 return true;
10856 }
10857
10858 if (is_int_mode (mode, &int_mode))
10859 {
10860 if (CONST_INT_P (op1))
10861 {
10862 /* We have a mask + shift version of a UBFIZ
10863 i.e. the *andim_ashift<mode>_bfiz pattern. */
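	      /* For example (operands purely illustrative), the combination
		   (and:SI (ashift:SI (reg:SI w1) (const_int 4))
			   (const_int 0xff0))
		 would be expected to match that pattern and become a single
		 "ubfiz w0, w1, #4, #8".  */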
10864 if (GET_CODE (op0) == ASHIFT
10865 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10866 XEXP (op0, 1)))
10867 {
10868 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10869 (enum rtx_code) code, 0, speed);
10870 if (speed)
10871 *cost += extra_cost->alu.bfx;
10872
10873 return true;
10874 }
10875 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10876 {
10877 /* We possibly get the immediate for free; this is not
10878 modelled. */
10879 *cost += rtx_cost (op0, int_mode,
10880 (enum rtx_code) code, 0, speed);
10881 if (speed)
10882 *cost += extra_cost->alu.logical;
10883
10884 return true;
10885 }
10886 }
10887 else
10888 {
10889 rtx new_op0 = op0;
10890
10891 /* Handle ORN, EON, or BIC. */
10892 if (GET_CODE (op0) == NOT)
10893 op0 = XEXP (op0, 0);
10894
10895 new_op0 = aarch64_strip_shift (op0);
10896
10897 /* If we had a shift on op0 then this is a logical-shift-
10898 by-register/immediate operation. Otherwise, this is just
10899 a logical operation. */
10900 if (speed)
10901 {
10902 if (new_op0 != op0)
10903 {
10904 /* Shift by immediate. */
10905 if (CONST_INT_P (XEXP (op0, 1)))
10906 *cost += extra_cost->alu.log_shift;
10907 else
10908 *cost += extra_cost->alu.log_shift_reg;
10909 }
10910 else
10911 *cost += extra_cost->alu.logical;
10912 }
10913
10914 /* In both cases we want to cost both operands. */
10915 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10916 0, speed);
10917 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10918 1, speed);
10919
10920 return true;
10921 }
10922 }
10923 return false;
10924
10925 case NOT:
10926 x = XEXP (x, 0);
10927 op0 = aarch64_strip_shift (x);
10928
10929 if (VECTOR_MODE_P (mode))
10930 {
10931 /* Vector NOT. */
10932 *cost += extra_cost->vect.alu;
10933 return false;
10934 }
10935
10936 /* MVN-shifted-reg. */
10937 if (op0 != x)
10938 {
10939 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10940
10941 if (speed)
10942 *cost += extra_cost->alu.log_shift;
10943
10944 return true;
10945 }
10946 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10947 Handle the second form here taking care that 'a' in the above can
10948 be a shift. */
10949 else if (GET_CODE (op0) == XOR)
10950 {
10951 rtx newop0 = XEXP (op0, 0);
10952 rtx newop1 = XEXP (op0, 1);
10953 rtx op0_stripped = aarch64_strip_shift (newop0);
10954
10955 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10956 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10957
10958 if (speed)
10959 {
10960 if (op0_stripped != newop0)
10961 *cost += extra_cost->alu.log_shift;
10962 else
10963 *cost += extra_cost->alu.logical;
10964 }
10965
10966 return true;
10967 }
10968 /* MVN. */
10969 if (speed)
10970 *cost += extra_cost->alu.logical;
10971
10972 return false;
10973
10974 case ZERO_EXTEND:
10975
10976 op0 = XEXP (x, 0);
10977 /* If a value is written in SI mode, then zero extended to DI
10978 mode, the operation will in general be free as a write to
10979 a 'w' register implicitly zeroes the upper bits of an 'x'
10980 register. However, if this is
10981
10982 (set (reg) (zero_extend (reg)))
10983
10984 we must cost the explicit register move. */
10985 if (mode == DImode
10986 && GET_MODE (op0) == SImode
10987 && outer == SET)
10988 {
10989 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10990
10991 /* If OP_COST is non-zero, then the cost of the zero extend
10992 is effectively the cost of the inner operation. Otherwise
10993 we have a MOV instruction and we take the cost from the MOV
10994 itself. This is true independently of whether we are
10995 optimizing for space or time. */
10996 if (op_cost)
10997 *cost = op_cost;
10998
10999 return true;
11000 }
11001 else if (MEM_P (op0))
11002 {
11003 /* All loads can zero extend to any size for free. */
11004 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11005 return true;
11006 }
11007
11008 op0 = aarch64_extend_bitfield_pattern_p (x);
11009 if (op0)
11010 {
11011 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11012 if (speed)
11013 *cost += extra_cost->alu.bfx;
11014 return true;
11015 }
11016
11017 if (speed)
11018 {
11019 if (VECTOR_MODE_P (mode))
11020 {
11021 /* UMOV. */
11022 *cost += extra_cost->vect.alu;
11023 }
11024 else
11025 {
11026 /* We generate an AND instead of UXTB/UXTH. */
11027 *cost += extra_cost->alu.logical;
11028 }
11029 }
11030 return false;
11031
11032 case SIGN_EXTEND:
11033 if (MEM_P (XEXP (x, 0)))
11034 {
11035 /* LDRSH. */
11036 if (speed)
11037 {
11038 rtx address = XEXP (XEXP (x, 0), 0);
11039 *cost += extra_cost->ldst.load_sign_extend;
11040
11041 *cost +=
11042 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11043 0, speed));
11044 }
11045 return true;
11046 }
11047
11048 op0 = aarch64_extend_bitfield_pattern_p (x);
11049 if (op0)
11050 {
11051 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11052 if (speed)
11053 *cost += extra_cost->alu.bfx;
11054 return true;
11055 }
11056
11057 if (speed)
11058 {
11059 if (VECTOR_MODE_P (mode))
11060 *cost += extra_cost->vect.alu;
11061 else
11062 *cost += extra_cost->alu.extend;
11063 }
11064 return false;
11065
11066 case ASHIFT:
11067 op0 = XEXP (x, 0);
11068 op1 = XEXP (x, 1);
11069
11070 if (CONST_INT_P (op1))
11071 {
11072 if (speed)
11073 {
11074 if (VECTOR_MODE_P (mode))
11075 {
11076 /* Vector shift (immediate). */
11077 *cost += extra_cost->vect.alu;
11078 }
11079 else
11080 {
11081 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11082 aliases. */
11083 *cost += extra_cost->alu.shift;
11084 }
11085 }
11086
11087 /* We can incorporate zero/sign extend for free. */
11088 if (GET_CODE (op0) == ZERO_EXTEND
11089 || GET_CODE (op0) == SIGN_EXTEND)
11090 op0 = XEXP (op0, 0);
11091
11092 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11093 return true;
11094 }
11095 else
11096 {
11097 if (VECTOR_MODE_P (mode))
11098 {
11099 if (speed)
11100 /* Vector shift (register). */
11101 *cost += extra_cost->vect.alu;
11102 }
11103 else
11104 {
11105 if (speed)
11106 /* LSLV. */
11107 *cost += extra_cost->alu.shift_reg;
11108
11109 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11110 && CONST_INT_P (XEXP (op1, 1))
11111 && known_eq (INTVAL (XEXP (op1, 1)),
11112 GET_MODE_BITSIZE (mode) - 1))
11113 {
11114 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11115 /* We already demanded XEXP (op1, 0) to be REG_P, so
11116 don't recurse into it. */
11117 return true;
11118 }
11119 }
11120 return false; /* All arguments need to be in registers. */
11121 }
11122
11123 case ROTATE:
11124 case ROTATERT:
11125 case LSHIFTRT:
11126 case ASHIFTRT:
11127 op0 = XEXP (x, 0);
11128 op1 = XEXP (x, 1);
11129
11130 if (CONST_INT_P (op1))
11131 {
11132 /* ASR (immediate) and friends. */
11133 if (speed)
11134 {
11135 if (VECTOR_MODE_P (mode))
11136 *cost += extra_cost->vect.alu;
11137 else
11138 *cost += extra_cost->alu.shift;
11139 }
11140
11141 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11142 return true;
11143 }
11144 else
11145 {
11146 if (VECTOR_MODE_P (mode))
11147 {
11148 if (speed)
11149 /* Vector shift (register). */
11150 *cost += extra_cost->vect.alu;
11151 }
11152 else
11153 {
11154 if (speed)
11155 /* ASR (register) and friends. */
11156 *cost += extra_cost->alu.shift_reg;
11157
11158 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11159 && CONST_INT_P (XEXP (op1, 1))
11160 && known_eq (INTVAL (XEXP (op1, 1)),
11161 GET_MODE_BITSIZE (mode) - 1))
11162 {
11163 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11164 /* We already demanded XEXP (op1, 0) to be REG_P, so
11165 don't recurse into it. */
11166 return true;
11167 }
11168 }
11169 return false; /* All arguments need to be in registers. */
11170 }
11171
11172 case SYMBOL_REF:
11173
11174 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11175 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11176 {
11177 /* LDR. */
11178 if (speed)
11179 *cost += extra_cost->ldst.load;
11180 }
11181 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11182 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11183 {
11184 /* ADRP, followed by ADD. */
11185 *cost += COSTS_N_INSNS (1);
11186 if (speed)
11187 *cost += 2 * extra_cost->alu.arith;
11188 }
11189 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11190 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11191 {
11192 /* ADR. */
11193 if (speed)
11194 *cost += extra_cost->alu.arith;
11195 }
11196
11197 if (flag_pic)
11198 {
11199 /* One extra load instruction, after accessing the GOT. */
11200 *cost += COSTS_N_INSNS (1);
11201 if (speed)
11202 *cost += extra_cost->ldst.load;
11203 }
11204 return true;
11205
11206 case HIGH:
11207 case LO_SUM:
11208 /* ADRP/ADD (immediate). */
11209 if (speed)
11210 *cost += extra_cost->alu.arith;
11211 return true;
11212
11213 case ZERO_EXTRACT:
11214 case SIGN_EXTRACT:
11215 /* UBFX/SBFX. */
11216 if (speed)
11217 {
11218 if (VECTOR_MODE_P (mode))
11219 *cost += extra_cost->vect.alu;
11220 else
11221 *cost += extra_cost->alu.bfx;
11222 }
11223
11224 /* We can trust that the immediates used will be correct (there
11225 are no by-register forms), so we need only cost op0. */
11226 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11227 return true;
11228
11229 case MULT:
11230 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11231 /* aarch64_rtx_mult_cost always handles recursion to its
11232 operands. */
11233 return true;
11234
11235 case MOD:
11236 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
11237 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
11238 that of an unconditional negate. This case should only ever be reached
11239 through the set_smod_pow2_cheap check in expmed.c. */
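	  /* A sketch of the expected expansion for SImode "x % 4", with
	     register and condition choices purely illustrative:
		negs	w1, w0
		and	w0, w0, 3
		and	w1, w1, 3
		csneg	w0, w0, w1, mi
	     i.e. two logical and two arithmetic instructions, which is what
	     is costed below.  */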
11240 if (CONST_INT_P (XEXP (x, 1))
11241 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11242 && (mode == SImode || mode == DImode))
11243 {
11244 /* We expand to 4 instructions. Reset the baseline. */
11245 *cost = COSTS_N_INSNS (4);
11246
11247 if (speed)
11248 *cost += 2 * extra_cost->alu.logical
11249 + 2 * extra_cost->alu.arith;
11250
11251 return true;
11252 }
11253
11254 /* Fall-through. */
11255 case UMOD:
11256 if (speed)
11257 {
11258 /* Slightly prefer UMOD over SMOD. */
11259 if (VECTOR_MODE_P (mode))
11260 *cost += extra_cost->vect.alu;
11261 else if (GET_MODE_CLASS (mode) == MODE_INT)
11262 *cost += (extra_cost->mult[mode == DImode].add
11263 + extra_cost->mult[mode == DImode].idiv
11264 + (code == MOD ? 1 : 0));
11265 }
11266 return false; /* All arguments need to be in registers. */
11267
11268 case DIV:
11269 case UDIV:
11270 case SQRT:
11271 if (speed)
11272 {
11273 if (VECTOR_MODE_P (mode))
11274 *cost += extra_cost->vect.alu;
11275 else if (GET_MODE_CLASS (mode) == MODE_INT)
11276 /* There is no integer SQRT, so only DIV and UDIV can get
11277 here. */
11278 *cost += (extra_cost->mult[mode == DImode].idiv
11279 /* Slightly prefer UDIV over SDIV. */
11280 + (code == DIV ? 1 : 0));
11281 else
11282 *cost += extra_cost->fp[mode == DFmode].div;
11283 }
11284 return false; /* All arguments need to be in registers. */
11285
11286 case IF_THEN_ELSE:
11287 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11288 XEXP (x, 2), cost, speed);
11289
11290 case EQ:
11291 case NE:
11292 case GT:
11293 case GTU:
11294 case LT:
11295 case LTU:
11296 case GE:
11297 case GEU:
11298 case LE:
11299 case LEU:
11300
11301 return false; /* All arguments must be in registers. */
11302
11303 case FMA:
11304 op0 = XEXP (x, 0);
11305 op1 = XEXP (x, 1);
11306 op2 = XEXP (x, 2);
11307
11308 if (speed)
11309 {
11310 if (VECTOR_MODE_P (mode))
11311 *cost += extra_cost->vect.alu;
11312 else
11313 *cost += extra_cost->fp[mode == DFmode].fma;
11314 }
11315
11316 /* FMSUB, FNMADD, and FNMSUB are free. */
11317 if (GET_CODE (op0) == NEG)
11318 op0 = XEXP (op0, 0);
11319
11320 if (GET_CODE (op2) == NEG)
11321 op2 = XEXP (op2, 0);
11322
11323 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11324 and the by-element operand as operand 0. */
11325 if (GET_CODE (op1) == NEG)
11326 op1 = XEXP (op1, 0);
11327
11328 /* Catch vector-by-element operations. The by-element operand can
11329 either be (vec_duplicate (vec_select (x))) or just
11330 (vec_select (x)), depending on whether we are multiplying by
11331 a vector or a scalar.
11332
11333 Canonicalization is not very good in these cases: FMA4 will put the
11334 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
11335 if (GET_CODE (op0) == VEC_DUPLICATE)
11336 op0 = XEXP (op0, 0);
11337 else if (GET_CODE (op1) == VEC_DUPLICATE)
11338 op1 = XEXP (op1, 0);
11339
11340 if (GET_CODE (op0) == VEC_SELECT)
11341 op0 = XEXP (op0, 0);
11342 else if (GET_CODE (op1) == VEC_SELECT)
11343 op1 = XEXP (op1, 0);
11344
11345 /* If the remaining parameters are not registers,
11346 get the cost to put them into registers. */
11347 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11348 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11349 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11350 return true;
11351
11352 case FLOAT:
11353 case UNSIGNED_FLOAT:
11354 if (speed)
11355 *cost += extra_cost->fp[mode == DFmode].fromint;
11356 return false;
11357
11358 case FLOAT_EXTEND:
11359 if (speed)
11360 {
11361 if (VECTOR_MODE_P (mode))
11362 {
11363 /* Vector widening conversion. */
11364 *cost += extra_cost->vect.alu;
11365 }
11366 else
11367 *cost += extra_cost->fp[mode == DFmode].widen;
11368 }
11369 return false;
11370
11371 case FLOAT_TRUNCATE:
11372 if (speed)
11373 {
11374 if (VECTOR_MODE_P (mode))
11375 {
11376 /* Vector conversion. */
11377 *cost += extra_cost->vect.alu;
11378 }
11379 else
11380 *cost += extra_cost->fp[mode == DFmode].narrow;
11381 }
11382 return false;
11383
11384 case FIX:
11385 case UNSIGNED_FIX:
11386 x = XEXP (x, 0);
11387 /* Strip the rounding part. They will all be implemented
11388 by the fcvt* family of instructions anyway. */
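	  /* For example (operands purely illustrative),
	     (fix:SI (unspec:SF [(reg:SF s0)] UNSPEC_FRINTM)) is expected to
	     be implemented as a single "fcvtms w0, s0", so the rounding
	     UNSPEC itself adds no extra cost here.  */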
11389 if (GET_CODE (x) == UNSPEC)
11390 {
11391 unsigned int uns_code = XINT (x, 1);
11392
11393 if (uns_code == UNSPEC_FRINTA
11394 || uns_code == UNSPEC_FRINTM
11395 || uns_code == UNSPEC_FRINTN
11396 || uns_code == UNSPEC_FRINTP
11397 || uns_code == UNSPEC_FRINTZ)
11398 x = XVECEXP (x, 0, 0);
11399 }
11400
11401 if (speed)
11402 {
11403 if (VECTOR_MODE_P (mode))
11404 *cost += extra_cost->vect.alu;
11405 else
11406 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11407 }
11408
11409 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11410 fixed-point fcvt. */
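	  /* For example (operands purely illustrative),
	     (fix:SI (mult:SF (reg:SF s0) (const_double 16.0))) can become a
	     single fixed-point convert such as "fcvtzs w0, s0, #4", since the
	     multiply by 2^4 is folded into the conversion.  */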
11411 if (GET_CODE (x) == MULT
11412 && ((VECTOR_MODE_P (mode)
11413 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11414 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11415 {
11416 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11417 0, speed);
11418 return true;
11419 }
11420
11421 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11422 return true;
11423
11424 case ABS:
11425 if (VECTOR_MODE_P (mode))
11426 {
11427 /* ABS (vector). */
11428 if (speed)
11429 *cost += extra_cost->vect.alu;
11430 }
11431 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11432 {
11433 op0 = XEXP (x, 0);
11434
11435 /* FABD, which is analogous to FADD. */
11436 if (GET_CODE (op0) == MINUS)
11437 {
11438 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11439 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11440 if (speed)
11441 *cost += extra_cost->fp[mode == DFmode].addsub;
11442
11443 return true;
11444 }
11445 /* Simple FABS is analogous to FNEG. */
11446 if (speed)
11447 *cost += extra_cost->fp[mode == DFmode].neg;
11448 }
11449 else
11450 {
11451 /* Integer ABS will either be split into
11452 two arithmetic instructions, or will be an ABS
11453 (scalar), which we don't model. */
11454 *cost = COSTS_N_INSNS (2);
11455 if (speed)
11456 *cost += 2 * extra_cost->alu.arith;
11457 }
11458 return false;
11459
11460 case SMAX:
11461 case SMIN:
11462 if (speed)
11463 {
11464 if (VECTOR_MODE_P (mode))
11465 *cost += extra_cost->vect.alu;
11466 else
11467 {
11468 /* FMAXNM/FMINNM/FMAX/FMIN.
11469 TODO: This may not be accurate for all implementations, but
11470 we do not model this in the cost tables. */
11471 *cost += extra_cost->fp[mode == DFmode].addsub;
11472 }
11473 }
11474 return false;
11475
11476 case UNSPEC:
11477 /* The floating point round to integer frint* instructions. */
11478 if (aarch64_frint_unspec_p (XINT (x, 1)))
11479 {
11480 if (speed)
11481 *cost += extra_cost->fp[mode == DFmode].roundint;
11482
11483 return false;
11484 }
11485
11486 if (XINT (x, 1) == UNSPEC_RBIT)
11487 {
11488 if (speed)
11489 *cost += extra_cost->alu.rev;
11490
11491 return false;
11492 }
11493 break;
11494
11495 case TRUNCATE:
11496
11497 /* Decompose <su>muldi3_highpart. */
11498 if (/* (truncate:DI */
11499 mode == DImode
11500 /* (lshiftrt:TI */
11501 && GET_MODE (XEXP (x, 0)) == TImode
11502 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11503 /* (mult:TI */
11504 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11505 /* (ANY_EXTEND:TI (reg:DI))
11506 (ANY_EXTEND:TI (reg:DI))) */
11507 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11508 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11509 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11510 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11511 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11512 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11513 /* (const_int 64) */
11514 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11515 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11516 {
11517 /* UMULH/SMULH. */
11518 if (speed)
11519 *cost += extra_cost->mult[mode == DImode].extend;
11520 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11521 mode, MULT, 0, speed);
11522 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11523 mode, MULT, 1, speed);
11524 return true;
11525 }
11526
11527 /* Fall through. */
11528 default:
11529 break;
11530 }
11531
11532 if (dump_file
11533 && flag_aarch64_verbose_cost)
11534 fprintf (dump_file,
11535 "\nFailed to cost RTX. Assuming default cost.\n");
11536
11537 return true;
11538 }
11539
11540 /* Wrapper around aarch64_rtx_costs. Dumps the partial or total cost
11541 calculated for X. This cost is stored in *COST. Returns true
11542 if the total cost of X was calculated. */
11543 static bool
11544 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11545 int param, int *cost, bool speed)
11546 {
11547 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11548
11549 if (dump_file
11550 && flag_aarch64_verbose_cost)
11551 {
11552 print_rtl_single (dump_file, x);
11553 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11554 speed ? "Hot" : "Cold",
11555 *cost, result ? "final" : "partial");
11556 }
11557
11558 return result;
11559 }
11560
11561 static int
11562 aarch64_register_move_cost (machine_mode mode,
11563 reg_class_t from_i, reg_class_t to_i)
11564 {
11565 enum reg_class from = (enum reg_class) from_i;
11566 enum reg_class to = (enum reg_class) to_i;
11567 const struct cpu_regmove_cost *regmove_cost
11568 = aarch64_tune_params.regmove_cost;
11569
11570 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11571 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11572 to = GENERAL_REGS;
11573
11574 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11575 from = GENERAL_REGS;
11576
11577 /* Moving between a GPR and the stack costs the same as GP2GP. */
11578 if ((from == GENERAL_REGS && to == STACK_REG)
11579 || (to == GENERAL_REGS && from == STACK_REG))
11580 return regmove_cost->GP2GP;
11581
11582 /* To/From the stack register, we move via the gprs. */
11583 if (to == STACK_REG || from == STACK_REG)
11584 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11585 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11586
11587 if (known_eq (GET_MODE_SIZE (mode), 16))
11588 {
11589 /* 128-bit operations on general registers require 2 instructions. */
11590 if (from == GENERAL_REGS && to == GENERAL_REGS)
11591 return regmove_cost->GP2GP * 2;
11592 else if (from == GENERAL_REGS)
11593 return regmove_cost->GP2FP * 2;
11594 else if (to == GENERAL_REGS)
11595 return regmove_cost->FP2GP * 2;
11596
11597 /* When AdvSIMD instructions are disabled it is not possible to move
11598 a 128-bit value directly between Q registers. This is handled in
11599 secondary reload. A general register is used as a scratch to move
11600 the upper DI value and the lower DI value is moved directly,
11601 hence the cost is the sum of three moves. */
11602 if (! TARGET_SIMD)
11603 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11604
11605 return regmove_cost->FP2FP;
11606 }
11607
11608 if (from == GENERAL_REGS && to == GENERAL_REGS)
11609 return regmove_cost->GP2GP;
11610 else if (from == GENERAL_REGS)
11611 return regmove_cost->GP2FP;
11612 else if (to == GENERAL_REGS)
11613 return regmove_cost->FP2GP;
11614
11615 return regmove_cost->FP2FP;
11616 }
11617
11618 static int
11619 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11620 reg_class_t rclass ATTRIBUTE_UNUSED,
11621 bool in ATTRIBUTE_UNUSED)
11622 {
11623 return aarch64_tune_params.memmov_cost;
11624 }
11625
11626 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11627 to optimize 1.0/sqrt. */
11628
11629 static bool
11630 use_rsqrt_p (machine_mode mode)
11631 {
11632 return (!flag_trapping_math
11633 && flag_unsafe_math_optimizations
11634 && ((aarch64_tune_params.approx_modes->recip_sqrt
11635 & AARCH64_APPROX_MODE (mode))
11636 || flag_mrecip_low_precision_sqrt));
11637 }
11638
11639 /* Function to decide when to use the approximate reciprocal square root
11640 builtin. */
11641
11642 static tree
11643 aarch64_builtin_reciprocal (tree fndecl)
11644 {
11645 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11646
11647 if (!use_rsqrt_p (mode))
11648 return NULL_TREE;
11649 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11650 }
11651
11652 /* Emit instruction sequence to compute either the approximate square root
11653 or its approximate reciprocal, depending on the flag RECP, and return
11654 whether the sequence was emitted or not. */
11655
11656 bool
11657 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11658 {
11659 machine_mode mode = GET_MODE (dst);
11660
11661 if (GET_MODE_INNER (mode) == HFmode)
11662 {
11663 gcc_assert (!recp);
11664 return false;
11665 }
11666
11667 if (!recp)
11668 {
11669 if (!(flag_mlow_precision_sqrt
11670 || (aarch64_tune_params.approx_modes->sqrt
11671 & AARCH64_APPROX_MODE (mode))))
11672 return false;
11673
11674 if (flag_finite_math_only
11675 || flag_trapping_math
11676 || !flag_unsafe_math_optimizations
11677 || optimize_function_for_size_p (cfun))
11678 return false;
11679 }
11680 else
11681 /* Caller assumes we cannot fail. */
11682 gcc_assert (use_rsqrt_p (mode));
11683
11684 machine_mode mmsk = mode_for_int_vector (mode).require ();
11685 rtx xmsk = gen_reg_rtx (mmsk);
11686 if (!recp)
11687 /* When calculating the approximate square root, compare the
11688 argument with 0.0 and create a mask. */
11689 emit_insn (gen_rtx_SET (xmsk,
11690 gen_rtx_NEG (mmsk,
11691 gen_rtx_EQ (mmsk, src,
11692 CONST0_RTX (mode)))));
11693
11694 /* Estimate the approximate reciprocal square root. */
11695 rtx xdst = gen_reg_rtx (mode);
11696 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11697
11698 /* Iterate over the series twice for SF and thrice for DF. */
11699 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11700
11701 /* Optionally iterate over the series once less for faster performance
11702 while sacrificing some accuracy. */
11703 if ((recp && flag_mrecip_low_precision_sqrt)
11704 || (!recp && flag_mlow_precision_sqrt))
11705 iterations--;
11706
11707 /* Iterate over the series to calculate the approximate reciprocal square
11708 root. */
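  /* Roughly, each pass of the loop below implements one Newton-Raphson step
     for 1/sqrt (src):

	x2   = xdst * xdst
	x1   = FRSQRTS (src, x2)	-- i.e. (3 - src * xdst^2) / 2
	xdst = xdst * x1

     so xdst converges towards 1/sqrt (src).  For the non-reciprocal case the
     result is later multiplied by src, using sqrt (src) == src / sqrt (src).  */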
11709 rtx x1 = gen_reg_rtx (mode);
11710 while (iterations--)
11711 {
11712 rtx x2 = gen_reg_rtx (mode);
11713 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11714
11715 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11716
11717 if (iterations > 0)
11718 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11719 }
11720
11721 if (!recp)
11722 {
11723 /* Qualify the approximate reciprocal square root when the argument is
11724 0.0 by squashing the intermediary result to 0.0. */
11725 rtx xtmp = gen_reg_rtx (mmsk);
11726 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11727 gen_rtx_SUBREG (mmsk, xdst, 0)));
11728 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11729
11730 /* Calculate the approximate square root. */
11731 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11732 }
11733
11734 /* Finalize the approximation. */
11735 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11736
11737 return true;
11738 }
11739
11740 /* Emit the instruction sequence to compute the approximation for the division
11741 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11742
11743 bool
11744 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11745 {
11746 machine_mode mode = GET_MODE (quo);
11747
11748 if (GET_MODE_INNER (mode) == HFmode)
11749 return false;
11750
11751 bool use_approx_division_p = (flag_mlow_precision_div
11752 || (aarch64_tune_params.approx_modes->division
11753 & AARCH64_APPROX_MODE (mode)));
11754
11755 if (!flag_finite_math_only
11756 || flag_trapping_math
11757 || !flag_unsafe_math_optimizations
11758 || optimize_function_for_size_p (cfun)
11759 || !use_approx_division_p)
11760 return false;
11761
11762 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11763 return false;
11764
11765 /* Estimate the approximate reciprocal. */
11766 rtx xrcp = gen_reg_rtx (mode);
11767 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11768
11769 /* Iterate over the series twice for SF and thrice for DF. */
11770 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11771
11772 /* Optionally iterate over the series once less for faster performance,
11773 while sacrificing some accuracy. */
11774 if (flag_mlow_precision_div)
11775 iterations--;
11776
11777 /* Iterate over the series to calculate the approximate reciprocal. */
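  /* Roughly, each pass of the loop below implements one Newton-Raphson step
     for 1/den:

	xtmp = FRECPS (xrcp, den)	-- i.e. 2 - xrcp * den
	xrcp = xrcp * xtmp

     so xrcp converges towards 1/den, which is finally multiplied by NUM to
     approximate the quotient.  */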
11778 rtx xtmp = gen_reg_rtx (mode);
11779 while (iterations--)
11780 {
11781 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11782
11783 if (iterations > 0)
11784 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11785 }
11786
11787 if (num != CONST1_RTX (mode))
11788 {
11789 /* As the approximate reciprocal of DEN is already calculated, only
11790 calculate the approximate division when NUM is not 1.0. */
11791 rtx xnum = force_reg (mode, num);
11792 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11793 }
11794
11795 /* Finalize the approximation. */
11796 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11797 return true;
11798 }
11799
11800 /* Return the number of instructions that can be issued per cycle. */
11801 static int
11802 aarch64_sched_issue_rate (void)
11803 {
11804 return aarch64_tune_params.issue_rate;
11805 }
11806
11807 static int
11808 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11809 {
11810 int issue_rate = aarch64_sched_issue_rate ();
11811
11812 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11813 }
11814
11815
11816 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11817 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11818 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11819
11820 static int
11821 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11822 int ready_index)
11823 {
11824 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11825 }
11826
11827
11828 /* Vectorizer cost model target hooks. */
11829
11830 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11831 static int
11832 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11833 tree vectype,
11834 int misalign ATTRIBUTE_UNUSED)
11835 {
11836 unsigned elements;
11837 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11838 bool fp = false;
11839
11840 if (vectype != NULL)
11841 fp = FLOAT_TYPE_P (vectype);
11842
11843 switch (type_of_cost)
11844 {
11845 case scalar_stmt:
11846 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11847
11848 case scalar_load:
11849 return costs->scalar_load_cost;
11850
11851 case scalar_store:
11852 return costs->scalar_store_cost;
11853
11854 case vector_stmt:
11855 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11856
11857 case vector_load:
11858 return costs->vec_align_load_cost;
11859
11860 case vector_store:
11861 return costs->vec_store_cost;
11862
11863 case vec_to_scalar:
11864 return costs->vec_to_scalar_cost;
11865
11866 case scalar_to_vec:
11867 return costs->scalar_to_vec_cost;
11868
11869 case unaligned_load:
11870 case vector_gather_load:
11871 return costs->vec_unalign_load_cost;
11872
11873 case unaligned_store:
11874 case vector_scatter_store:
11875 return costs->vec_unalign_store_cost;
11876
11877 case cond_branch_taken:
11878 return costs->cond_taken_branch_cost;
11879
11880 case cond_branch_not_taken:
11881 return costs->cond_not_taken_branch_cost;
11882
11883 case vec_perm:
11884 return costs->vec_permute_cost;
11885
11886 case vec_promote_demote:
11887 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11888
11889 case vec_construct:
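      /* Rough heuristic: about one operation per pair of elements plus one,
	 e.g. a 4-element vector construction is costed as 4/2 + 1 = 3.  */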
11890 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11891 return elements / 2 + 1;
11892
11893 default:
11894 gcc_unreachable ();
11895 }
11896 }
11897
11898 /* Implement targetm.vectorize.add_stmt_cost. */
11899 static unsigned
11900 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11901 struct _stmt_vec_info *stmt_info, int misalign,
11902 enum vect_cost_model_location where)
11903 {
11904 unsigned *cost = (unsigned *) data;
11905 unsigned retval = 0;
11906
11907 if (flag_vect_cost_model)
11908 {
11909 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11910 int stmt_cost =
11911 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11912
11913 /* Statements in an inner loop relative to the loop being
11914 vectorized are weighted more heavily. The value here is
11915 arbitrary and could potentially be improved with analysis. */
11916 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11917 count *= 50; /* FIXME */
11918
11919 retval = (unsigned) (count * stmt_cost);
11920 cost[where] += retval;
11921 }
11922
11923 return retval;
11924 }
11925
11926 static void initialize_aarch64_code_model (struct gcc_options *);
11927
11928 /* Parse the TO_PARSE string and put the architecture struct that it
11929 selects into RES and the architectural features into ISA_FLAGS.
11930 Return an aarch64_parse_opt_result describing the parse result.
11931 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11932 When the TO_PARSE string contains an invalid extension,
11933 a copy of the string is created and stored to INVALID_EXTENSION. */
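/* For illustration, with a hypothetical TO_PARSE such as "armv8.2-a+fp16"
   the string is split at the first '+': "armv8.2-a" is looked up in
   all_architectures and "+fp16" is handed to aarch64_parse_extension.  */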
11934
11935 static enum aarch64_parse_opt_result
11936 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11937 uint64_t *isa_flags, std::string *invalid_extension)
11938 {
11939 const char *ext;
11940 const struct processor *arch;
11941 size_t len;
11942
11943 ext = strchr (to_parse, '+');
11944
11945 if (ext != NULL)
11946 len = ext - to_parse;
11947 else
11948 len = strlen (to_parse);
11949
11950 if (len == 0)
11951 return AARCH64_PARSE_MISSING_ARG;
11952
11953
11954 /* Loop through the list of supported ARCHes to find a match. */
11955 for (arch = all_architectures; arch->name != NULL; arch++)
11956 {
11957 if (strlen (arch->name) == len
11958 && strncmp (arch->name, to_parse, len) == 0)
11959 {
11960 uint64_t isa_temp = arch->flags;
11961
11962 if (ext != NULL)
11963 {
11964 /* TO_PARSE string contains at least one extension. */
11965 enum aarch64_parse_opt_result ext_res
11966 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11967
11968 if (ext_res != AARCH64_PARSE_OK)
11969 return ext_res;
11970 }
11971 /* Extension parsing was successful. Confirm the result
11972 arch and ISA flags. */
11973 *res = arch;
11974 *isa_flags = isa_temp;
11975 return AARCH64_PARSE_OK;
11976 }
11977 }
11978
11979 /* ARCH name not found in list. */
11980 return AARCH64_PARSE_INVALID_ARG;
11981 }
11982
11983 /* Parse the TO_PARSE string and put the CPU struct that it selects into
11984 RES and the ISA flags into ISA_FLAGS. Return an aarch64_parse_opt_result
11985 describing the parse result. If there is an error parsing, RES and
11986 ISA_FLAGS are left unchanged.
11987 When the TO_PARSE string contains an invalid extension,
11988 a copy of the string is created and stored to INVALID_EXTENSION. */
11989
11990 static enum aarch64_parse_opt_result
11991 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11992 uint64_t *isa_flags, std::string *invalid_extension)
11993 {
11994 const char *ext;
11995 const struct processor *cpu;
11996 size_t len;
11997
11998 ext = strchr (to_parse, '+');
11999
12000 if (ext != NULL)
12001 len = ext - to_parse;
12002 else
12003 len = strlen (to_parse);
12004
12005 if (len == 0)
12006 return AARCH64_PARSE_MISSING_ARG;
12007
12008
12009 /* Loop through the list of supported CPUs to find a match. */
12010 for (cpu = all_cores; cpu->name != NULL; cpu++)
12011 {
12012 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
12013 {
12014 uint64_t isa_temp = cpu->flags;
12015
12016
12017 if (ext != NULL)
12018 {
12019 /* TO_PARSE string contains at least one extension. */
12020 enum aarch64_parse_opt_result ext_res
12021 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12022
12023 if (ext_res != AARCH64_PARSE_OK)
12024 return ext_res;
12025 }
12026 /* Extension parsing was successful. Confirm the result
12027 cpu and ISA flags. */
12028 *res = cpu;
12029 *isa_flags = isa_temp;
12030 return AARCH64_PARSE_OK;
12031 }
12032 }
12033
12034 /* CPU name not found in list. */
12035 return AARCH64_PARSE_INVALID_ARG;
12036 }
12037
12038 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12039 Return an aarch64_parse_opt_result describing the parse result.
12040 If the parsing fails the RES does not change. */
12041
12042 static enum aarch64_parse_opt_result
12043 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12044 {
12045 const struct processor *cpu;
12046
12047 /* Loop through the list of supported CPUs to find a match. */
12048 for (cpu = all_cores; cpu->name != NULL; cpu++)
12049 {
12050 if (strcmp (cpu->name, to_parse) == 0)
12051 {
12052 *res = cpu;
12053 return AARCH64_PARSE_OK;
12054 }
12055 }
12056
12057 /* CPU name not found in list. */
12058 return AARCH64_PARSE_INVALID_ARG;
12059 }
12060
12061 /* Parse TOKEN, which has length LENGTH, to see if it is an option
12062 described in FLAG. If it is, return the index bit for that fusion type.
12063 If not, error (printing OPTION_NAME) and return zero. */
12064
12065 static unsigned int
12066 aarch64_parse_one_option_token (const char *token,
12067 size_t length,
12068 const struct aarch64_flag_desc *flag,
12069 const char *option_name)
12070 {
12071 for (; flag->name != NULL; flag++)
12072 {
12073 if (length == strlen (flag->name)
12074 && !strncmp (flag->name, token, length))
12075 return flag->flag;
12076 }
12077
12078 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12079 return 0;
12080 }
12081
12082 /* Parse OPTION which is a comma-separated list of flags to enable.
12083 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12084 default state we inherit from the CPU tuning structures. OPTION_NAME
12085 gives the top-level option we are parsing in the -moverride string,
12086 for use in error messages. */
12087
12088 static unsigned int
12089 aarch64_parse_boolean_options (const char *option,
12090 const struct aarch64_flag_desc *flags,
12091 unsigned int initial_state,
12092 const char *option_name)
12093 {
12094 const char separator = '.';
12095 const char* specs = option;
12096 const char* ntoken = option;
12097 unsigned int found_flags = initial_state;
12098
12099 while ((ntoken = strchr (specs, separator)))
12100 {
12101 size_t token_length = ntoken - specs;
12102 unsigned token_ops = aarch64_parse_one_option_token (specs,
12103 token_length,
12104 flags,
12105 option_name);
12106 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12107 in the token stream, reset the supported operations. So:
12108
12109 adrp+add.cmp+branch.none.adrp+add
12110
12111 would have the result of turning on only adrp+add fusion. */
12112 if (!token_ops)
12113 found_flags = 0;
12114
12115 found_flags |= token_ops;
12116 specs = ++ntoken;
12117 }
12118
12119 /* If the string ended with a trailing separator, report an error. */
12120 if (!(*specs))
12121 {
12122 error ("%s string ill-formed", option_name);
12123 return 0;
12124 }
12125
12126 /* We still have one more token to parse. */
12127 size_t token_length = strlen (specs);
12128 unsigned token_ops = aarch64_parse_one_option_token (specs,
12129 token_length,
12130 flags,
12131 option_name);
12132 if (!token_ops)
12133 found_flags = 0;
12134
12135 found_flags |= token_ops;
12136 return found_flags;
12137 }
12138
12139 /* Support for overriding instruction fusion. */
12140
12141 static void
12142 aarch64_parse_fuse_string (const char *fuse_string,
12143 struct tune_params *tune)
12144 {
12145 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12146 aarch64_fusible_pairs,
12147 tune->fusible_ops,
12148 "fuse=");
12149 }
12150
12151 /* Support for overriding other tuning flags. */
12152
12153 static void
12154 aarch64_parse_tune_string (const char *tune_string,
12155 struct tune_params *tune)
12156 {
12157 tune->extra_tuning_flags
12158 = aarch64_parse_boolean_options (tune_string,
12159 aarch64_tuning_flags,
12160 tune->extra_tuning_flags,
12161 "tune=");
12162 }
12163
12164 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12165 Accept the valid SVE vector widths allowed by
12166 aarch64_sve_vector_bits_enum and use it to override sve_width
12167 in TUNE. */
12168
12169 static void
12170 aarch64_parse_sve_width_string (const char *tune_string,
12171 struct tune_params *tune)
12172 {
12173 int width = -1;
12174
12175 int n = sscanf (tune_string, "%d", &width);
12176 if (n == EOF)
12177 {
12178 error ("invalid format for sve_width");
12179 return;
12180 }
12181 switch (width)
12182 {
12183 case SVE_128:
12184 case SVE_256:
12185 case SVE_512:
12186 case SVE_1024:
12187 case SVE_2048:
12188 break;
12189 default:
12190 error ("invalid sve_width value: %d", width);
12191 }
12192 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12193 }
12194
12195 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12196 we understand. If it is, extract the option string and hand it off to
12197 the appropriate function. */
12198
12199 void
12200 aarch64_parse_one_override_token (const char* token,
12201 size_t length,
12202 struct tune_params *tune)
12203 {
12204 const struct aarch64_tuning_override_function *fn
12205 = aarch64_tuning_override_functions;
12206
12207 const char *option_part = strchr (token, '=');
12208 if (!option_part)
12209 {
12210 error ("tuning string missing in option (%s)", token);
12211 return;
12212 }
12213
12214 /* Get the length of the option name. */
12215 length = option_part - token;
12216 /* Skip the '=' to get to the option string. */
12217 option_part++;
12218
12219 for (; fn->name != NULL; fn++)
12220 {
12221 if (!strncmp (fn->name, token, length))
12222 {
12223 fn->parse_override (option_part, tune);
12224 return;
12225 }
12226 }
12227
12228 error ("unknown tuning option (%s)", token);
12229 return;
12230 }
12231
12232 /* Validate and clamp the TLS size based on the code model in OPTS. */
12233
12234 static void
12235 initialize_aarch64_tls_size (struct gcc_options *opts)
12236 {
12237 if (aarch64_tls_size == 0)
12238 aarch64_tls_size = 24;
12239
12240 switch (opts->x_aarch64_cmodel_var)
12241 {
12242 case AARCH64_CMODEL_TINY:
12243 /* Both the default and maximum TLS size allowed under tiny are 1M, which
12244 needs two instructions to address, so we clamp the size to 24 bits. */
12245 if (aarch64_tls_size > 24)
12246 aarch64_tls_size = 24;
12247 break;
12248 case AARCH64_CMODEL_SMALL:
12249 /* The maximum TLS size allowed under small is 4G. */
12250 if (aarch64_tls_size > 32)
12251 aarch64_tls_size = 32;
12252 break;
12253 case AARCH64_CMODEL_LARGE:
12254 /* The maximum TLS size allowed under large is 16E.
12255 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12256 if (aarch64_tls_size > 48)
12257 aarch64_tls_size = 48;
12258 break;
12259 default:
12260 gcc_unreachable ();
12261 }
12262
12263 return;
12264 }
12265
12266 /* Parse STRING looking for options in the format:
12267 string :: option:string
12268 option :: name=substring
12269 name :: {a-z}
12270 substring :: defined by option. */
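/* For illustration, a -moverride string such as
   "fuse=adrp+add.cmp+branch:sve_width=256" (values illustrative) is split at
   ':' into two options; "fuse=..." is then parsed as '.'-separated boolean
   flags and "sve_width=256" by aarch64_parse_sve_width_string.  */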
12271
12272 static void
12273 aarch64_parse_override_string (const char* input_string,
12274 struct tune_params* tune)
12275 {
12276 const char separator = ':';
12277 size_t string_length = strlen (input_string) + 1;
12278 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12279 char *string = string_root;
12280 strncpy (string, input_string, string_length);
12281 string[string_length - 1] = '\0';
12282
12283 char* ntoken = string;
12284
12285 while ((ntoken = strchr (string, separator)))
12286 {
12287 size_t token_length = ntoken - string;
12288 /* NUL-terminate this substring so it can be treated as a string. */
12289 *ntoken = '\0';
12290 aarch64_parse_one_override_token (string, token_length, tune);
12291 string = ++ntoken;
12292 }
12293
12294 /* One last option to parse. */
12295 aarch64_parse_one_override_token (string, strlen (string), tune);
12296 free (string_root);
12297 }
12298
12299
12300 static void
12301 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12302 {
12303 if (accepted_branch_protection_string)
12304 {
12305 opts->x_aarch64_branch_protection_string
12306 = xstrdup (accepted_branch_protection_string);
12307 }
12308
12309 /* PR 70044: We have to be careful about being called multiple times for the
12310 same function. This means all changes should be repeatable. */
12311
12312 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12313 Disable the frame pointer flag so the mid-end will not use a frame
12314 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12315 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12316 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12317 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12318 if (opts->x_flag_omit_frame_pointer == 0)
12319 opts->x_flag_omit_frame_pointer = 2;
12320
12321 /* If not optimizing for size, set the default
12322 alignment to what the target wants. */
12323 if (!opts->x_optimize_size)
12324 {
12325 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12326 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12327 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12328 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12329 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12330 opts->x_str_align_functions = aarch64_tune_params.function_align;
12331 }
12332
12333 /* We default to no pc-relative literal loads. */
12334
12335 aarch64_pcrelative_literal_loads = false;
12336
12337 /* If -mpc-relative-literal-loads is set on the command line, this
12338 implies that the user asked for PC relative literal loads. */
12339 if (opts->x_pcrelative_literal_loads == 1)
12340 aarch64_pcrelative_literal_loads = true;
12341
12342 /* In the tiny memory model it makes no sense to disallow PC relative
12343 literal pool loads. */
12344 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12345 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12346 aarch64_pcrelative_literal_loads = true;
12347
12348 /* When enabling the lower precision Newton series for the square root, also
12349 enable it for the reciprocal square root, since the latter is an
12350 intermediary step for the former. */
12351 if (flag_mlow_precision_sqrt)
12352 flag_mrecip_low_precision_sqrt = true;
12353 }
12354
12355 /* 'Unpack' the internal tuning structs and update the options
12356 in OPTS. The caller must have set up selected_tune and selected_arch
12357 as all the other target-specific codegen decisions are
12358 derived from them. */
12359
12360 void
12361 aarch64_override_options_internal (struct gcc_options *opts)
12362 {
12363 aarch64_tune_flags = selected_tune->flags;
12364 aarch64_tune = selected_tune->sched_core;
12365 /* Make a copy of the tuning parameters attached to the core, which
12366 we may later overwrite. */
12367 aarch64_tune_params = *(selected_tune->tune);
12368 aarch64_architecture_version = selected_arch->architecture_version;
12369
12370 if (opts->x_aarch64_override_tune_string)
12371 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12372 &aarch64_tune_params);
12373
12374 /* This target defaults to strict volatile bitfields. */
12375 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12376 opts->x_flag_strict_volatile_bitfields = 1;
12377
12378 if (aarch64_stack_protector_guard == SSP_GLOBAL
12379 && opts->x_aarch64_stack_protector_guard_offset_str)
12380 {
12381 error ("incompatible options %<-mstack-protector-guard=global%> and "
12382 "%<-mstack-protector-guard-offset=%s%>",
12383 aarch64_stack_protector_guard_offset_str);
12384 }
12385
12386 if (aarch64_stack_protector_guard == SSP_SYSREG
12387 && !(opts->x_aarch64_stack_protector_guard_offset_str
12388 && opts->x_aarch64_stack_protector_guard_reg_str))
12389 {
12390 error ("both %<-mstack-protector-guard-offset%> and "
12391 "%<-mstack-protector-guard-reg%> must be used "
12392 "with %<-mstack-protector-guard=sysreg%>");
12393 }
12394
12395 if (opts->x_aarch64_stack_protector_guard_reg_str)
12396 {
12397 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12398 error ("specify a system register with a small string length");
12399 }
12400
12401 if (opts->x_aarch64_stack_protector_guard_offset_str)
12402 {
12403 char *end;
12404 const char *str = aarch64_stack_protector_guard_offset_str;
12405 errno = 0;
12406 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12407 if (!*str || *end || errno)
12408 error ("%qs is not a valid offset in %qs", str,
12409 "-mstack-protector-guard-offset=");
12410 aarch64_stack_protector_guard_offset = offs;
12411 }
12412
12413 initialize_aarch64_code_model (opts);
12414 initialize_aarch64_tls_size (opts);
12415
12416 int queue_depth = 0;
12417 switch (aarch64_tune_params.autoprefetcher_model)
12418 {
12419 case tune_params::AUTOPREFETCHER_OFF:
12420 queue_depth = -1;
12421 break;
12422 case tune_params::AUTOPREFETCHER_WEAK:
12423 queue_depth = 0;
12424 break;
12425 case tune_params::AUTOPREFETCHER_STRONG:
12426 queue_depth = max_insn_queue_index + 1;
12427 break;
12428 default:
12429 gcc_unreachable ();
12430 }
12431
12432 /* We don't mind passing in global_options_set here as we don't use
12433 the *options_set structs anyway. */
12434 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12435 queue_depth,
12436 opts->x_param_values,
12437 global_options_set.x_param_values);
12438
12439 /* Set up parameters to be used in the prefetching algorithm. Do not
12440 override the defaults unless we are tuning for a core we have
12441 researched values for. */
12442 if (aarch64_tune_params.prefetch->num_slots > 0)
12443 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12444 aarch64_tune_params.prefetch->num_slots,
12445 opts->x_param_values,
12446 global_options_set.x_param_values);
12447 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12448 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12449 aarch64_tune_params.prefetch->l1_cache_size,
12450 opts->x_param_values,
12451 global_options_set.x_param_values);
12452 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12453 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12454 aarch64_tune_params.prefetch->l1_cache_line_size,
12455 opts->x_param_values,
12456 global_options_set.x_param_values);
12457 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12458 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12459 aarch64_tune_params.prefetch->l2_cache_size,
12460 opts->x_param_values,
12461 global_options_set.x_param_values);
12462 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12463 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12464 0,
12465 opts->x_param_values,
12466 global_options_set.x_param_values);
12467 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12468 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12469 aarch64_tune_params.prefetch->minimum_stride,
12470 opts->x_param_values,
12471 global_options_set.x_param_values);
12472
12473 /* Use the alternative scheduling-pressure algorithm by default. */
12474 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12475 opts->x_param_values,
12476 global_options_set.x_param_values);
12477
12478 /* If the user hasn't changed it via configure then set the default to 64 KB
12479 for the backend. */
12480 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12481 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12482 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12483 opts->x_param_values,
12484 global_options_set.x_param_values);
12485
12486 /* Validate the guard size. */
12487 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12488
12489 /* Enforce that the probing interval is the same size as the guard size
12490 so the mid-end does the right thing. */
12491 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12492 guard_size,
12493 opts->x_param_values,
12494 global_options_set.x_param_values);
12495
12496 /* The maybe_set calls won't update the value if the user has explicitly set
12497 one. Which means we need to validate that probing interval and guard size
12498 are equal. */
12499 int probe_interval
12500 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12501 if (guard_size != probe_interval)
12502 error ("stack clash guard size %<%d%> must be equal to probing interval "
12503 "%<%d%>", guard_size, probe_interval);
12504
12505 /* Enable software prefetching at the specified optimization level for
12506 CPUs that have prefetch tuning data, unless the user has requested
12507 otherwise or we are optimizing for size. */
12508 if (opts->x_flag_prefetch_loop_arrays < 0
12509 && !opts->x_optimize_size
12510 && aarch64_tune_params.prefetch->default_opt_level >= 0
12511 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12512 opts->x_flag_prefetch_loop_arrays = 1;
12513
12514 if (opts->x_aarch64_arch_string == NULL)
12515 opts->x_aarch64_arch_string = selected_arch->name;
12516 if (opts->x_aarch64_cpu_string == NULL)
12517 opts->x_aarch64_cpu_string = selected_cpu->name;
12518 if (opts->x_aarch64_tune_string == NULL)
12519 opts->x_aarch64_tune_string = selected_tune->name;
12520
12521 aarch64_override_options_after_change_1 (opts);
12522 }
12523
12524 /* Print a hint with a suggestion for a core or architecture name that
12525 most closely resembles what the user passed in STR. ARCH is true if
12526 the user is asking for an architecture name. ARCH is false if the user
12527 is asking for a core name. */
12528
12529 static void
12530 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12531 {
12532 auto_vec<const char *> candidates;
12533 const struct processor *entry = arch ? all_architectures : all_cores;
12534 for (; entry->name != NULL; entry++)
12535 candidates.safe_push (entry->name);
12536
12537 #ifdef HAVE_LOCAL_CPU_DETECT
12538 /* Add also "native" as possible value. */
12539 if (arch)
12540 candidates.safe_push ("native");
12541 #endif
12542
12543 char *s;
12544 const char *hint = candidates_list_and_hint (str, s, candidates);
12545 if (hint)
12546 inform (input_location, "valid arguments are: %s;"
12547 " did you mean %qs?", s, hint);
12548 else
12549 inform (input_location, "valid arguments are: %s", s);
12550
12551 XDELETEVEC (s);
12552 }
12553
12554 /* Print a hint with a suggestion for a core name that most closely resembles
12555 what the user passed in STR. */
12556
12557 inline static void
12558 aarch64_print_hint_for_core (const char *str)
12559 {
12560 aarch64_print_hint_for_core_or_arch (str, false);
12561 }
12562
12563 /* Print a hint with a suggestion for an architecture name that most closely
12564 resembles what the user passed in STR. */
12565
12566 inline static void
12567 aarch64_print_hint_for_arch (const char *str)
12568 {
12569 aarch64_print_hint_for_core_or_arch (str, true);
12570 }
12571
12572
12573 /* Print a hint with a suggestion for an extension name
12574 that most closely resembles what the user passed in STR. */
12575
12576 void
12577 aarch64_print_hint_for_extensions (const std::string &str)
12578 {
12579 auto_vec<const char *> candidates;
12580 aarch64_get_all_extension_candidates (&candidates);
12581 char *s;
12582 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12583 if (hint)
12584 inform (input_location, "valid arguments are: %s;"
12585 " did you mean %qs?", s, hint);
12586 else
12587 inform (input_location, "valid arguments are: %s", s);
12588
12589 XDELETEVEC (s);
12590 }
12591
12592 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12593 specified in STR and throw errors if appropriate. Put the results, if
12594 they are valid, in RES and ISA_FLAGS. Return whether the option is
12595 valid. */
12596
12597 static bool
12598 aarch64_validate_mcpu (const char *str, const struct processor **res,
12599 uint64_t *isa_flags)
12600 {
12601 std::string invalid_extension;
12602 enum aarch64_parse_opt_result parse_res
12603 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12604
12605 if (parse_res == AARCH64_PARSE_OK)
12606 return true;
12607
12608 switch (parse_res)
12609 {
12610 case AARCH64_PARSE_MISSING_ARG:
12611 error ("missing cpu name in %<-mcpu=%s%>", str);
12612 break;
12613 case AARCH64_PARSE_INVALID_ARG:
12614 error ("unknown value %qs for %<-mcpu%>", str);
12615 aarch64_print_hint_for_core (str);
12616 break;
12617 case AARCH64_PARSE_INVALID_FEATURE:
12618 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12619 invalid_extension.c_str (), str);
12620 aarch64_print_hint_for_extensions (invalid_extension);
12621 break;
12622 default:
12623 gcc_unreachable ();
12624 }
12625
12626 return false;
12627 }
12628
12629 /* Parses CONST_STR for branch protection features specified in
12630 aarch64_branch_protect_types, and set any global variables required. Returns
12631 the parsing result and assigns LAST_STR to the last processed token from
12632 CONST_STR so that it can be used for error reporting. */
12633
12634 static enum aarch64_parse_opt_result
12635 aarch64_parse_branch_protection (const char *const_str, char **last_str)
12637 {
12638 char *str_root = xstrdup (const_str);
12639 char* token_save = NULL;
12640 char *str = strtok_r (str_root, "+", &token_save);
12641 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12642 if (!str)
12643 res = AARCH64_PARSE_MISSING_ARG;
12644 else
12645 {
12646 char *next_str = strtok_r (NULL, "+", &token_save);
12647 /* Reset the branch protection features to their defaults. */
12648 aarch64_handle_no_branch_protection (NULL, NULL);
12649
12650 while (str && res == AARCH64_PARSE_OK)
12651 {
12652 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12653 bool found = false;
12654 /* Search for this type. */
12655 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12656 {
12657 if (strcmp (str, type->name) == 0)
12658 {
12659 found = true;
12660 res = type->handler (str, next_str);
12661 str = next_str;
12662 next_str = strtok_r (NULL, "+", &token_save);
12663 }
12664 else
12665 type++;
12666 }
12667 if (found && res == AARCH64_PARSE_OK)
12668 {
12669 bool found_subtype = true;
12670 /* Loop through each token until we find one that isn't a
12671 subtype. */
12672 while (found_subtype)
12673 {
12674 found_subtype = false;
12675 const aarch64_branch_protect_type *subtype = type->subtypes;
12676 /* Search for the subtype. */
12677 while (str && subtype && subtype->name && !found_subtype
12678 && res == AARCH64_PARSE_OK)
12679 {
12680 if (strcmp (str, subtype->name) == 0)
12681 {
12682 found_subtype = true;
12683 res = subtype->handler (str, next_str);
12684 str = next_str;
12685 next_str = strtok_r (NULL, "+", &token_save);
12686 }
12687 else
12688 subtype++;
12689 }
12690 }
12691 }
12692 else if (!found)
12693 res = AARCH64_PARSE_INVALID_ARG;
12694 }
12695 }
12696 /* Copy the last processed token into the argument to pass it back.
12697 Used by option and attribute validation to print the offending token. */
12698 if (last_str)
12699 {
12700 if (str) strcpy (*last_str, str);
12701 else *last_str = NULL;
12702 }
12703 if (res == AARCH64_PARSE_OK)
12704 {
12705 /* If needed, alloc the accepted string then copy in const_str.
12706 Used by override_option_after_change_1. */
12707 if (!accepted_branch_protection_string)
12708 accepted_branch_protection_string = (char *) xmalloc (
12709 BRANCH_PROTECT_STR_MAX
12710 + 1);
12711 strncpy (accepted_branch_protection_string, const_str,
12712 BRANCH_PROTECT_STR_MAX + 1);
12713 /* Forcibly null-terminate. */
12714 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12715 }
12716 return res;
12717 }
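/* Illustrative walk-through of the parser above, using the usual entries of
   aarch64_branch_protect_types:

     -mbranch-protection=pac-ret+leaf+bti

   splits on '+' into "pac-ret", "leaf" and "bti"; "pac-ret" matches a
   top-level type, "leaf" matches one of its subtypes, and "bti" is then
   matched as another top-level type.  An unknown token such as "foo" yields
   AARCH64_PARSE_INVALID_ARG with *LAST_STR set to "foo" for diagnostics.  */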
12718
12719 static bool
12720 aarch64_validate_mbranch_protection (const char *const_str)
12721 {
12722 char *str = (char *) xmalloc (strlen (const_str) + 1);
12723 enum aarch64_parse_opt_result res =
12724 aarch64_parse_branch_protection (const_str, &str);
12725 if (res == AARCH64_PARSE_INVALID_ARG)
12726 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12727 else if (res == AARCH64_PARSE_MISSING_ARG)
12728 error ("missing argument for %<-mbranch-protection=%>");
12729 free (str);
12730 return res == AARCH64_PARSE_OK;
12731 }
12732
12733 /* Validate a command-line -march option. Parse the arch and extensions
12734 (if any) specified in STR and throw errors if appropriate. Put the
12735 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12736 option is valid. */
12737
12738 static bool
12739 aarch64_validate_march (const char *str, const struct processor **res,
12740 uint64_t *isa_flags)
12741 {
12742 std::string invalid_extension;
12743 enum aarch64_parse_opt_result parse_res
12744 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12745
12746 if (parse_res == AARCH64_PARSE_OK)
12747 return true;
12748
12749 switch (parse_res)
12750 {
12751 case AARCH64_PARSE_MISSING_ARG:
12752 error ("missing arch name in %<-march=%s%>", str);
12753 break;
12754 case AARCH64_PARSE_INVALID_ARG:
12755 error ("unknown value %qs for %<-march%>", str);
12756 aarch64_print_hint_for_arch (str);
12757 break;
12758 case AARCH64_PARSE_INVALID_FEATURE:
12759 error ("invalid feature modifier %qs in %<-march=%s%>",
12760 invalid_extension.c_str (), str);
12761 aarch64_print_hint_for_extensions (invalid_extension);
12762 break;
12763 default:
12764 gcc_unreachable ();
12765 }
12766
12767 return false;
12768 }
12769
12770 /* Validate a command-line -mtune option. Parse the cpu
12771 specified in STR and throw errors if appropriate. Put the
12772 result, if it is valid, in RES. Return whether the option is
12773 valid. */
12774
12775 static bool
12776 aarch64_validate_mtune (const char *str, const struct processor **res)
12777 {
12778 enum aarch64_parse_opt_result parse_res
12779 = aarch64_parse_tune (str, res);
12780
12781 if (parse_res == AARCH64_PARSE_OK)
12782 return true;
12783
12784 switch (parse_res)
12785 {
12786 case AARCH64_PARSE_MISSING_ARG:
12787 error ("missing cpu name in %<-mtune=%s%>", str);
12788 break;
12789 case AARCH64_PARSE_INVALID_ARG:
12790 error ("unknown value %qs for %<-mtune%>", str);
12791 aarch64_print_hint_for_core (str);
12792 break;
12793 default:
12794 gcc_unreachable ();
12795 }
12796 return false;
12797 }
12798
12799 /* Return the CPU corresponding to the enum CPU.
12800 If it doesn't specify a cpu, return the default. */
12801
12802 static const struct processor *
12803 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12804 {
12805 if (cpu != aarch64_none)
12806 return &all_cores[cpu];
12807
12808 /* The & 0x3f is to extract the bottom 6 bits that encode the
12809 default cpu as selected by the --with-cpu GCC configure option
12810 in config.gcc.
12811 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12812 flags mechanism should be reworked to make it more sane. */
12813 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12814 }
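/* Sketch of the TARGET_CPU_DEFAULT encoding assumed here and in
   aarch64_override_options: conceptually

     TARGET_CPU_DEFAULT == (default_isa_flags << 6) | default_cpu_ident

   so "& 0x3f" recovers the configure-time cpu ident and ">> 6" recovers its
   default ISA flags.  */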
12815
12816 /* Return the architecture corresponding to the enum ARCH.
12817 If it doesn't specify a valid architecture, return the default. */
12818
12819 static const struct processor *
12820 aarch64_get_arch (enum aarch64_arch arch)
12821 {
12822 if (arch != aarch64_no_arch)
12823 return &all_architectures[arch];
12824
12825 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12826
12827 return &all_architectures[cpu->arch];
12828 }
12829
12830 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12831
12832 static poly_uint16
12833 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12834 {
12835 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12836 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12837 deciding which .md file patterns to use and when deciding whether
12838 something is a legitimate address or constant. */
12839 if (value == SVE_SCALABLE || value == SVE_128)
12840 return poly_uint16 (2, 2);
12841 else
12842 return (int) value / 64;
12843 }
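/* Worked examples of the conversion above (VG is the number of 64-bit
   granules in an SVE vector):

     -msve-vector-bits=scalable -> poly_uint16 (2, 2)  (length-agnostic)
     -msve-vector-bits=128      -> poly_uint16 (2, 2)  (see comment above)
     -msve-vector-bits=256      -> 4   (256 / 64)
     -msve-vector-bits=2048     -> 32  (2048 / 64)  */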
12844
12845 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12846 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12847 tuning structs. In particular it must set selected_tune and
12848 aarch64_isa_flags that define the available ISA features and tuning
12849 decisions. It must also set selected_arch as this will be used to
12850 output the .arch asm tags for each function. */
12851
12852 static void
12853 aarch64_override_options (void)
12854 {
12855 uint64_t cpu_isa = 0;
12856 uint64_t arch_isa = 0;
12857 aarch64_isa_flags = 0;
12858
12859 bool valid_cpu = true;
12860 bool valid_tune = true;
12861 bool valid_arch = true;
12862
12863 selected_cpu = NULL;
12864 selected_arch = NULL;
12865 selected_tune = NULL;
12866
12867 if (aarch64_branch_protection_string)
12868 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12869
12870 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12871 If either of -march or -mtune is given, they override their
12872 respective component of -mcpu. */
12873 if (aarch64_cpu_string)
12874 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12875 &cpu_isa);
12876
12877 if (aarch64_arch_string)
12878 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12879 &arch_isa);
12880
12881 if (aarch64_tune_string)
12882 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12883
12884 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12885 SUBTARGET_OVERRIDE_OPTIONS;
12886 #endif
12887
12888 /* If the user did not specify a processor, choose the default
12889 one for them. This will be the CPU set during configuration using
12890 --with-cpu, otherwise it is "generic". */
12891 if (!selected_cpu)
12892 {
12893 if (selected_arch)
12894 {
12895 selected_cpu = &all_cores[selected_arch->ident];
12896 aarch64_isa_flags = arch_isa;
12897 explicit_arch = selected_arch->arch;
12898 }
12899 else
12900 {
12901 /* Get default configure-time CPU. */
12902 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12903 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12904 }
12905
12906 if (selected_tune)
12907 explicit_tune_core = selected_tune->ident;
12908 }
12909 /* If both -mcpu and -march are specified check that they are architecturally
12910 compatible, warn if they're not and prefer the -march ISA flags. */
12911 else if (selected_arch)
12912 {
12913 if (selected_arch->arch != selected_cpu->arch)
12914 {
12915 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12916 all_architectures[selected_cpu->arch].name,
12917 selected_arch->name);
12918 }
12919 aarch64_isa_flags = arch_isa;
12920 explicit_arch = selected_arch->arch;
12921 explicit_tune_core = selected_tune ? selected_tune->ident
12922 : selected_cpu->ident;
12923 }
12924 else
12925 {
12926 /* -mcpu but no -march. */
12927 aarch64_isa_flags = cpu_isa;
12928 explicit_tune_core = selected_tune ? selected_tune->ident
12929 : selected_cpu->ident;
12930 gcc_assert (selected_cpu);
12931 selected_arch = &all_architectures[selected_cpu->arch];
12932 explicit_arch = selected_arch->arch;
12933 }
12934
12935 /* Set the arch as well, as we will need it when outputting
12936 the .arch directive in assembly. */
12937 if (!selected_arch)
12938 {
12939 gcc_assert (selected_cpu);
12940 selected_arch = &all_architectures[selected_cpu->arch];
12941 }
12942
12943 if (!selected_tune)
12944 selected_tune = selected_cpu;
12945
12946 if (aarch64_enable_bti == 2)
12947 {
12948 #ifdef TARGET_ENABLE_BTI
12949 aarch64_enable_bti = 1;
12950 #else
12951 aarch64_enable_bti = 0;
12952 #endif
12953 }
12954
12955 /* Return address signing is currently not supported for ILP32 targets. For
12956 LP64 targets use the configured option in the absence of a command-line
12957 option for -mbranch-protection. */
12958 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12959 {
12960 #ifdef TARGET_ENABLE_PAC_RET
12961 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12962 #else
12963 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12964 #endif
12965 }
12966
12967 #ifndef HAVE_AS_MABI_OPTION
12968 /* The compiler may have been configured with 2.23.* binutils, which does
12969 not have support for ILP32. */
12970 if (TARGET_ILP32)
12971 error ("assembler does not support %<-mabi=ilp32%>");
12972 #endif
12973
12974 /* Convert -msve-vector-bits to a VG count. */
12975 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12976
12977 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12978 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12979
12980 /* Make sure we properly set up the explicit options. */
12981 if ((aarch64_cpu_string && valid_cpu)
12982 || (aarch64_tune_string && valid_tune))
12983 gcc_assert (explicit_tune_core != aarch64_none);
12984
12985 if ((aarch64_cpu_string && valid_cpu)
12986 || (aarch64_arch_string && valid_arch))
12987 gcc_assert (explicit_arch != aarch64_no_arch);
12988
12989 /* The pass to insert speculation tracking runs before
12990 shrink-wrapping and the latter does not know how to update the
12991 tracking status. So disable it in this case. */
12992 if (aarch64_track_speculation)
12993 flag_shrink_wrap = 0;
12994
12995 aarch64_override_options_internal (&global_options);
12996
12997 /* Save these options as the default ones in case we push and pop them later
12998 while processing functions with potential target attributes. */
12999 target_option_default_node = target_option_current_node
13000 = build_target_option_node (&global_options);
13001 }
13002
13003 /* Implement targetm.override_options_after_change. */
13004
13005 static void
13006 aarch64_override_options_after_change (void)
13007 {
13008 aarch64_override_options_after_change_1 (&global_options);
13009 }
13010
13011 static struct machine_function *
13012 aarch64_init_machine_status (void)
13013 {
13014 struct machine_function *machine;
13015 machine = ggc_cleared_alloc<machine_function> ();
13016 return machine;
13017 }
13018
13019 void
13020 aarch64_init_expanders (void)
13021 {
13022 init_machine_status = aarch64_init_machine_status;
13023 }
13024
13025 /* Set aarch64_cmodel from the selected code model and PIC level. */
13026 static void
13027 initialize_aarch64_code_model (struct gcc_options *opts)
13028 {
13029 if (opts->x_flag_pic)
13030 {
13031 switch (opts->x_aarch64_cmodel_var)
13032 {
13033 case AARCH64_CMODEL_TINY:
13034 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
13035 break;
13036 case AARCH64_CMODEL_SMALL:
13037 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13038 aarch64_cmodel = (flag_pic == 2
13039 ? AARCH64_CMODEL_SMALL_PIC
13040 : AARCH64_CMODEL_SMALL_SPIC);
13041 #else
13042 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13043 #endif
13044 break;
13045 case AARCH64_CMODEL_LARGE:
13046 sorry ("code model %qs with %<-f%s%>", "large",
13047 opts->x_flag_pic > 1 ? "PIC" : "pic");
13048 break;
13049 default:
13050 gcc_unreachable ();
13051 }
13052 }
13053 else
13054 aarch64_cmodel = opts->x_aarch64_cmodel_var;
13055 }
13056
13057 /* Implement TARGET_OPTION_SAVE. */
13058
13059 static void
13060 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13061 {
13062 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13063 ptr->x_aarch64_branch_protection_string
13064 = opts->x_aarch64_branch_protection_string;
13065 }
13066
13067 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13068 using the information saved in PTR. */
13069
13070 static void
13071 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13072 {
13073 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13074 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13075 opts->x_explicit_arch = ptr->x_explicit_arch;
13076 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13077 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13078 opts->x_aarch64_branch_protection_string
13079 = ptr->x_aarch64_branch_protection_string;
13080 if (opts->x_aarch64_branch_protection_string)
13081 {
13082 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13083 NULL);
13084 }
13085
13086 aarch64_override_options_internal (opts);
13087 }
13088
13089 /* Implement TARGET_OPTION_PRINT. */
13090
13091 static void
13092 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13093 {
13094 const struct processor *cpu
13095 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13096 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13097 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13098 std::string extension
13099 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13100
13101 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13102 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13103 arch->name, extension.c_str ());
13104 }
13105
13106 static GTY(()) tree aarch64_previous_fndecl;
13107
13108 void
13109 aarch64_reset_previous_fndecl (void)
13110 {
13111 aarch64_previous_fndecl = NULL;
13112 }
13113
13114 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13115 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13116 make sure optab availability predicates are recomputed when necessary. */
13117
13118 void
13119 aarch64_save_restore_target_globals (tree new_tree)
13120 {
13121 if (TREE_TARGET_GLOBALS (new_tree))
13122 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13123 else if (new_tree == target_option_default_node)
13124 restore_target_globals (&default_target_globals);
13125 else
13126 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13127 }
13128
13129 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13130 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13131 of the function, if such exists. This function may be called multiple
13132 times on a single function so use aarch64_previous_fndecl to avoid
13133 setting up identical state. */
13134
13135 static void
13136 aarch64_set_current_function (tree fndecl)
13137 {
13138 if (!fndecl || fndecl == aarch64_previous_fndecl)
13139 return;
13140
13141 tree old_tree = (aarch64_previous_fndecl
13142 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13143 : NULL_TREE);
13144
13145 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13146
13147 /* If current function has no attributes but the previous one did,
13148 use the default node. */
13149 if (!new_tree && old_tree)
13150 new_tree = target_option_default_node;
13151
13152 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13153 the default have been handled by aarch64_save_restore_target_globals from
13154 aarch64_pragma_target_parse. */
13155 if (old_tree == new_tree)
13156 return;
13157
13158 aarch64_previous_fndecl = fndecl;
13159
13160 /* First set the target options. */
13161 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13162
13163 aarch64_save_restore_target_globals (new_tree);
13164 }
13165
13166 /* Enum describing the various ways we can handle attributes.
13167 In many cases we can reuse the generic option handling machinery. */
13168
13169 enum aarch64_attr_opt_type
13170 {
13171 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13172 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13173 aarch64_attr_enum, /* Attribute sets an enum variable. */
13174 aarch64_attr_custom /* Attribute requires a custom handling function. */
13175 };
13176
13177 /* All the information needed to handle a target attribute.
13178 NAME is the name of the attribute.
13179 ATTR_TYPE specifies the type of behavior of the attribute as described
13180 in the definition of enum aarch64_attr_opt_type.
13181 ALLOW_NEG is true if the attribute supports a "no-" form.
13182 HANDLER is the function that takes the attribute string as an argument.
13183 It is needed only when ATTR_TYPE is aarch64_attr_custom.
13184 OPT_NUM is the enum specifying the option that the attribute modifies.
13185 This is needed for attributes that mirror the behavior of a command-line
13186 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool
13187 or aarch64_attr_enum. */
13188
13189 struct aarch64_attribute_info
13190 {
13191 const char *name;
13192 enum aarch64_attr_opt_type attr_type;
13193 bool allow_neg;
13194 bool (*handler) (const char *);
13195 enum opt_code opt_num;
13196 };
13197
13198 /* Handle the ARCH_STR argument to the arch= target attribute. */
13199
13200 static bool
13201 aarch64_handle_attr_arch (const char *str)
13202 {
13203 const struct processor *tmp_arch = NULL;
13204 std::string invalid_extension;
13205 enum aarch64_parse_opt_result parse_res
13206 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13207
13208 if (parse_res == AARCH64_PARSE_OK)
13209 {
13210 gcc_assert (tmp_arch);
13211 selected_arch = tmp_arch;
13212 explicit_arch = selected_arch->arch;
13213 return true;
13214 }
13215
13216 switch (parse_res)
13217 {
13218 case AARCH64_PARSE_MISSING_ARG:
13219 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13220 break;
13221 case AARCH64_PARSE_INVALID_ARG:
13222 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13223 aarch64_print_hint_for_arch (str);
13224 break;
13225 case AARCH64_PARSE_INVALID_FEATURE:
13226 error ("invalid feature modifier %s of value (\"%s\") in "
13227 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13228 aarch64_print_hint_for_extensions (invalid_extension);
13229 break;
13230 default:
13231 gcc_unreachable ();
13232 }
13233
13234 return false;
13235 }
13236
13237 /* Handle the argument CPU_STR to the cpu= target attribute. */
13238
13239 static bool
13240 aarch64_handle_attr_cpu (const char *str)
13241 {
13242 const struct processor *tmp_cpu = NULL;
13243 std::string invalid_extension;
13244 enum aarch64_parse_opt_result parse_res
13245 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13246
13247 if (parse_res == AARCH64_PARSE_OK)
13248 {
13249 gcc_assert (tmp_cpu);
13250 selected_tune = tmp_cpu;
13251 explicit_tune_core = selected_tune->ident;
13252
13253 selected_arch = &all_architectures[tmp_cpu->arch];
13254 explicit_arch = selected_arch->arch;
13255 return true;
13256 }
13257
13258 switch (parse_res)
13259 {
13260 case AARCH64_PARSE_MISSING_ARG:
13261 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13262 break;
13263 case AARCH64_PARSE_INVALID_ARG:
13264 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13265 aarch64_print_hint_for_core (str);
13266 break;
13267 case AARCH64_PARSE_INVALID_FEATURE:
13268 error ("invalid feature modifier %s of value (\"%s\") in "
13269 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13270 aarch64_print_hint_for_extensions (invalid_extension);
13271 break;
13272 default:
13273 gcc_unreachable ();
13274 }
13275
13276 return false;
13277 }
13278
13279 /* Handle the argument STR to the branch-protection= attribute. */
13280
13281 static bool
13282 aarch64_handle_attr_branch_protection (const char* str)
13283 {
13284 char *err_str = (char *) xmalloc (strlen (str) + 1);
13285 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13286 &err_str);
13287 bool success = false;
13288 switch (res)
13289 {
13290 case AARCH64_PARSE_MISSING_ARG:
13291 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13292 " attribute");
13293 break;
13294 case AARCH64_PARSE_INVALID_ARG:
13295 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13296 "=\")%> pragma or attribute", err_str);
13297 break;
13298 case AARCH64_PARSE_OK:
13299 success = true;
13300 /* Fall through. */
13301 case AARCH64_PARSE_INVALID_FEATURE:
13302 break;
13303 default:
13304 gcc_unreachable ();
13305 }
13306 free (err_str);
13307 return success;
13308 }
13309
13310 /* Handle the argument STR to the tune= target attribute. */
13311
13312 static bool
13313 aarch64_handle_attr_tune (const char *str)
13314 {
13315 const struct processor *tmp_tune = NULL;
13316 enum aarch64_parse_opt_result parse_res
13317 = aarch64_parse_tune (str, &tmp_tune);
13318
13319 if (parse_res == AARCH64_PARSE_OK)
13320 {
13321 gcc_assert (tmp_tune);
13322 selected_tune = tmp_tune;
13323 explicit_tune_core = selected_tune->ident;
13324 return true;
13325 }
13326
13327 switch (parse_res)
13328 {
13329 case AARCH64_PARSE_INVALID_ARG:
13330 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13331 aarch64_print_hint_for_core (str);
13332 break;
13333 default:
13334 gcc_unreachable ();
13335 }
13336
13337 return false;
13338 }
13339
13340 /* Parse an architecture extensions target attribute string specified in STR.
13341 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13342 if successful. Update aarch64_isa_flags to reflect the ISA features
13343 modified. */
13344
13345 static bool
13346 aarch64_handle_attr_isa_flags (char *str)
13347 {
13348 enum aarch64_parse_opt_result parse_res;
13349 uint64_t isa_flags = aarch64_isa_flags;
13350
13351 /* We allow "+nothing" in the beginning to clear out all architectural
13352 features if the user wants to handpick specific features. */
13353 if (strncmp ("+nothing", str, 8) == 0)
13354 {
13355 isa_flags = 0;
13356 str += 8;
13357 }
13358
13359 std::string invalid_extension;
13360 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13361
13362 if (parse_res == AARCH64_PARSE_OK)
13363 {
13364 aarch64_isa_flags = isa_flags;
13365 return true;
13366 }
13367
13368 switch (parse_res)
13369 {
13370 case AARCH64_PARSE_MISSING_ARG:
13371 error ("missing value in %<target()%> pragma or attribute");
13372 break;
13373
13374 case AARCH64_PARSE_INVALID_FEATURE:
13375 error ("invalid feature modifier %s of value (\"%s\") in "
13376 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13377 break;
13378
13379 default:
13380 gcc_unreachable ();
13381 }
13382
13383 return false;
13384 }
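/* Illustrative use of the "+..." form handled above (a sketch, not taken
   from the testsuite):

     __attribute__ ((target ("+nothing+fp")))
     void fp_only (void);

   "+nothing" first clears aarch64_isa_flags and "+fp" then enables only the
   floating-point extension and whatever it implies.  */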
13385
13386 /* The target attributes that we support. On top of these we also support just
13387 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13388 handled explicitly in aarch64_process_one_target_attr. */
13389
13390 static const struct aarch64_attribute_info aarch64_attributes[] =
13391 {
13392 { "general-regs-only", aarch64_attr_mask, false, NULL,
13393 OPT_mgeneral_regs_only },
13394 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13395 OPT_mfix_cortex_a53_835769 },
13396 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13397 OPT_mfix_cortex_a53_843419 },
13398 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13399 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13400 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13401 OPT_momit_leaf_frame_pointer },
13402 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13403 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13404 OPT_march_ },
13405 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13406 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13407 OPT_mtune_ },
13408 { "branch-protection", aarch64_attr_custom, false,
13409 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13410 { "sign-return-address", aarch64_attr_enum, false, NULL,
13411 OPT_msign_return_address_ },
13412 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13413 };
13414
13415 /* Parse ARG_STR which contains the definition of one target attribute.
13416 Show appropriate errors if any or return true if the attribute is valid. */
13417
13418 static bool
13419 aarch64_process_one_target_attr (char *arg_str)
13420 {
13421 bool invert = false;
13422
13423 size_t len = strlen (arg_str);
13424
13425 if (len == 0)
13426 {
13427 error ("malformed %<target()%> pragma or attribute");
13428 return false;
13429 }
13430
13431 char *str_to_check = (char *) alloca (len + 1);
13432 strcpy (str_to_check, arg_str);
13433
13434 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13435 It is easier to detect and handle it explicitly here rather than going
13436 through the machinery for the rest of the target attributes in this
13437 function. */
13438 if (*str_to_check == '+')
13439 return aarch64_handle_attr_isa_flags (str_to_check);
13440
13441 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13442 {
13443 invert = true;
13444 str_to_check += 3;
13445 }
13446 char *arg = strchr (str_to_check, '=');
13447
13448 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13449 and point ARG to "foo". */
13450 if (arg)
13451 {
13452 *arg = '\0';
13453 arg++;
13454 }
13455 const struct aarch64_attribute_info *p_attr;
13456 bool found = false;
13457 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13458 {
13459 /* If the names don't match up, or the user has given an argument
13460 to an attribute that doesn't accept one, or didn't give an argument
13461 to an attribute that expects one, fail to match. */
13462 if (strcmp (str_to_check, p_attr->name) != 0)
13463 continue;
13464
13465 found = true;
13466 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13467 || p_attr->attr_type == aarch64_attr_enum;
13468
13469 if (attr_need_arg_p ^ (arg != NULL))
13470 {
13471 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13472 return false;
13473 }
13474
13475 /* If the name matches but the attribute does not allow "no-" versions
13476 then we can't match. */
13477 if (invert && !p_attr->allow_neg)
13478 {
13479 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13480 return false;
13481 }
13482
13483 switch (p_attr->attr_type)
13484 {
13485 /* Has a custom handler registered.
13486 For example, cpu=, arch=, tune=. */
13487 case aarch64_attr_custom:
13488 gcc_assert (p_attr->handler);
13489 if (!p_attr->handler (arg))
13490 return false;
13491 break;
13492
13493 /* Either set or unset a boolean option. */
13494 case aarch64_attr_bool:
13495 {
13496 struct cl_decoded_option decoded;
13497
13498 generate_option (p_attr->opt_num, NULL, !invert,
13499 CL_TARGET, &decoded);
13500 aarch64_handle_option (&global_options, &global_options_set,
13501 &decoded, input_location);
13502 break;
13503 }
13504 /* Set or unset a bit in the target_flags. aarch64_handle_option
13505 should know what mask to apply given the option number. */
13506 case aarch64_attr_mask:
13507 {
13508 struct cl_decoded_option decoded;
13509 /* We only need to specify the option number.
13510 aarch64_handle_option will know which mask to apply. */
13511 decoded.opt_index = p_attr->opt_num;
13512 decoded.value = !invert;
13513 aarch64_handle_option (&global_options, &global_options_set,
13514 &decoded, input_location);
13515 break;
13516 }
13517 /* Use the option setting machinery to set an option to an enum. */
13518 case aarch64_attr_enum:
13519 {
13520 gcc_assert (arg);
13521 bool valid;
13522 int value;
13523 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13524 &value, CL_TARGET);
13525 if (valid)
13526 {
13527 set_option (&global_options, NULL, p_attr->opt_num, value,
13528 NULL, DK_UNSPECIFIED, input_location,
13529 global_dc);
13530 }
13531 else
13532 {
13533 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13534 }
13535 break;
13536 }
13537 default:
13538 gcc_unreachable ();
13539 }
13540 }
13541
13542 /* If we reached here we either have found an attribute and validated
13543 it or didn't match any. If we matched an attribute but its arguments
13544 were malformed we will have returned false already. */
13545 return found;
13546 }
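/* Illustrative examples of how the routine above decomposes ARG_STR:

     "no-strict-align"  -> INVERT = true, name "strict-align", no argument
     "cmodel=small"     -> name "cmodel", ARG = "small" (aarch64_attr_enum)
     "arch=armv8-a+crc" -> name "arch", ARG = "armv8-a+crc" (custom handler)
     "+crc"             -> handed straight to aarch64_handle_attr_isa_flags  */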
13547
13548 /* Count how many times the character C appears in
13549 NULL-terminated string STR. */
13550
13551 static unsigned int
13552 num_occurences_in_str (char c, char *str)
13553 {
13554 unsigned int res = 0;
13555 while (*str != '\0')
13556 {
13557 if (*str == c)
13558 res++;
13559
13560 str++;
13561 }
13562
13563 return res;
13564 }
13565
13566 /* Parse the tree in ARGS that contains the target attribute information
13567 and update the global target options space. */
13568
13569 bool
13570 aarch64_process_target_attr (tree args)
13571 {
13572 if (TREE_CODE (args) == TREE_LIST)
13573 {
13574 do
13575 {
13576 tree head = TREE_VALUE (args);
13577 if (head)
13578 {
13579 if (!aarch64_process_target_attr (head))
13580 return false;
13581 }
13582 args = TREE_CHAIN (args);
13583 } while (args);
13584
13585 return true;
13586 }
13587
13588 if (TREE_CODE (args) != STRING_CST)
13589 {
13590 error ("attribute %<target%> argument not a string");
13591 return false;
13592 }
13593
13594 size_t len = strlen (TREE_STRING_POINTER (args));
13595 char *str_to_check = (char *) alloca (len + 1);
13596 strcpy (str_to_check, TREE_STRING_POINTER (args));
13597
13598 if (len == 0)
13599 {
13600 error ("malformed %<target()%> pragma or attribute");
13601 return false;
13602 }
13603
13604 /* Used to catch empty entries between commas, e.g.
13605 attribute ((target ("attr1,,attr2"))). */
13606 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13607
13608 /* Handle multiple target attributes separated by ','. */
13609 char *token = strtok_r (str_to_check, ",", &str_to_check);
13610
13611 unsigned int num_attrs = 0;
13612 while (token)
13613 {
13614 num_attrs++;
13615 if (!aarch64_process_one_target_attr (token))
13616 {
13617 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13618 return false;
13619 }
13620
13621 token = strtok_r (NULL, ",", &str_to_check);
13622 }
13623
13624 if (num_attrs != num_commas + 1)
13625 {
13626 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13627 return false;
13628 }
13629
13630 return true;
13631 }
13632
13633 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13634 process attribute ((target ("..."))). */
13635
13636 static bool
13637 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13638 {
13639 struct cl_target_option cur_target;
13640 bool ret;
13641 tree old_optimize;
13642 tree new_target, new_optimize;
13643 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13644
13645 /* If what we're processing is the current pragma string then the
13646 target option node is already stored in target_option_current_node
13647 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13648 having to re-parse the string. This is especially useful to keep
13649 arm_neon.h compile times down since that header contains a lot
13650 of intrinsics enclosed in pragmas. */
13651 if (!existing_target && args == current_target_pragma)
13652 {
13653 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13654 return true;
13655 }
13656 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13657
13658 old_optimize = build_optimization_node (&global_options);
13659 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13660
13661 /* If the function changed the optimization levels as well as setting
13662 target options, start with the optimizations specified. */
13663 if (func_optimize && func_optimize != old_optimize)
13664 cl_optimization_restore (&global_options,
13665 TREE_OPTIMIZATION (func_optimize));
13666
13667 /* Save the current target options to restore at the end. */
13668 cl_target_option_save (&cur_target, &global_options);
13669
13670 /* If fndecl already has some target attributes applied to it, unpack
13671 them so that we add this attribute on top of them, rather than
13672 overwriting them. */
13673 if (existing_target)
13674 {
13675 struct cl_target_option *existing_options
13676 = TREE_TARGET_OPTION (existing_target);
13677
13678 if (existing_options)
13679 cl_target_option_restore (&global_options, existing_options);
13680 }
13681 else
13682 cl_target_option_restore (&global_options,
13683 TREE_TARGET_OPTION (target_option_current_node));
13684
13685 ret = aarch64_process_target_attr (args);
13686
13687 /* Set up any additional state. */
13688 if (ret)
13689 {
13690 aarch64_override_options_internal (&global_options);
13691 /* Initialize SIMD builtins if we haven't already.
13692 Set current_target_pragma to NULL for the duration so that
13693 the builtin initialization code doesn't try to tag the functions
13694 being built with the attributes specified by any current pragma, thus
13695 going into an infinite recursion. */
13696 if (TARGET_SIMD)
13697 {
13698 tree saved_current_target_pragma = current_target_pragma;
13699 current_target_pragma = NULL;
13700 aarch64_init_simd_builtins ();
13701 current_target_pragma = saved_current_target_pragma;
13702 }
13703 new_target = build_target_option_node (&global_options);
13704 }
13705 else
13706 new_target = NULL;
13707
13708 new_optimize = build_optimization_node (&global_options);
13709
13710 if (fndecl && ret)
13711 {
13712 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13713
13714 if (old_optimize != new_optimize)
13715 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13716 }
13717
13718 cl_target_option_restore (&global_options, &cur_target);
13719
13720 if (old_optimize != new_optimize)
13721 cl_optimization_restore (&global_options,
13722 TREE_OPTIMIZATION (old_optimize));
13723 return ret;
13724 }
13725
13726 /* Helper for aarch64_can_inline_p. CALLER and CALLEE are tri-bool option
13727 values (yes, no, don't care), DONT_CARE is the "don't care" value and DEF
13728 the option's default; return true if inlining should be allowed. */
13729
13730 static bool
13731 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13732 int dont_care, int def)
13733 {
13734 /* If the callee doesn't care, always allow inlining. */
13735 if (callee == dont_care)
13736 return true;
13737
13738 /* If the caller doesn't care, always allow inlining. */
13739 if (caller == dont_care)
13740 return true;
13741
13742 /* Otherwise, allow inlining if either the callee and caller values
13743 agree, or if the callee is using the default value. */
13744 return (callee == caller || callee == def);
13745 }
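/* Worked example for the helper above with DONT_CARE = 2 and DEF = 1 (the
   values used for -momit-leaf-frame-pointer below): caller = 0, callee = 2
   allows inlining (the callee does not care); caller = 0, callee = 1 also
   allows it (the callee uses the default); caller = 1, callee = 0 rejects
   it.  */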
13746
13747 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13748 to inline CALLEE into CALLER based on target-specific info.
13749 Make sure that the caller and callee have compatible architectural
13750 features. Then go through the other possible target attributes
13751 and see if they can block inlining. Try not to reject always_inline
13752 callees unless they are incompatible architecturally. */
13753
13754 static bool
13755 aarch64_can_inline_p (tree caller, tree callee)
13756 {
13757 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13758 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13759
13760 struct cl_target_option *caller_opts
13761 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13762 : target_option_default_node);
13763
13764 struct cl_target_option *callee_opts
13765 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13766 : target_option_default_node);
13767
13768 /* Callee's ISA flags should be a subset of the caller's. */
13769 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13770 != callee_opts->x_aarch64_isa_flags)
13771 return false;
13772
13773 /* Allow non-strict aligned functions inlining into strict
13774 aligned ones. */
13775 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13776 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13777 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13778 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13779 return false;
13780
13781 bool always_inline = lookup_attribute ("always_inline",
13782 DECL_ATTRIBUTES (callee));
13783
13784 /* If the architectural features match up and the callee is always_inline
13785 then the other attributes don't matter. */
13786 if (always_inline)
13787 return true;
13788
13789 if (caller_opts->x_aarch64_cmodel_var
13790 != callee_opts->x_aarch64_cmodel_var)
13791 return false;
13792
13793 if (caller_opts->x_aarch64_tls_dialect
13794 != callee_opts->x_aarch64_tls_dialect)
13795 return false;
13796
13797 /* Honour explicit requests to work around errata. */
13798 if (!aarch64_tribools_ok_for_inlining_p (
13799 caller_opts->x_aarch64_fix_a53_err835769,
13800 callee_opts->x_aarch64_fix_a53_err835769,
13801 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13802 return false;
13803
13804 if (!aarch64_tribools_ok_for_inlining_p (
13805 caller_opts->x_aarch64_fix_a53_err843419,
13806 callee_opts->x_aarch64_fix_a53_err843419,
13807 2, TARGET_FIX_ERR_A53_843419))
13808 return false;
13809
13810 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13811 caller and callee and they don't match up, reject inlining. */
13812 if (!aarch64_tribools_ok_for_inlining_p (
13813 caller_opts->x_flag_omit_leaf_frame_pointer,
13814 callee_opts->x_flag_omit_leaf_frame_pointer,
13815 2, 1))
13816 return false;
13817
13818 /* If the callee has specific tuning overrides, respect them. */
13819 if (callee_opts->x_aarch64_override_tune_string != NULL
13820 && caller_opts->x_aarch64_override_tune_string == NULL)
13821 return false;
13822
13823 /* If the user specified tuning override strings for the
13824 caller and callee and they don't match up, reject inlining.
13825 We just do a string compare here, we don't analyze the meaning
13826 of the string, as it would be too costly for little gain. */
13827 if (callee_opts->x_aarch64_override_tune_string
13828 && caller_opts->x_aarch64_override_tune_string
13829 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13830 caller_opts->x_aarch64_override_tune_string) != 0))
13831 return false;
13832
13833 return true;
13834 }
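/* Illustrative consequence of the ISA-subset check above (a sketch assuming
   a translation unit otherwise compiled without SVE):

     __attribute__ ((target ("+sve")))
     static int callee (void) { return 0; }
     int caller (void) { return callee (); }

   The callee's ISA flags include SVE while the caller's do not, so the
   subset test fails and the call is never inlined, even if the callee were
   marked always_inline.  */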
13835
13836 /* Return true if SYMBOL_REF X binds locally. */
13837
13838 static bool
13839 aarch64_symbol_binds_local_p (const_rtx x)
13840 {
13841 return (SYMBOL_REF_DECL (x)
13842 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13843 : SYMBOL_REF_LOCAL_P (x));
13844 }
13845
13846 /* Return true if SYMBOL_REF X is thread-local. */
13847 static bool
13848 aarch64_tls_symbol_p (rtx x)
13849 {
13850 if (! TARGET_HAVE_TLS)
13851 return false;
13852
13853 if (GET_CODE (x) != SYMBOL_REF)
13854 return false;
13855
13856 return SYMBOL_REF_TLS_MODEL (x) != 0;
13857 }
13858
13859 /* Classify a TLS symbol into one of the TLS kinds. */
13860 enum aarch64_symbol_type
13861 aarch64_classify_tls_symbol (rtx x)
13862 {
13863 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13864
13865 switch (tls_kind)
13866 {
13867 case TLS_MODEL_GLOBAL_DYNAMIC:
13868 case TLS_MODEL_LOCAL_DYNAMIC:
13869 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13870
13871 case TLS_MODEL_INITIAL_EXEC:
13872 switch (aarch64_cmodel)
13873 {
13874 case AARCH64_CMODEL_TINY:
13875 case AARCH64_CMODEL_TINY_PIC:
13876 return SYMBOL_TINY_TLSIE;
13877 default:
13878 return SYMBOL_SMALL_TLSIE;
13879 }
13880
13881 case TLS_MODEL_LOCAL_EXEC:
13882 if (aarch64_tls_size == 12)
13883 return SYMBOL_TLSLE12;
13884 else if (aarch64_tls_size == 24)
13885 return SYMBOL_TLSLE24;
13886 else if (aarch64_tls_size == 32)
13887 return SYMBOL_TLSLE32;
13888 else if (aarch64_tls_size == 48)
13889 return SYMBOL_TLSLE48;
13890 else
13891 gcc_unreachable ();
13892
13893 case TLS_MODEL_EMULATED:
13894 case TLS_MODEL_NONE:
13895 return SYMBOL_FORCE_TO_MEM;
13896
13897 default:
13898 gcc_unreachable ();
13899 }
13900 }
13901
13902 /* Return the correct method for accessing X + OFFSET, where X is either
13903 a SYMBOL_REF or LABEL_REF. */
13904
13905 enum aarch64_symbol_type
13906 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13907 {
13908 if (GET_CODE (x) == LABEL_REF)
13909 {
13910 switch (aarch64_cmodel)
13911 {
13912 case AARCH64_CMODEL_LARGE:
13913 return SYMBOL_FORCE_TO_MEM;
13914
13915 case AARCH64_CMODEL_TINY_PIC:
13916 case AARCH64_CMODEL_TINY:
13917 return SYMBOL_TINY_ABSOLUTE;
13918
13919 case AARCH64_CMODEL_SMALL_SPIC:
13920 case AARCH64_CMODEL_SMALL_PIC:
13921 case AARCH64_CMODEL_SMALL:
13922 return SYMBOL_SMALL_ABSOLUTE;
13923
13924 default:
13925 gcc_unreachable ();
13926 }
13927 }
13928
13929 if (GET_CODE (x) == SYMBOL_REF)
13930 {
13931 if (aarch64_tls_symbol_p (x))
13932 return aarch64_classify_tls_symbol (x);
13933
13934 switch (aarch64_cmodel)
13935 {
13936 case AARCH64_CMODEL_TINY:
13937 /* When we retrieve symbol + offset address, we have to make sure
13938 the offset does not cause overflow of the final address. But
13939 we have no way of knowing the address of symbol at compile time
13940 so we can't accurately say if the distance between the PC and
13941 symbol + offset is outside the addressable range of +/-1M in the
13942 TINY code model. So we rely on images not being greater than
13943 1M and cap the offset at 1M and anything beyond 1M will have to
13944 be loaded using an alternative mechanism. Furthermore if the
13945 symbol is a weak reference to something that isn't known to
13946 resolve to a symbol in this module, then force to memory. */
13947 if ((SYMBOL_REF_WEAK (x)
13948 && !aarch64_symbol_binds_local_p (x))
13949 || !IN_RANGE (offset, -1048575, 1048575))
13950 return SYMBOL_FORCE_TO_MEM;
13951 return SYMBOL_TINY_ABSOLUTE;
13952
13953 case AARCH64_CMODEL_SMALL:
13954 /* Same reasoning as the tiny code model, but the offset cap here is
13955 4G. */
13956 if ((SYMBOL_REF_WEAK (x)
13957 && !aarch64_symbol_binds_local_p (x))
13958 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13959 HOST_WIDE_INT_C (4294967264)))
13960 return SYMBOL_FORCE_TO_MEM;
13961 return SYMBOL_SMALL_ABSOLUTE;
13962
13963 case AARCH64_CMODEL_TINY_PIC:
13964 if (!aarch64_symbol_binds_local_p (x))
13965 return SYMBOL_TINY_GOT;
13966 return SYMBOL_TINY_ABSOLUTE;
13967
13968 case AARCH64_CMODEL_SMALL_SPIC:
13969 case AARCH64_CMODEL_SMALL_PIC:
13970 if (!aarch64_symbol_binds_local_p (x))
13971 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13972 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13973 return SYMBOL_SMALL_ABSOLUTE;
13974
13975 case AARCH64_CMODEL_LARGE:
13976 /* This is alright even in PIC code as the constant
13977 pool reference is always PC relative and within
13978 the same translation unit. */
13979 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13980 return SYMBOL_SMALL_ABSOLUTE;
13981 else
13982 return SYMBOL_FORCE_TO_MEM;
13983
13984 default:
13985 gcc_unreachable ();
13986 }
13987 }
13988
13989 /* By default push everything into the constant pool. */
13990 return SYMBOL_FORCE_TO_MEM;
13991 }
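/* Worked example of the offset capping above: under -mcmodel=tiny a
   reference such as "ext_array + 0x200000" (2 MB past the symbol) lies
   outside the +/-1M window checked by IN_RANGE, so it is classified as
   SYMBOL_FORCE_TO_MEM and loaded from the constant pool rather than
   addressed directly relative to the PC.  */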
13992
13993 bool
13994 aarch64_constant_address_p (rtx x)
13995 {
13996 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13997 }
13998
13999 bool
14000 aarch64_legitimate_pic_operand_p (rtx x)
14001 {
14002 if (GET_CODE (x) == SYMBOL_REF
14003 || (GET_CODE (x) == CONST
14004 && GET_CODE (XEXP (x, 0)) == PLUS
14005 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
14006 return false;
14007
14008 return true;
14009 }
14010
14011 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14012 that should be rematerialized rather than spilled. */
14013
14014 static bool
14015 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
14016 {
14017 /* Support CSE and rematerialization of common constants. */
14018 if (CONST_INT_P (x)
14019 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
14020 || GET_CODE (x) == CONST_VECTOR)
14021 return true;
14022
14023 /* Do not allow vector struct mode constants for Advanced SIMD.
14024 We could support 0 and -1 easily, but they need support in
14025 aarch64-simd.md. */
14026 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14027 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14028 return false;
14029
14030 /* Only accept variable-length vector constants if they can be
14031 handled directly.
14032
14033 ??? It would be possible to handle rematerialization of other
14034 constants via secondary reloads. */
14035 if (vec_flags & VEC_ANY_SVE)
14036 return aarch64_simd_valid_immediate (x, NULL);
14037
14038 if (GET_CODE (x) == HIGH)
14039 x = XEXP (x, 0);
14040
14041 /* Accept polynomial constants that can be calculated by using the
14042 destination of a move as the sole temporary. Constants that
14043 require a second temporary cannot be rematerialized (they can't be
14044 forced to memory and also aren't legitimate constants). */
14045 poly_int64 offset;
14046 if (poly_int_rtx_p (x, &offset))
14047 return aarch64_offset_temporaries (false, offset) <= 1;
14048
14049 /* If an offset is being added to something else, we need to allow the
14050 base to be moved into the destination register, meaning that there
14051 are no free temporaries for the offset. */
14052 x = strip_offset (x, &offset);
14053 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14054 return false;
14055
14056 /* Do not allow const (plus (anchor_symbol, const_int)). */
14057 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14058 return false;
14059
14060 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14061 so spilling them is better than rematerialization. */
14062 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14063 return true;
14064
14065 /* Label references are always constant. */
14066 if (GET_CODE (x) == LABEL_REF)
14067 return true;
14068
14069 return false;
14070 }
14071
14072 rtx
14073 aarch64_load_tp (rtx target)
14074 {
14075 if (!target
14076 || GET_MODE (target) != Pmode
14077 || !register_operand (target, Pmode))
14078 target = gen_reg_rtx (Pmode);
14079
14080 /* Can return in any reg. */
14081 emit_insn (gen_aarch64_load_tp_hard (target));
14082 return target;
14083 }
14084
14085 /* On AAPCS systems, this is the "struct __va_list". */
14086 static GTY(()) tree va_list_type;
14087
14088 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14089 Return the type to use as __builtin_va_list.
14090
14091 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14092
14093 struct __va_list
14094 {
14095 void *__stack;
14096 void *__gr_top;
14097 void *__vr_top;
14098 int __gr_offs;
14099 int __vr_offs;
14100 }; */
14101
14102 static tree
14103 aarch64_build_builtin_va_list (void)
14104 {
14105 tree va_list_name;
14106 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14107
14108 /* Create the type. */
14109 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14110 /* Give it the required name. */
14111 va_list_name = build_decl (BUILTINS_LOCATION,
14112 TYPE_DECL,
14113 get_identifier ("__va_list"),
14114 va_list_type);
14115 DECL_ARTIFICIAL (va_list_name) = 1;
14116 TYPE_NAME (va_list_type) = va_list_name;
14117 TYPE_STUB_DECL (va_list_type) = va_list_name;
14118
14119 /* Create the fields. */
14120 f_stack = build_decl (BUILTINS_LOCATION,
14121 FIELD_DECL, get_identifier ("__stack"),
14122 ptr_type_node);
14123 f_grtop = build_decl (BUILTINS_LOCATION,
14124 FIELD_DECL, get_identifier ("__gr_top"),
14125 ptr_type_node);
14126 f_vrtop = build_decl (BUILTINS_LOCATION,
14127 FIELD_DECL, get_identifier ("__vr_top"),
14128 ptr_type_node);
14129 f_groff = build_decl (BUILTINS_LOCATION,
14130 FIELD_DECL, get_identifier ("__gr_offs"),
14131 integer_type_node);
14132 f_vroff = build_decl (BUILTINS_LOCATION,
14133 FIELD_DECL, get_identifier ("__vr_offs"),
14134 integer_type_node);
14135
14136 /* Tell the tree-stdarg pass about our internal offset fields.
14137 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14138 purposes, to identify whether the code is updating the va_list internal
14139 offset fields in an irregular way. */
14140 va_list_gpr_counter_field = f_groff;
14141 va_list_fpr_counter_field = f_vroff;
14142
14143 DECL_ARTIFICIAL (f_stack) = 1;
14144 DECL_ARTIFICIAL (f_grtop) = 1;
14145 DECL_ARTIFICIAL (f_vrtop) = 1;
14146 DECL_ARTIFICIAL (f_groff) = 1;
14147 DECL_ARTIFICIAL (f_vroff) = 1;
14148
14149 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14150 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14151 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14152 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14153 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14154
14155 TYPE_FIELDS (va_list_type) = f_stack;
14156 DECL_CHAIN (f_stack) = f_grtop;
14157 DECL_CHAIN (f_grtop) = f_vrtop;
14158 DECL_CHAIN (f_vrtop) = f_groff;
14159 DECL_CHAIN (f_groff) = f_vroff;
14160
14161 /* Compute its layout. */
14162 layout_type (va_list_type);
14163
14164 return va_list_type;
14165 }
14166
14167 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14168 static void
14169 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14170 {
14171 const CUMULATIVE_ARGS *cum;
14172 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14173 tree stack, grtop, vrtop, groff, vroff;
14174 tree t;
14175 int gr_save_area_size = cfun->va_list_gpr_size;
14176 int vr_save_area_size = cfun->va_list_fpr_size;
14177 int vr_offset;
14178
14179 cum = &crtl->args.info;
14180 if (cfun->va_list_gpr_size)
14181 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14182 cfun->va_list_gpr_size);
14183 if (cfun->va_list_fpr_size)
14184 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14185 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14186
14187 if (!TARGET_FLOAT)
14188 {
14189 gcc_assert (cum->aapcs_nvrn == 0);
14190 vr_save_area_size = 0;
14191 }
14192
14193 f_stack = TYPE_FIELDS (va_list_type_node);
14194 f_grtop = DECL_CHAIN (f_stack);
14195 f_vrtop = DECL_CHAIN (f_grtop);
14196 f_groff = DECL_CHAIN (f_vrtop);
14197 f_vroff = DECL_CHAIN (f_groff);
14198
14199 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14200 NULL_TREE);
14201 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14202 NULL_TREE);
14203 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14204 NULL_TREE);
14205 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14206 NULL_TREE);
14207 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14208 NULL_TREE);
14209
14210 /* Emit code to initialize STACK, which points to the next varargs stack
14211 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14212 by named arguments. STACK is 8-byte aligned. */
14213 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14214 if (cum->aapcs_stack_size > 0)
14215 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14216 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14217 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14218
14219 /* Emit code to initialize GRTOP, the top of the GR save area.
14220 virtual_incoming_args_rtx should have been 16-byte aligned. */
14221 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14222 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14223 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14224
14225 /* Emit code to initialize VRTOP, the top of the VR save area.
14226 This address is gr_save_area_bytes below GRTOP, rounded
14227 down to the next 16-byte boundary. */
14228 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14229 vr_offset = ROUND_UP (gr_save_area_size,
14230 STACK_BOUNDARY / BITS_PER_UNIT);
14231
14232 if (vr_offset)
14233 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14234 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14235 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14236
14237 /* Emit code to initialize GROFF, the offset from GRTOP of the
14238 next GPR argument. */
14239 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14240 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14241 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14242
14243 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14244 of the next VR argument. */
14245 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14246 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14247 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14248 }
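
/* As a rough illustration only (hand-written pseudo C, not the exact trees
   built above): for a callee whose named arguments consumed NCRN of the 8
   general argument registers, NVRN of the 8 vector argument registers and
   STK bytes of stack, and assuming tree-stdarg does not shrink the save
   areas, the code emitted here behaves approximately like

     ap.__stack = incoming_args + STK;                // next stack argument
     ap.__gr_top = incoming_args;                     // end of the GR save area
     ap.__vr_top = incoming_args
                   - ROUND_UP ((8 - NCRN) * 8, 16);   // end of the VR save area
     ap.__gr_offs = -(8 - NCRN) * 8;
     ap.__vr_offs = -(8 - NVRN) * 16;

   where incoming_args stands for virtual_incoming_args_rtx and the save
   areas themselves are filled in by aarch64_setup_incoming_varargs below.  */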
14249
14250 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14251
14252 static tree
14253 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14254 gimple_seq *post_p ATTRIBUTE_UNUSED)
14255 {
14256 tree addr;
14257 bool indirect_p;
14258 bool is_ha; /* is HFA or HVA. */
14259 bool dw_align; /* double-word align. */
14260 machine_mode ag_mode = VOIDmode;
14261 int nregs;
14262 machine_mode mode;
14263
14264 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14265 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14266 HOST_WIDE_INT size, rsize, adjust, align;
14267 tree t, u, cond1, cond2;
14268
14269 indirect_p = pass_va_arg_by_reference (type);
14270 if (indirect_p)
14271 type = build_pointer_type (type);
14272
14273 mode = TYPE_MODE (type);
14274
14275 f_stack = TYPE_FIELDS (va_list_type_node);
14276 f_grtop = DECL_CHAIN (f_stack);
14277 f_vrtop = DECL_CHAIN (f_grtop);
14278 f_groff = DECL_CHAIN (f_vrtop);
14279 f_vroff = DECL_CHAIN (f_groff);
14280
14281 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14282 f_stack, NULL_TREE);
14283 size = int_size_in_bytes (type);
14284
14285 bool abi_break;
14286 align
14287 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14288
14289 dw_align = false;
14290 adjust = 0;
14291 if (aarch64_vfp_is_call_or_return_candidate (mode,
14292 type,
14293 &ag_mode,
14294 &nregs,
14295 &is_ha))
14296 {
14297 /* No frontends can create types with variable-sized modes, so we
14298 shouldn't be asked to pass or return them. */
14299 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14300
14301 /* TYPE passed in fp/simd registers. */
14302 if (!TARGET_FLOAT)
14303 aarch64_err_no_fpadvsimd (mode);
14304
14305 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14306 unshare_expr (valist), f_vrtop, NULL_TREE);
14307 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14308 unshare_expr (valist), f_vroff, NULL_TREE);
14309
14310 rsize = nregs * UNITS_PER_VREG;
14311
14312 if (is_ha)
14313 {
14314 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14315 adjust = UNITS_PER_VREG - ag_size;
14316 }
14317 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14318 && size < UNITS_PER_VREG)
14319 {
14320 adjust = UNITS_PER_VREG - size;
14321 }
14322 }
14323 else
14324 {
14325 /* TYPE passed in general registers. */
14326 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14327 unshare_expr (valist), f_grtop, NULL_TREE);
14328 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14329 unshare_expr (valist), f_groff, NULL_TREE);
14330 rsize = ROUND_UP (size, UNITS_PER_WORD);
14331 nregs = rsize / UNITS_PER_WORD;
14332
14333 if (align > 8)
14334 {
14335 if (abi_break && warn_psabi)
14336 inform (input_location, "parameter passing for argument of type "
14337 "%qT changed in GCC 9.1", type);
14338 dw_align = true;
14339 }
14340
14341 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14342 && size < UNITS_PER_WORD)
14343 {
14344 adjust = UNITS_PER_WORD - size;
14345 }
14346 }
14347
14348 /* Get a local temporary for the field value. */
14349 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14350
14351 /* Emit code to branch if off >= 0. */
14352 t = build2 (GE_EXPR, boolean_type_node, off,
14353 build_int_cst (TREE_TYPE (off), 0));
14354 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14355
14356 if (dw_align)
14357 {
14358 /* Emit: offs = (offs + 15) & -16. */
14359 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14360 build_int_cst (TREE_TYPE (off), 15));
14361 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14362 build_int_cst (TREE_TYPE (off), -16));
14363 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14364 }
14365 else
14366 roundup = NULL;
14367
14368 /* Update ap.__[g|v]r_offs */
14369 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14370 build_int_cst (TREE_TYPE (off), rsize));
14371 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14372
14373 /* String up. */
14374 if (roundup)
14375 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14376
14377 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14378 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14379 build_int_cst (TREE_TYPE (f_off), 0));
14380 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14381
14382 /* String up: make sure the assignment happens before the use. */
14383 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14384 COND_EXPR_ELSE (cond1) = t;
14385
14386 /* Prepare the trees handling the argument that is passed on the stack;
14387 the top level node will store in ON_STACK. */
14388 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14389 if (align > 8)
14390 {
14391 /* if (alignof(type) > 8) arg = (arg + 15) & -16; */
14392 t = fold_build_pointer_plus_hwi (arg, 15);
14393 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14394 build_int_cst (TREE_TYPE (t), -16));
14395 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14396 }
14397 else
14398 roundup = NULL;
14399 /* Advance ap.__stack */
14400 t = fold_build_pointer_plus_hwi (arg, size + 7);
14401 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14402 build_int_cst (TREE_TYPE (t), -8));
14403 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14404 /* String up roundup and advance. */
14405 if (roundup)
14406 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14407 /* String up with arg */
14408 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14409 /* Big-endianness related address adjustment. */
14410 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14411 && size < UNITS_PER_WORD)
14412 {
14413 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14414 size_int (UNITS_PER_WORD - size));
14415 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14416 }
14417
14418 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14419 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14420
14421 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14422 t = off;
14423 if (adjust)
14424 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14425 build_int_cst (TREE_TYPE (off), adjust));
14426
14427 t = fold_convert (sizetype, t);
14428 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14429
14430 if (is_ha)
14431 {
14432 /* type ha; // treat as "struct {ftype field[n];}"
14433 ... [computing offs]
14434 for (i = 0; i < nregs; ++i, offs += 16)
14435 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14436 return ha; */
14437 int i;
14438 tree tmp_ha, field_t, field_ptr_t;
14439
14440 /* Declare a local variable. */
14441 tmp_ha = create_tmp_var_raw (type, "ha");
14442 gimple_add_tmp_var (tmp_ha);
14443
14444 /* Establish the base type. */
14445 switch (ag_mode)
14446 {
14447 case E_SFmode:
14448 field_t = float_type_node;
14449 field_ptr_t = float_ptr_type_node;
14450 break;
14451 case E_DFmode:
14452 field_t = double_type_node;
14453 field_ptr_t = double_ptr_type_node;
14454 break;
14455 case E_TFmode:
14456 field_t = long_double_type_node;
14457 field_ptr_t = long_double_ptr_type_node;
14458 break;
14459 case E_HFmode:
14460 field_t = aarch64_fp16_type_node;
14461 field_ptr_t = aarch64_fp16_ptr_type_node;
14462 break;
14463 case E_V2SImode:
14464 case E_V4SImode:
14465 {
14466 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14467 field_t = build_vector_type_for_mode (innertype, ag_mode);
14468 field_ptr_t = build_pointer_type (field_t);
14469 }
14470 break;
14471 default:
14472 gcc_assert (0);
14473 }
14474
14475 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14476 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14477 addr = t;
14478 t = fold_convert (field_ptr_t, addr);
14479 t = build2 (MODIFY_EXPR, field_t,
14480 build1 (INDIRECT_REF, field_t, tmp_ha),
14481 build1 (INDIRECT_REF, field_t, t));
14482
14483 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14484 for (i = 1; i < nregs; ++i)
14485 {
14486 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14487 u = fold_convert (field_ptr_t, addr);
14488 u = build2 (MODIFY_EXPR, field_t,
14489 build2 (MEM_REF, field_t, tmp_ha,
14490 build_int_cst (field_ptr_t,
14491 (i *
14492 int_size_in_bytes (field_t)))),
14493 build1 (INDIRECT_REF, field_t, u));
14494 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14495 }
14496
14497 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14498 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14499 }
14500
14501 COND_EXPR_ELSE (cond2) = t;
14502 addr = fold_convert (build_pointer_type (type), cond1);
14503 addr = build_va_arg_indirect_ref (addr);
14504
14505 if (indirect_p)
14506 addr = build_va_arg_indirect_ref (addr);
14507
14508 return addr;
14509 }
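
/* For orientation, the trees built above implement (roughly) the standard
   AAPCS64 va_arg algorithm, shown here as hand-written pseudo C for a
   general-register candidate; the FP/SIMD case is the same with __vr_top,
   __vr_offs and 16-byte register slots:

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                      // GR save area already exhausted
     ap.__gr_offs = off + rsize;           // claim the register slot(s)
     if (ap.__gr_offs > 0)
       goto on_stack;                      // would overflow the save area
     addr = ap.__gr_top + off;             // read from the register save area
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (addr + size + 7) & -8;  // advance past the stack slot
   done:
     result = *(type *) addr;

   Double-word alignment, the big-endian adjustments and the homogeneous
   aggregate copy are layered on top of this basic shape.  */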
14510
14511 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14512
14513 static void
14514 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
14515 const function_arg_info &arg,
14516 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
14517 {
14518 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14519 CUMULATIVE_ARGS local_cum;
14520 int gr_saved = cfun->va_list_gpr_size;
14521 int vr_saved = cfun->va_list_fpr_size;
14522
14523 /* The caller has advanced CUM up to, but not beyond, the last named
14524 argument. Advance a local copy of CUM past the last "real" named
14525 argument, to find out how many registers are left over. */
14526 local_cum = *cum;
14527 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
14528
14529 /* Find out how many registers we need to save.
14530 Honor tree-stdarg analysis results. */
14531 if (cfun->va_list_gpr_size)
14532 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14533 cfun->va_list_gpr_size / UNITS_PER_WORD);
14534 if (cfun->va_list_fpr_size)
14535 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14536 cfun->va_list_fpr_size / UNITS_PER_VREG);
14537
14538 if (!TARGET_FLOAT)
14539 {
14540 gcc_assert (local_cum.aapcs_nvrn == 0);
14541 vr_saved = 0;
14542 }
14543
14544 if (!no_rtl)
14545 {
14546 if (gr_saved > 0)
14547 {
14548 rtx ptr, mem;
14549
14550 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14551 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14552 - gr_saved * UNITS_PER_WORD);
14553 mem = gen_frame_mem (BLKmode, ptr);
14554 set_mem_alias_set (mem, get_varargs_alias_set ());
14555
14556 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14557 mem, gr_saved);
14558 }
14559 if (vr_saved > 0)
14560 {
14561 /* We can't use move_block_from_reg, because it will use
14562 the wrong mode, storing D regs only. */
14563 machine_mode mode = TImode;
14564 int off, i, vr_start;
14565
14566 /* Set OFF to the offset from virtual_incoming_args_rtx of
14567 the first vector register. The VR save area lies below
14568 the GR one, and is aligned to 16 bytes. */
14569 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14570 STACK_BOUNDARY / BITS_PER_UNIT);
14571 off -= vr_saved * UNITS_PER_VREG;
14572
14573 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14574 for (i = 0; i < vr_saved; ++i)
14575 {
14576 rtx ptr, mem;
14577
14578 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14579 mem = gen_frame_mem (mode, ptr);
14580 set_mem_alias_set (mem, get_varargs_alias_set ());
14581 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14582 off += UNITS_PER_VREG;
14583 }
14584 }
14585 }
14586
14587 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14588 any complication of having crtl->args.pretend_args_size changed. */
14589 cfun->machine->frame.saved_varargs_size
14590 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14591 STACK_BOUNDARY / BITS_PER_UNIT)
14592 + vr_saved * UNITS_PER_VREG);
14593 }
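
/* The anonymous-argument save area built above sits immediately below the
   incoming stack arguments.  Roughly, with offsets taken relative to
   virtual_incoming_args_rtx, GR = 8 * gr_saved, GR16 = ROUND_UP (GR, 16)
   and VR = 16 * vr_saved:

       [0, ...)              incoming stack arguments
       [-GR, 0)              x(8 - gr_saved) .. x7        (GR save area)
       [-GR16 - VR, -GR16)   q(8 - vr_saved) .. q7        (VR save area)

   saved_varargs_size records the total (GR16 + VR) so that the prologue
   can allocate the space.  */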
14594
14595 static void
14596 aarch64_conditional_register_usage (void)
14597 {
14598 int i;
14599 if (!TARGET_FLOAT)
14600 {
14601 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14602 {
14603 fixed_regs[i] = 1;
14604 call_used_regs[i] = 1;
14605 }
14606 }
14607 if (!TARGET_SVE)
14608 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14609 {
14610 fixed_regs[i] = 1;
14611 call_used_regs[i] = 1;
14612 }
14613
14614 /* When tracking speculation, we need a couple of call-clobbered registers
14615 to track the speculation state. It would be nice to just use
14616 IP0 and IP1, but currently there are numerous places that just
14617 assume these registers are free for other uses (e.g. pointer
14618 authentication). */
14619 if (aarch64_track_speculation)
14620 {
14621 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14622 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14623 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14624 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14625 }
14626 }
14627
14628 /* Walk down the type tree of TYPE counting consecutive base elements.
14629 If *MODEP is VOIDmode, then set it to the first valid floating point
14630 type. If a non-floating point type is found, or if a floating point
14631 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14632 otherwise return the count in the sub-tree. */
14633 static int
14634 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14635 {
14636 machine_mode mode;
14637 HOST_WIDE_INT size;
14638
14639 switch (TREE_CODE (type))
14640 {
14641 case REAL_TYPE:
14642 mode = TYPE_MODE (type);
14643 if (mode != DFmode && mode != SFmode
14644 && mode != TFmode && mode != HFmode)
14645 return -1;
14646
14647 if (*modep == VOIDmode)
14648 *modep = mode;
14649
14650 if (*modep == mode)
14651 return 1;
14652
14653 break;
14654
14655 case COMPLEX_TYPE:
14656 mode = TYPE_MODE (TREE_TYPE (type));
14657 if (mode != DFmode && mode != SFmode
14658 && mode != TFmode && mode != HFmode)
14659 return -1;
14660
14661 if (*modep == VOIDmode)
14662 *modep = mode;
14663
14664 if (*modep == mode)
14665 return 2;
14666
14667 break;
14668
14669 case VECTOR_TYPE:
14670 /* Use V2SImode and V4SImode as representatives of all 64-bit
14671 and 128-bit vector types. */
14672 size = int_size_in_bytes (type);
14673 switch (size)
14674 {
14675 case 8:
14676 mode = V2SImode;
14677 break;
14678 case 16:
14679 mode = V4SImode;
14680 break;
14681 default:
14682 return -1;
14683 }
14684
14685 if (*modep == VOIDmode)
14686 *modep = mode;
14687
14688 /* Vector modes are considered to be opaque: two vectors are
14689 equivalent for the purposes of being homogeneous aggregates
14690 if they are the same size. */
14691 if (*modep == mode)
14692 return 1;
14693
14694 break;
14695
14696 case ARRAY_TYPE:
14697 {
14698 int count;
14699 tree index = TYPE_DOMAIN (type);
14700
14701 /* Can't handle incomplete types or sizes that are not
14702 fixed. */
14703 if (!COMPLETE_TYPE_P (type)
14704 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14705 return -1;
14706
14707 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14708 if (count == -1
14709 || !index
14710 || !TYPE_MAX_VALUE (index)
14711 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14712 || !TYPE_MIN_VALUE (index)
14713 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14714 || count < 0)
14715 return -1;
14716
14717 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14718 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14719
14720 /* There must be no padding. */
14721 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14722 count * GET_MODE_BITSIZE (*modep)))
14723 return -1;
14724
14725 return count;
14726 }
14727
14728 case RECORD_TYPE:
14729 {
14730 int count = 0;
14731 int sub_count;
14732 tree field;
14733
14734 /* Can't handle incomplete types or sizes that are not
14735 fixed. */
14736 if (!COMPLETE_TYPE_P (type)
14737 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14738 return -1;
14739
14740 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14741 {
14742 if (TREE_CODE (field) != FIELD_DECL)
14743 continue;
14744
14745 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14746 if (sub_count < 0)
14747 return -1;
14748 count += sub_count;
14749 }
14750
14751 /* There must be no padding. */
14752 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14753 count * GET_MODE_BITSIZE (*modep)))
14754 return -1;
14755
14756 return count;
14757 }
14758
14759 case UNION_TYPE:
14760 case QUAL_UNION_TYPE:
14761 {
14762 /* These aren't very interesting except in a degenerate case. */
14763 int count = 0;
14764 int sub_count;
14765 tree field;
14766
14767 /* Can't handle incomplete types or sizes that are not
14768 fixed. */
14769 if (!COMPLETE_TYPE_P (type)
14770 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14771 return -1;
14772
14773 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14774 {
14775 if (TREE_CODE (field) != FIELD_DECL)
14776 continue;
14777
14778 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14779 if (sub_count < 0)
14780 return -1;
14781 count = count > sub_count ? count : sub_count;
14782 }
14783
14784 /* There must be no padding. */
14785 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14786 count * GET_MODE_BITSIZE (*modep)))
14787 return -1;
14788
14789 return count;
14790 }
14791
14792 default:
14793 break;
14794 }
14795
14796 return -1;
14797 }
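
/* Some illustrative results, assuming the usual type/mode mappings:

     struct { float x, y, z; }       ->  3, *modep == SFmode
     _Complex double                 ->  2, *modep == DFmode
     double[4]                       ->  4, *modep == DFmode
     struct { float32x4_t a, b; }    ->  2, *modep == V4SImode (all 128-bit
                                         vectors map to the V4SI stand-in)
     struct { double d; float f; }   -> -1  (mixed element modes)
     struct { float f; int i; }      -> -1  (non-floating-point member)  */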
14798
14799 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14800 type as described in AAPCS64 \S 4.1.2.
14801
14802 See the comment above aarch64_composite_type_p for the notes on MODE. */
14803
14804 static bool
14805 aarch64_short_vector_p (const_tree type,
14806 machine_mode mode)
14807 {
14808 poly_int64 size = -1;
14809
14810 if (type && TREE_CODE (type) == VECTOR_TYPE)
14811 size = int_size_in_bytes (type);
14812 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14813 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14814 size = GET_MODE_SIZE (mode);
14815
14816 return known_eq (size, 8) || known_eq (size, 16);
14817 }
14818
14819 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14820 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14821 array types. The C99 floating-point complex types are also considered
14822 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14823 types, which are GCC extensions and out of the scope of AAPCS64, are
14824 treated as composite types here as well.
14825
14826 Note that MODE itself is not sufficient in determining whether a type
14827 is such a composite type or not. This is because
14828 stor-layout.c:compute_record_mode may have already changed the MODE
14829 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14830 structure with only one field may have its MODE set to the mode of the
14831 field. Also an integer mode whose size matches the size of the
14832 RECORD_TYPE type may be used to substitute the original mode
14833 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14834 solely relied on. */
14835
14836 static bool
14837 aarch64_composite_type_p (const_tree type,
14838 machine_mode mode)
14839 {
14840 if (aarch64_short_vector_p (type, mode))
14841 return false;
14842
14843 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14844 return true;
14845
14846 if (mode == BLKmode
14847 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14848 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14849 return true;
14850
14851 return false;
14852 }
14853
14854 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14855 shall be passed or returned in simd/fp register(s) (provided these
14856 parameter passing registers are available).
14857
14858 Upon successful return, *COUNT returns the number of needed registers,
14859 *BASE_MODE returns the mode of the individual register and, when IS_HA
14860 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14861 floating-point aggregate or a homogeneous short-vector aggregate. */
14862
14863 static bool
14864 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14865 const_tree type,
14866 machine_mode *base_mode,
14867 int *count,
14868 bool *is_ha)
14869 {
14870 machine_mode new_mode = VOIDmode;
14871 bool composite_p = aarch64_composite_type_p (type, mode);
14872
14873 if (is_ha != NULL) *is_ha = false;
14874
14875 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14876 || aarch64_short_vector_p (type, mode))
14877 {
14878 *count = 1;
14879 new_mode = mode;
14880 }
14881 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14882 {
14883 if (is_ha != NULL) *is_ha = true;
14884 *count = 2;
14885 new_mode = GET_MODE_INNER (mode);
14886 }
14887 else if (type && composite_p)
14888 {
14889 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14890
14891 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14892 {
14893 if (is_ha != NULL) *is_ha = true;
14894 *count = ag_count;
14895 }
14896 else
14897 return false;
14898 }
14899 else
14900 return false;
14901
14902 *base_mode = new_mode;
14903 return true;
14904 }
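
/* A few examples of the resulting classification (illustrative only):

     double                      ->  true,  *count = 1, *base_mode = DFmode
     _Complex float              ->  true,  *count = 2, *base_mode = SFmode,
                                     *is_ha = true
     struct { float f[4]; }      ->  true,  *count = 4, *base_mode = SFmode,
                                     *is_ha = true   (an HFA)
     struct { float f[5]; }      ->  false  (exceeds HA_MAX_NUM_FLDS)
     struct { double d; int i; } ->  false  (not homogeneous)  */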
14905
14906 /* Implement TARGET_STRUCT_VALUE_RTX. */
14907
14908 static rtx
14909 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14910 int incoming ATTRIBUTE_UNUSED)
14911 {
14912 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14913 }
14914
14915 /* Implements target hook vector_mode_supported_p. */
14916 static bool
14917 aarch64_vector_mode_supported_p (machine_mode mode)
14918 {
14919 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14920 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14921 }
14922
14923 /* Return the full-width SVE vector mode for element mode MODE, if one
14924 exists. */
14925 opt_machine_mode
14926 aarch64_full_sve_mode (scalar_mode mode)
14927 {
14928 switch (mode)
14929 {
14930 case E_DFmode:
14931 return VNx2DFmode;
14932 case E_SFmode:
14933 return VNx4SFmode;
14934 case E_HFmode:
14935 return VNx8HFmode;
14936 case E_DImode:
14937 return VNx2DImode;
14938 case E_SImode:
14939 return VNx4SImode;
14940 case E_HImode:
14941 return VNx8HImode;
14942 case E_QImode:
14943 return VNx16QImode;
14944 default:
14945 return opt_machine_mode ();
14946 }
14947 }
14948
14949 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14950 if it exists. */
14951 opt_machine_mode
14952 aarch64_vq_mode (scalar_mode mode)
14953 {
14954 switch (mode)
14955 {
14956 case E_DFmode:
14957 return V2DFmode;
14958 case E_SFmode:
14959 return V4SFmode;
14960 case E_HFmode:
14961 return V8HFmode;
14962 case E_SImode:
14963 return V4SImode;
14964 case E_HImode:
14965 return V8HImode;
14966 case E_QImode:
14967 return V16QImode;
14968 case E_DImode:
14969 return V2DImode;
14970 default:
14971 return opt_machine_mode ();
14972 }
14973 }
14974
14975 /* Return the appropriate SIMD container mode
14976 for MODE within a vector of WIDTH bits. */
14977 static machine_mode
14978 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14979 {
14980 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14981 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14982
14983 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14984 if (TARGET_SIMD)
14985 {
14986 if (known_eq (width, 128))
14987 return aarch64_vq_mode (mode).else_mode (word_mode);
14988 else
14989 switch (mode)
14990 {
14991 case E_SFmode:
14992 return V2SFmode;
14993 case E_HFmode:
14994 return V4HFmode;
14995 case E_SImode:
14996 return V2SImode;
14997 case E_HImode:
14998 return V4HImode;
14999 case E_QImode:
15000 return V8QImode;
15001 default:
15002 break;
15003 }
15004 }
15005 return word_mode;
15006 }
15007
15008 /* Return the preferred SIMD mode for MODE: a full SVE vector if SVE is enabled, otherwise the 128-bit container. */
15009 static machine_mode
15010 aarch64_preferred_simd_mode (scalar_mode mode)
15011 {
15012 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
15013 return aarch64_simd_container_mode (mode, bits);
15014 }
15015
15016 /* Return a list of possible vector sizes for the vectorizer
15017 to iterate over. */
15018 static void
15019 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
15020 {
15021 if (TARGET_SVE)
15022 sizes->safe_push (BYTES_PER_SVE_VECTOR);
15023 sizes->safe_push (16);
15024 sizes->safe_push (8);
15025 }
15026
15027 /* Implement TARGET_MANGLE_TYPE. */
15028
15029 static const char *
15030 aarch64_mangle_type (const_tree type)
15031 {
15032 /* The AArch64 ABI documents say that "__va_list" has to be
15033 mangled as if it is in the "std" namespace. */
15034 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
15035 return "St9__va_list";
15036
15037 /* Half-precision float. */
15038 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
15039 return "Dh";
15040
15041 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15042 builtin types. */
15043 if (TYPE_NAME (type) != NULL)
15044 return aarch64_mangle_builtin_type (type);
15045
15046 /* Use the default mangling. */
15047 return NULL;
15048 }
15049
15050 /* Find the first rtx_insn before insn that will generate an assembly
15051 instruction. */
15052
15053 static rtx_insn *
15054 aarch64_prev_real_insn (rtx_insn *insn)
15055 {
15056 if (!insn)
15057 return NULL;
15058
15059 do
15060 {
15061 insn = prev_real_insn (insn);
15062 }
15063 while (insn && recog_memoized (insn) < 0);
15064
15065 return insn;
15066 }
15067
15068 static bool
15069 is_madd_op (enum attr_type t1)
15070 {
15071 unsigned int i;
15072 /* A number of these may be AArch32 only. */
15073 enum attr_type mlatypes[] = {
15074 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15075 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15076 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15077 };
15078
15079 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15080 {
15081 if (t1 == mlatypes[i])
15082 return true;
15083 }
15084
15085 return false;
15086 }
15087
15088 /* Check if there is a register dependency between a load and the insn
15089 for which we hold recog_data. */
15090
15091 static bool
15092 dep_between_memop_and_curr (rtx memop)
15093 {
15094 rtx load_reg;
15095 int opno;
15096
15097 gcc_assert (GET_CODE (memop) == SET);
15098
15099 if (!REG_P (SET_DEST (memop)))
15100 return false;
15101
15102 load_reg = SET_DEST (memop);
15103 for (opno = 1; opno < recog_data.n_operands; opno++)
15104 {
15105 rtx operand = recog_data.operand[opno];
15106 if (REG_P (operand)
15107 && reg_overlap_mentioned_p (load_reg, operand))
15108 return true;
15109
15110 }
15111 return false;
15112 }
15113
15114
15115 /* When working around the Cortex-A53 erratum 835769,
15116 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15117 instruction and has a preceding memory instruction such that a NOP
15118 should be inserted between them. */
15119
15120 bool
15121 aarch64_madd_needs_nop (rtx_insn* insn)
15122 {
15123 enum attr_type attr_type;
15124 rtx_insn *prev;
15125 rtx body;
15126
15127 if (!TARGET_FIX_ERR_A53_835769)
15128 return false;
15129
15130 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15131 return false;
15132
15133 attr_type = get_attr_type (insn);
15134 if (!is_madd_op (attr_type))
15135 return false;
15136
15137 prev = aarch64_prev_real_insn (insn);
15138 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15139 Restore recog state to INSN to avoid state corruption. */
15140 extract_constrain_insn_cached (insn);
15141
15142 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15143 return false;
15144
15145 body = single_set (prev);
15146
15147 /* If the previous insn is a memory op and there is no dependency between
15148 it and the DImode madd, emit a NOP between them. If body is NULL then we
15149 have a complex memory operation, probably a load/store pair.
15150 Be conservative for now and emit a NOP. */
15151 if (GET_MODE (recog_data.operand[0]) == DImode
15152 && (!body || !dep_between_memop_and_curr (body)))
15153 return true;
15154
15155 return false;
15156
15157 }
15158
15159
15160 /* Implement FINAL_PRESCAN_INSN. */
15161
15162 void
15163 aarch64_final_prescan_insn (rtx_insn *insn)
15164 {
15165 if (aarch64_madd_needs_nop (insn))
15166 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15167 }
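
/* For example, with -mfix-cortex-a53-835769 a sequence such as

       ldr  x2, [x1, #8]
       madd x0, x3, x4, x5

   is printed as

       ldr  x2, [x1, #8]
       nop  // between mem op and mult-accumulate
       madd x0, x3, x4, x5

   (the register choice here is arbitrary; only the adjacency of a memory
   access and a 64-bit multiply-accumulate matters).  */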
15168
15169
15170 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15171 instruction. */
15172
15173 bool
15174 aarch64_sve_index_immediate_p (rtx base_or_step)
15175 {
15176 return (CONST_INT_P (base_or_step)
15177 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15178 }
15179
15180 /* Return true if X is a valid immediate for the SVE ADD and SUB
15181 instructions. Negate X first if NEGATE_P is true. */
15182
15183 bool
15184 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15185 {
15186 rtx elt;
15187
15188 if (!const_vec_duplicate_p (x, &elt)
15189 || !CONST_INT_P (elt))
15190 return false;
15191
15192 HOST_WIDE_INT val = INTVAL (elt);
15193 if (negate_p)
15194 val = -val;
15195 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15196
15197 if (val & 0xff)
15198 return IN_RANGE (val, 0, 0xff);
15199 return IN_RANGE (val, 0, 0xff00);
15200 }
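
/* For example, for a .s ADD (values shown after any negation):

     #0 .. #255                 valid   (8-bit immediate, no shift)
     #256, #512, ... #65280     valid   (multiples of 256, i.e. #imm8, LSL #8)
     #257                       invalid (needs both byte positions)
     #65536                     invalid (does not fit either form)  */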
15201
15202 /* Return true if X is a valid immediate operand for an SVE logical
15203 instruction such as AND. */
15204
15205 bool
15206 aarch64_sve_bitmask_immediate_p (rtx x)
15207 {
15208 rtx elt;
15209
15210 return (const_vec_duplicate_p (x, &elt)
15211 && CONST_INT_P (elt)
15212 && aarch64_bitmask_imm (INTVAL (elt),
15213 GET_MODE_INNER (GET_MODE (x))));
15214 }
15215
15216 /* Return true if X is a valid immediate for the SVE DUP and CPY
15217 instructions. */
15218
15219 bool
15220 aarch64_sve_dup_immediate_p (rtx x)
15221 {
15222 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15223 if (!CONST_INT_P (x))
15224 return false;
15225
15226 HOST_WIDE_INT val = INTVAL (x);
15227 if (val & 0xff)
15228 return IN_RANGE (val, -0x80, 0x7f);
15229 return IN_RANGE (val, -0x8000, 0x7f00);
15230 }
15231
15232 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15233 SIGNED_P says whether the operand is signed rather than unsigned. */
15234
15235 bool
15236 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15237 {
15238 rtx elt;
15239
15240 return (const_vec_duplicate_p (x, &elt)
15241 && CONST_INT_P (elt)
15242 && (signed_p
15243 ? IN_RANGE (INTVAL (elt), -16, 15)
15244 : IN_RANGE (INTVAL (elt), 0, 127)));
15245 }
15246
15247 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15248 instruction. Negate X first if NEGATE_P is true. */
15249
15250 bool
15251 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15252 {
15253 rtx elt;
15254 REAL_VALUE_TYPE r;
15255
15256 if (!const_vec_duplicate_p (x, &elt)
15257 || GET_CODE (elt) != CONST_DOUBLE)
15258 return false;
15259
15260 r = *CONST_DOUBLE_REAL_VALUE (elt);
15261
15262 if (negate_p)
15263 r = real_value_negate (&r);
15264
15265 if (real_equal (&r, &dconst1))
15266 return true;
15267 if (real_equal (&r, &dconsthalf))
15268 return true;
15269 return false;
15270 }
15271
15272 /* Return true if X is a valid immediate operand for an SVE FMUL
15273 instruction. */
15274
15275 bool
15276 aarch64_sve_float_mul_immediate_p (rtx x)
15277 {
15278 rtx elt;
15279
15280 return (const_vec_duplicate_p (x, &elt)
15281 && GET_CODE (elt) == CONST_DOUBLE
15282 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15283 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15284 }
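
/* So the only floating-point immediates these predicated SVE forms keep are

       fadd z0.s, p0/m, z0.s, #0.5       fadd z0.s, p0/m, z0.s, #1.0
       fmul z0.s, p0/m, z0.s, #0.5       fmul z0.s, p0/m, z0.s, #2.0

   (plus the fsub equivalents); any other constant has to be broadcast into
   a vector register first.  */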
15285
15286 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15287 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15288 is nonnull, use it to describe valid immediates. */
15289 static bool
15290 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15291 simd_immediate_info *info,
15292 enum simd_immediate_check which,
15293 simd_immediate_info::insn_type insn)
15294 {
15295 /* Try a 4-byte immediate with LSL. */
15296 for (unsigned int shift = 0; shift < 32; shift += 8)
15297 if ((val32 & (0xff << shift)) == val32)
15298 {
15299 if (info)
15300 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15301 simd_immediate_info::LSL, shift);
15302 return true;
15303 }
15304
15305 /* Try a 2-byte immediate with LSL. */
15306 unsigned int imm16 = val32 & 0xffff;
15307 if (imm16 == (val32 >> 16))
15308 for (unsigned int shift = 0; shift < 16; shift += 8)
15309 if ((imm16 & (0xff << shift)) == imm16)
15310 {
15311 if (info)
15312 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15313 simd_immediate_info::LSL, shift);
15314 return true;
15315 }
15316
15317 /* Try a 4-byte immediate with MSL, except for cases that MVN
15318 can handle. */
15319 if (which == AARCH64_CHECK_MOV)
15320 for (unsigned int shift = 8; shift < 24; shift += 8)
15321 {
15322 unsigned int low = (1 << shift) - 1;
15323 if (((val32 & (0xff << shift)) | low) == val32)
15324 {
15325 if (info)
15326 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15327 simd_immediate_info::MSL, shift);
15328 return true;
15329 }
15330 }
15331
15332 return false;
15333 }
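
/* A few concrete 32-bit patterns and the encodings they allow (illustrative;
   WHICH and INSN select between the MOVI/MVNI and ORR/BIC forms):

     0x000000ab   ->  MOVI Vd.4S, #0xab
     0x0000ab00   ->  MOVI Vd.4S, #0xab, LSL #8
     0x00ab00ab   ->  MOVI Vd.8H, #0xab              (2-byte replication)
     0x0000abff   ->  MOVI Vd.4S, #0xab, MSL #8      (ones shifted in below)
     0x12345678   ->  not representable by this routine.  */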
15334
15335 /* Return true if replicating VAL64 is a valid immediate for the
15336 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15337 use it to describe valid immediates. */
15338 static bool
15339 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15340 simd_immediate_info *info,
15341 enum simd_immediate_check which)
15342 {
15343 unsigned int val32 = val64 & 0xffffffff;
15344 unsigned int val16 = val64 & 0xffff;
15345 unsigned int val8 = val64 & 0xff;
15346
15347 if (val32 == (val64 >> 32))
15348 {
15349 if ((which & AARCH64_CHECK_ORR) != 0
15350 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15351 simd_immediate_info::MOV))
15352 return true;
15353
15354 if ((which & AARCH64_CHECK_BIC) != 0
15355 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15356 simd_immediate_info::MVN))
15357 return true;
15358
15359 /* Try using a replicated byte. */
15360 if (which == AARCH64_CHECK_MOV
15361 && val16 == (val32 >> 16)
15362 && val8 == (val16 >> 8))
15363 {
15364 if (info)
15365 *info = simd_immediate_info (QImode, val8);
15366 return true;
15367 }
15368 }
15369
15370 /* Try using a bit-to-bytemask. */
15371 if (which == AARCH64_CHECK_MOV)
15372 {
15373 unsigned int i;
15374 for (i = 0; i < 64; i += 8)
15375 {
15376 unsigned char byte = (val64 >> i) & 0xff;
15377 if (byte != 0 && byte != 0xff)
15378 break;
15379 }
15380 if (i == 64)
15381 {
15382 if (info)
15383 *info = simd_immediate_info (DImode, val64);
15384 return true;
15385 }
15386 }
15387 return false;
15388 }
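
/* Beyond the replicated 32-bit forms handled above, the two 64-bit cases
   look like (illustrative):

     0xabababababababab  ->  MOVI Vd.16B, #0xab       (replicated byte)
     0x00ff0000ffff00ff  ->  MOVI Vd.2D, #...         (bit-to-bytemask: every
                                                       byte is 0x00 or 0xff)
     0x0123456789abcdef  ->  not a valid immediate; such constants are
                             materialized some other way.  */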
15389
15390 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15391 instruction. If INFO is nonnull, use it to describe valid immediates. */
15392
15393 static bool
15394 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15395 simd_immediate_info *info)
15396 {
15397 scalar_int_mode mode = DImode;
15398 unsigned int val32 = val64 & 0xffffffff;
15399 if (val32 == (val64 >> 32))
15400 {
15401 mode = SImode;
15402 unsigned int val16 = val32 & 0xffff;
15403 if (val16 == (val32 >> 16))
15404 {
15405 mode = HImode;
15406 unsigned int val8 = val16 & 0xff;
15407 if (val8 == (val16 >> 8))
15408 mode = QImode;
15409 }
15410 }
15411 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15412 if (IN_RANGE (val, -0x80, 0x7f))
15413 {
15414 /* DUP with no shift. */
15415 if (info)
15416 *info = simd_immediate_info (mode, val);
15417 return true;
15418 }
15419 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15420 {
15421 /* DUP with LSL #8. */
15422 if (info)
15423 *info = simd_immediate_info (mode, val);
15424 return true;
15425 }
15426 if (aarch64_bitmask_imm (val64, mode))
15427 {
15428 /* DUPM. */
15429 if (info)
15430 *info = simd_immediate_info (mode, val);
15431 return true;
15432 }
15433 return false;
15434 }
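
/* Examples of the three forms, for a VNx4SI (.s) constant (illustrative):

     all elements 0x0000007f  ->  DUP  z0.s, #127
     all elements 0x00001100  ->  DUP  z0.s, #17, LSL #8
     all elements 0x0000ffff  ->  DUPM z0.s, #0xffff    (bitmask immediate)
     all elements 0x12345678  ->  not valid here; loaded by other means.  */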
15435
15436 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15437 it to describe valid immediates. */
15438
15439 static bool
15440 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15441 {
15442 if (x == CONST0_RTX (GET_MODE (x)))
15443 {
15444 if (info)
15445 *info = simd_immediate_info (DImode, 0);
15446 return true;
15447 }
15448
15449 /* Analyze the value as a VNx16BImode. This should be relatively
15450 efficient, since rtx_vector_builder has enough built-in capacity
15451 to store all VLA predicate constants without needing the heap. */
15452 rtx_vector_builder builder;
15453 if (!aarch64_get_sve_pred_bits (builder, x))
15454 return false;
15455
15456 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15457 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15458 {
15459 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15460 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15461 if (pattern != AARCH64_NUM_SVPATTERNS)
15462 {
15463 if (info)
15464 {
15465 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15466 *info = simd_immediate_info (int_mode, pattern);
15467 }
15468 return true;
15469 }
15470 }
15471 return false;
15472 }
15473
15474 /* Return true if OP is a valid SIMD immediate for the operation
15475 described by WHICH. If INFO is nonnull, use it to describe valid
15476 immediates. */
15477 bool
15478 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15479 enum simd_immediate_check which)
15480 {
15481 machine_mode mode = GET_MODE (op);
15482 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15483 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15484 return false;
15485
15486 if (vec_flags & VEC_SVE_PRED)
15487 return aarch64_sve_pred_valid_immediate (op, info);
15488
15489 scalar_mode elt_mode = GET_MODE_INNER (mode);
15490 rtx base, step;
15491 unsigned int n_elts;
15492 if (GET_CODE (op) == CONST_VECTOR
15493 && CONST_VECTOR_DUPLICATE_P (op))
15494 n_elts = CONST_VECTOR_NPATTERNS (op);
15495 else if ((vec_flags & VEC_SVE_DATA)
15496 && const_vec_series_p (op, &base, &step))
15497 {
15498 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15499 if (!aarch64_sve_index_immediate_p (base)
15500 || !aarch64_sve_index_immediate_p (step))
15501 return false;
15502
15503 if (info)
15504 *info = simd_immediate_info (elt_mode, base, step);
15505 return true;
15506 }
15507 else if (GET_CODE (op) == CONST_VECTOR
15508 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15509 /* N_ELTS set above. */;
15510 else
15511 return false;
15512
15513 scalar_float_mode elt_float_mode;
15514 if (n_elts == 1
15515 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15516 {
15517 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15518 if (aarch64_float_const_zero_rtx_p (elt)
15519 || aarch64_float_const_representable_p (elt))
15520 {
15521 if (info)
15522 *info = simd_immediate_info (elt_float_mode, elt);
15523 return true;
15524 }
15525 }
15526
15527 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15528 if (elt_size > 8)
15529 return false;
15530
15531 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15532
15533 /* Expand the vector constant out into a byte vector, with the least
15534 significant byte of the register first. */
15535 auto_vec<unsigned char, 16> bytes;
15536 bytes.reserve (n_elts * elt_size);
15537 for (unsigned int i = 0; i < n_elts; i++)
15538 {
15539 /* The vector is provided in gcc endian-neutral fashion.
15540 For aarch64_be Advanced SIMD, it must be laid out in the vector
15541 register in reverse order. */
15542 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15543 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15544
15545 if (elt_mode != elt_int_mode)
15546 elt = gen_lowpart (elt_int_mode, elt);
15547
15548 if (!CONST_INT_P (elt))
15549 return false;
15550
15551 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15552 for (unsigned int byte = 0; byte < elt_size; byte++)
15553 {
15554 bytes.quick_push (elt_val & 0xff);
15555 elt_val >>= BITS_PER_UNIT;
15556 }
15557 }
15558
15559 /* The immediate must repeat every eight bytes. */
15560 unsigned int nbytes = bytes.length ();
15561 for (unsigned i = 8; i < nbytes; ++i)
15562 if (bytes[i] != bytes[i - 8])
15563 return false;
15564
15565 /* Get the repeating 8-byte value as an integer. No endian correction
15566 is needed here because bytes is already in lsb-first order. */
15567 unsigned HOST_WIDE_INT val64 = 0;
15568 for (unsigned int i = 0; i < 8; i++)
15569 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15570 << (i * BITS_PER_UNIT));
15571
15572 if (vec_flags & VEC_SVE_DATA)
15573 return aarch64_sve_valid_immediate (val64, info);
15574 else
15575 return aarch64_advsimd_valid_immediate (val64, info, which);
15576 }
15577
15578 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15579 has a step in the range of the SVE INDEX instruction. Return the step if so,
15580 otherwise return null. */
15581 rtx
15582 aarch64_check_zero_based_sve_index_immediate (rtx x)
15583 {
15584 rtx base, step;
15585 if (const_vec_series_p (x, &base, &step)
15586 && base == const0_rtx
15587 && aarch64_sve_index_immediate_p (step))
15588 return step;
15589 return NULL_RTX;
15590 }
15591
15592 /* Check if immediate shift constants are within range. */
15593 bool
15594 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15595 {
15596 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15597 if (left)
15598 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15599 else
15600 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15601 }
15602
15603 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15604 operation of width WIDTH at bit position POS. */
15605
15606 rtx
15607 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15608 {
15609 gcc_assert (CONST_INT_P (width));
15610 gcc_assert (CONST_INT_P (pos));
15611
15612 unsigned HOST_WIDE_INT mask
15613 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15614 return GEN_INT (mask << UINTVAL (pos));
15615 }
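
/* For instance, WIDTH = 8 and POS = 16 give ((1 << 8) - 1) << 16, i.e.
   0x00ff0000, the mask selected by a zero_extract of 8 bits starting at
   bit 16.  */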
15616
15617 bool
15618 aarch64_mov_operand_p (rtx x, machine_mode mode)
15619 {
15620 if (GET_CODE (x) == HIGH
15621 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15622 return true;
15623
15624 if (CONST_INT_P (x))
15625 return true;
15626
15627 if (VECTOR_MODE_P (GET_MODE (x)))
15628 {
15629 /* Require predicate constants to be VNx16BI before RA, so that we
15630 force everything to have a canonical form. */
15631 if (!lra_in_progress
15632 && !reload_completed
15633 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15634 && GET_MODE (x) != VNx16BImode)
15635 return false;
15636
15637 return aarch64_simd_valid_immediate (x, NULL);
15638 }
15639
15640 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15641 return true;
15642
15643 if (aarch64_sve_cnt_immediate_p (x))
15644 return true;
15645
15646 return aarch64_classify_symbolic_expression (x)
15647 == SYMBOL_TINY_ABSOLUTE;
15648 }
15649
15650 /* Return a const_int vector of VAL. */
15651 rtx
15652 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15653 {
15654 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15655 return gen_const_vec_duplicate (mode, c);
15656 }
15657
15658 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15659
15660 bool
15661 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15662 {
15663 machine_mode vmode;
15664
15665 vmode = aarch64_simd_container_mode (mode, 64);
15666 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15667 return aarch64_simd_valid_immediate (op_v, NULL);
15668 }
15669
15670 /* Construct and return a PARALLEL RTX vector with elements numbering the
15671 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15672 the vector - from the perspective of the architecture. This does not
15673 line up with GCC's perspective on lane numbers, so we end up with
15674 different masks depending on our target endian-ness. The diagram
15675 below may help. We must draw the distinction when building masks
15676 which select one half of the vector. An instruction selecting
15677 architectural low-lanes for a big-endian target, must be described using
15678 a mask selecting GCC high-lanes.
15679
15680 Big-Endian Little-Endian
15681
15682 GCC 0 1 2 3 3 2 1 0
15683 | x | x | x | x | | x | x | x | x |
15684 Architecture 3 2 1 0 3 2 1 0
15685
15686 Low Mask: { 2, 3 } { 0, 1 }
15687 High Mask: { 0, 1 } { 2, 3 }
15688
15689 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15690
15691 rtx
15692 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15693 {
15694 rtvec v = rtvec_alloc (nunits / 2);
15695 int high_base = nunits / 2;
15696 int low_base = 0;
15697 int base;
15698 rtx t1;
15699 int i;
15700
15701 if (BYTES_BIG_ENDIAN)
15702 base = high ? low_base : high_base;
15703 else
15704 base = high ? high_base : low_base;
15705
15706 for (i = 0; i < nunits / 2; i++)
15707 RTVEC_ELT (v, i) = GEN_INT (base + i);
15708
15709 t1 = gen_rtx_PARALLEL (mode, v);
15710 return t1;
15711 }
15712
15713 /* Check OP for validity as a PARALLEL RTX vector with elements
15714 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15715 from the perspective of the architecture. See the diagram above
15716 aarch64_simd_vect_par_cnst_half for more details. */
15717
15718 bool
15719 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15720 bool high)
15721 {
15722 int nelts;
15723 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15724 return false;
15725
15726 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15727 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15728 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15729 int i = 0;
15730
15731 if (count_op != count_ideal)
15732 return false;
15733
15734 for (i = 0; i < count_ideal; i++)
15735 {
15736 rtx elt_op = XVECEXP (op, 0, i);
15737 rtx elt_ideal = XVECEXP (ideal, 0, i);
15738
15739 if (!CONST_INT_P (elt_op)
15740 || INTVAL (elt_ideal) != INTVAL (elt_op))
15741 return false;
15742 }
15743 return true;
15744 }
15745
15746 /* Return a PARALLEL containing NELTS elements, with element I equal
15747 to BASE + I * STEP. */
15748
15749 rtx
15750 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15751 {
15752 rtvec vec = rtvec_alloc (nelts);
15753 for (unsigned int i = 0; i < nelts; ++i)
15754 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15755 return gen_rtx_PARALLEL (VOIDmode, vec);
15756 }
15757
15758 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15759 series with step STEP. */
15760
15761 bool
15762 aarch64_stepped_int_parallel_p (rtx op, int step)
15763 {
15764 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15765 return false;
15766
15767 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15768 for (int i = 1; i < XVECLEN (op, 0); ++i)
15769 if (!CONST_INT_P (XVECEXP (op, 0, i))
15770 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15771 return false;
15772
15773 return true;
15774 }
15775
15776 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15777 HIGH (exclusive). */
15778 void
15779 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15780 const_tree exp)
15781 {
15782 HOST_WIDE_INT lane;
15783 gcc_assert (CONST_INT_P (operand));
15784 lane = INTVAL (operand);
15785
15786 if (lane < low || lane >= high)
15787 {
15788 if (exp)
15789 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15790 else
15791 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15792 }
15793 }
15794
15795 /* Perform endian correction on lane number N, which indexes a vector
15796 of mode MODE, and return the result as an SImode rtx. */
15797
15798 rtx
15799 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15800 {
15801 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15802 }
15803
15804 /* Return TRUE if OP is a valid vector addressing mode. */
15805
15806 bool
15807 aarch64_simd_mem_operand_p (rtx op)
15808 {
15809 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15810 || REG_P (XEXP (op, 0)));
15811 }
15812
15813 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15814
15815 bool
15816 aarch64_sve_ld1r_operand_p (rtx op)
15817 {
15818 struct aarch64_address_info addr;
15819 scalar_mode mode;
15820
15821 return (MEM_P (op)
15822 && is_a <scalar_mode> (GET_MODE (op), &mode)
15823 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15824 && addr.type == ADDRESS_REG_IMM
15825 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15826 }
15827
15828 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15829 bool
15830 aarch64_sve_ld1rq_operand_p (rtx op)
15831 {
15832 struct aarch64_address_info addr;
15833 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15834 if (!MEM_P (op)
15835 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15836 return false;
15837
15838 if (addr.type == ADDRESS_REG_IMM)
15839 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15840
15841 if (addr.type == ADDRESS_REG_REG)
15842 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15843
15844 return false;
15845 }
15846
15847 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15848 The conditions for STR are the same. */
15849 bool
15850 aarch64_sve_ldr_operand_p (rtx op)
15851 {
15852 struct aarch64_address_info addr;
15853
15854 return (MEM_P (op)
15855 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15856 false, ADDR_QUERY_ANY)
15857 && addr.type == ADDRESS_REG_IMM);
15858 }
15859
15860 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15861 We need to be able to access the individual pieces, so the range
15862 is different from LD[234] and ST[234]. */
15863 bool
15864 aarch64_sve_struct_memory_operand_p (rtx op)
15865 {
15866 if (!MEM_P (op))
15867 return false;
15868
15869 machine_mode mode = GET_MODE (op);
15870 struct aarch64_address_info addr;
15871 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15872 ADDR_QUERY_ANY)
15873 || addr.type != ADDRESS_REG_IMM)
15874 return false;
15875
15876 poly_int64 first = addr.const_offset;
15877 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15878 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15879 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15880 }
15881
15882 /* Emit a register copy from operands[1] to operands[0], taking care not to
15883 early-clobber source registers in the process.
15884
15885 COUNT is the number of components into which the copy needs to be
15886 decomposed. */
15887 void
15888 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15889 unsigned int count)
15890 {
15891 unsigned int i;
15892 int rdest = REGNO (operands[0]);
15893 int rsrc = REGNO (operands[1]);
15894
15895 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15896 || rdest < rsrc)
15897 for (i = 0; i < count; i++)
15898 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15899 gen_rtx_REG (mode, rsrc + i));
15900 else
15901 for (i = 0; i < count; i++)
15902 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15903 gen_rtx_REG (mode, rsrc + count - i - 1));
15904 }
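
/* For example, copying an OImode (two-vector) value from v1:v2 to v2:v3 has
   to be emitted highest register first,

       mov v3.16b, v2.16b
       mov v2.16b, v1.16b

   whereas the copy from v2:v3 to v1:v2 can be emitted in forward order; the
   loop above chooses the direction from the relative register numbers.  */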
15905
15906 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15907 one of VSTRUCT modes: OI, CI, or XI. */
15908 int
15909 aarch64_simd_attr_length_rglist (machine_mode mode)
15910 {
15911 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15912 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15913 }
15914
15915 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15916 alignment of a vector to 128 bits. SVE predicates have an alignment of
15917 16 bits. */
15918 static HOST_WIDE_INT
15919 aarch64_simd_vector_alignment (const_tree type)
15920 {
15921 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15922 be set for non-predicate vectors of booleans. Modes are the most
15923 direct way we have of identifying real SVE predicate types. */
15924 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
15925 return 16;
15926 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15927 return 128;
15928 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15929 }
15930
15931 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15932 static poly_uint64
15933 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15934 {
15935 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15936 {
15937 /* If the length of the vector is fixed, try to align to that length,
15938 otherwise don't try to align at all. */
15939 HOST_WIDE_INT result;
15940 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15941 result = TYPE_ALIGN (TREE_TYPE (type));
15942 return result;
15943 }
15944 return TYPE_ALIGN (type);
15945 }
15946
15947 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15948 static bool
15949 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15950 {
15951 if (is_packed)
15952 return false;
15953
15954 /* For fixed-length vectors, check that the vectorizer will aim for
15955 full-vector alignment. This isn't true for generic GCC vectors
15956 that are wider than the ABI maximum of 128 bits. */
15957 poly_uint64 preferred_alignment =
15958 aarch64_vectorize_preferred_vector_alignment (type);
15959 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15960 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15961 preferred_alignment))
15962 return false;
15963
15964 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15965 return true;
15966 }
15967
15968 /* Return true if the vector misalignment factor is supported by the
15969 target. */
15970 static bool
15971 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15972 const_tree type, int misalignment,
15973 bool is_packed)
15974 {
15975 if (TARGET_SIMD && STRICT_ALIGNMENT)
15976 {
15977 /* Return false if the movmisalign pattern is not supported for this mode. */
15978 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15979 return false;
15980
15981 /* Misalignment factor is unknown at compile time. */
15982 if (misalignment == -1)
15983 return false;
15984 }
15985 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15986 is_packed);
15987 }
15988
15989 /* If VALS is a vector constant that can be loaded into a register
15990 using DUP, generate instructions to do so and return an RTX to
15991 assign to the register. Otherwise return NULL_RTX. */
15992 static rtx
15993 aarch64_simd_dup_constant (rtx vals)
15994 {
15995 machine_mode mode = GET_MODE (vals);
15996 machine_mode inner_mode = GET_MODE_INNER (mode);
15997 rtx x;
15998
15999 if (!const_vec_duplicate_p (vals, &x))
16000 return NULL_RTX;
16001
16002 /* We can load this constant by using DUP and a constant in a
16003 single ARM register. This will be cheaper than a vector
16004 load. */
16005 x = copy_to_mode_reg (inner_mode, x);
16006 return gen_vec_duplicate (mode, x);
16007 }
16008
16009
16010 /* Generate code to load VALS, which is a PARALLEL containing only
16011 constants (for vec_init) or CONST_VECTOR, efficiently into a
16012 register. Returns an RTX to copy into the register, or NULL_RTX
16013 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
16014 static rtx
16015 aarch64_simd_make_constant (rtx vals)
16016 {
16017 machine_mode mode = GET_MODE (vals);
16018 rtx const_dup;
16019 rtx const_vec = NULL_RTX;
16020 int n_const = 0;
16021 int i;
16022
16023 if (GET_CODE (vals) == CONST_VECTOR)
16024 const_vec = vals;
16025 else if (GET_CODE (vals) == PARALLEL)
16026 {
16027 /* A CONST_VECTOR must contain only CONST_INTs and
16028 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16029 Only store valid constants in a CONST_VECTOR. */
16030 int n_elts = XVECLEN (vals, 0);
16031 for (i = 0; i < n_elts; ++i)
16032 {
16033 rtx x = XVECEXP (vals, 0, i);
16034 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16035 n_const++;
16036 }
16037 if (n_const == n_elts)
16038 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
16039 }
16040 else
16041 gcc_unreachable ();
16042
16043 if (const_vec != NULL_RTX
16044 && aarch64_simd_valid_immediate (const_vec, NULL))
16045 /* Load using MOVI/MVNI. */
16046 return const_vec;
16047 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
16048 /* Loaded using DUP. */
16049 return const_dup;
16050 else if (const_vec != NULL_RTX)
16051 /* Load from constant pool. We cannot take advantage of single-cycle
16052 LD1 because we need a PC-relative addressing mode. */
16053 return const_vec;
16054 else
16055 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16056 We cannot construct an initializer. */
16057 return NULL_RTX;
16058 }
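
/* Illustrative summary (added for exposition; the exact assembly depends on
   the mode and on aarch64_simd_valid_immediate): for V4SImode the three
   outcomes above roughly correspond to

     { 1, 1, 1, 1 }                  ->  movi  v0.4s, #1        (valid immediate)
     { x, x, x, x }, x not encodable ->  mov   w0, #x
                                         dup   v0.4s, w0        (DUP path)
     { 1, 2, 3, 4 }                  ->  ldr   q0, .LCn         (constant pool)  */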
16059
16060 /* Expand a vector initialisation sequence, such that TARGET is
16061 initialised to contain VALS. */
16062
16063 void
16064 aarch64_expand_vector_init (rtx target, rtx vals)
16065 {
16066 machine_mode mode = GET_MODE (target);
16067 scalar_mode inner_mode = GET_MODE_INNER (mode);
16068 /* The number of vector elements. */
16069 int n_elts = XVECLEN (vals, 0);
16070 /* The number of vector elements which are not constant. */
16071 int n_var = 0;
16072 rtx any_const = NULL_RTX;
16073 /* The first element of vals. */
16074 rtx v0 = XVECEXP (vals, 0, 0);
16075 bool all_same = true;
16076
16077 /* This is a special vec_init<M><N> where N is not an element mode but a
16078 vector mode with half the elements of M. We expect to find two entries
16079 of mode N in VALS and we must put their concatenation into TARGET. */
16080 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16081 {
16082 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16083 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16084 rtx lo = XVECEXP (vals, 0, 0);
16085 rtx hi = XVECEXP (vals, 0, 1);
16086 machine_mode narrow_mode = GET_MODE (lo);
16087 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16088 gcc_assert (narrow_mode == GET_MODE (hi));
16089
16090 /* When we want to concatenate a half-width vector with zeroes we can
16091 use the aarch64_combinez[_be] patterns. Just make sure that the
16092 zeroes are in the right half. */
16093 if (BYTES_BIG_ENDIAN
16094 && aarch64_simd_imm_zero (lo, narrow_mode)
16095 && general_operand (hi, narrow_mode))
16096 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16097 else if (!BYTES_BIG_ENDIAN
16098 && aarch64_simd_imm_zero (hi, narrow_mode)
16099 && general_operand (lo, narrow_mode))
16100 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16101 else
16102 {
16103 /* Else create the two half-width registers and combine them. */
16104 if (!REG_P (lo))
16105 lo = force_reg (GET_MODE (lo), lo);
16106 if (!REG_P (hi))
16107 hi = force_reg (GET_MODE (hi), hi);
16108
16109 if (BYTES_BIG_ENDIAN)
16110 std::swap (lo, hi);
16111 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16112 }
16113 return;
16114 }
16115
16116 /* Count the number of variable elements to initialise. */
16117 for (int i = 0; i < n_elts; ++i)
16118 {
16119 rtx x = XVECEXP (vals, 0, i);
16120 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16121 ++n_var;
16122 else
16123 any_const = x;
16124
16125 all_same &= rtx_equal_p (x, v0);
16126 }
16127
16128 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16129 how best to handle this. */
16130 if (n_var == 0)
16131 {
16132 rtx constant = aarch64_simd_make_constant (vals);
16133 if (constant != NULL_RTX)
16134 {
16135 emit_move_insn (target, constant);
16136 return;
16137 }
16138 }
16139
16140 /* Splat a single non-constant element if we can. */
16141 if (all_same)
16142 {
16143 rtx x = copy_to_mode_reg (inner_mode, v0);
16144 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16145 return;
16146 }
16147
16148 enum insn_code icode = optab_handler (vec_set_optab, mode);
16149 gcc_assert (icode != CODE_FOR_nothing);
16150
16151 /* If there are only variable elements, try to optimize
16152 the insertion using dup for the most common element
16153 followed by insertions. */
16154
16155 /* The algorithm will fill matches[*][0] with the earliest matching element,
16156 and matches[X][1] with the count of duplicate elements (if X is the
16157 earliest element which has duplicates). */
16158
16159 if (n_var == n_elts && n_elts <= 16)
16160 {
16161 int matches[16][2] = {0};
16162 for (int i = 0; i < n_elts; i++)
16163 {
16164 for (int j = 0; j <= i; j++)
16165 {
16166 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16167 {
16168 matches[i][0] = j;
16169 matches[j][1]++;
16170 break;
16171 }
16172 }
16173 }
16174 int maxelement = 0;
16175 int maxv = 0;
16176 for (int i = 0; i < n_elts; i++)
16177 if (matches[i][1] > maxv)
16178 {
16179 maxelement = i;
16180 maxv = matches[i][1];
16181 }
16182
16183 /* Create a duplicate of the most common element, unless all elements
16184 are equally useless to us, in which case just immediately set the
16185 vector register using the first element. */
16186
16187 if (maxv == 1)
16188 {
16189 /* For vectors of two 64-bit elements, we can do even better. */
16190 if (n_elts == 2
16191 && (inner_mode == E_DImode
16192 || inner_mode == E_DFmode))
16193
16194 {
16195 rtx x0 = XVECEXP (vals, 0, 0);
16196 rtx x1 = XVECEXP (vals, 0, 1);
16197 /* Combine can pick up this case, but handling it directly
16198 here leaves clearer RTL.
16199
16200 This is load_pair_lanes<mode>, and also gives us a clean-up
16201 for store_pair_lanes<mode>. */
16202 if (memory_operand (x0, inner_mode)
16203 && memory_operand (x1, inner_mode)
16204 && !STRICT_ALIGNMENT
16205 && rtx_equal_p (XEXP (x1, 0),
16206 plus_constant (Pmode,
16207 XEXP (x0, 0),
16208 GET_MODE_SIZE (inner_mode))))
16209 {
16210 rtx t;
16211 if (inner_mode == DFmode)
16212 t = gen_load_pair_lanesdf (target, x0, x1);
16213 else
16214 t = gen_load_pair_lanesdi (target, x0, x1);
16215 emit_insn (t);
16216 return;
16217 }
16218 }
16219 /* The subreg-move sequence below will move into lane zero of the
16220 vector register. For big-endian we want that position to hold
16221 the last element of VALS. */
16222 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16223 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16224 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16225 }
16226 else
16227 {
16228 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16229 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16230 }
16231
16232 /* Insert the rest. */
16233 for (int i = 0; i < n_elts; i++)
16234 {
16235 rtx x = XVECEXP (vals, 0, i);
16236 if (matches[i][0] == maxelement)
16237 continue;
16238 x = copy_to_mode_reg (inner_mode, x);
16239 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16240 }
16241 return;
16242 }
16243
16244 /* Initialise a vector which is part-variable. We want to first try
16245 to build those lanes which are constant in the most efficient way we
16246 can. */
16247 if (n_var != n_elts)
16248 {
16249 rtx copy = copy_rtx (vals);
16250
16251 /* Load constant part of vector. We really don't care what goes into the
16252 parts we will overwrite, but we're more likely to be able to load the
16253 constant efficiently if it has fewer, larger, repeating parts
16254 (see aarch64_simd_valid_immediate). */
16255 for (int i = 0; i < n_elts; i++)
16256 {
16257 rtx x = XVECEXP (vals, 0, i);
16258 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16259 continue;
16260 rtx subst = any_const;
16261 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16262 {
16263 /* Look in the copied vector, as earlier iterations may already have made more of its elements constant. */
16264 rtx test = XVECEXP (copy, 0, i ^ bit);
16265 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16266 {
16267 subst = test;
16268 break;
16269 }
16270 }
16271 XVECEXP (copy, 0, i) = subst;
16272 }
16273 aarch64_expand_vector_init (target, copy);
16274 }
16275
16276 /* Insert the variable lanes directly. */
16277 for (int i = 0; i < n_elts; i++)
16278 {
16279 rtx x = XVECEXP (vals, 0, i);
16280 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16281 continue;
16282 x = copy_to_mode_reg (inner_mode, x);
16283 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16284 }
16285 }
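
/* Worked example (illustrative only): initialising a V4SI vector from
   { a, b, a, a }, with a and b non-constant, takes the "dup the most common
   element, then insert the rest" path above and emits roughly

     dup	v0.4s, w_a
     ins	v0.s[1], w_b

   where w_a and w_b stand for whatever registers hold a and b.  A fully
   constant vector would instead go through aarch64_simd_make_constant, and a
   fully duplicated one through the all_same DUP path.  */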
16286
16287 /* Emit RTL corresponding to:
16288 insr TARGET, ELEM. */
16289
16290 static void
16291 emit_insr (rtx target, rtx elem)
16292 {
16293 machine_mode mode = GET_MODE (target);
16294 scalar_mode elem_mode = GET_MODE_INNER (mode);
16295 elem = force_reg (elem_mode, elem);
16296
16297 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16298 gcc_assert (icode != CODE_FOR_nothing);
16299 emit_insn (GEN_FCN (icode) (target, target, elem));
16300 }
16301
16302 /* Subroutine of aarch64_sve_expand_vector_init for handling
16303 trailing constants.
16304 This function works as follows:
16305 (a) Create a new vector consisting of trailing constants.
16306 (b) Initialize TARGET with the constant vector using emit_move_insn.
16307 (c) Insert remaining elements in TARGET using insr.
16308 NELTS is the total number of elements in the original vector, while
16309 NELTS_REQD is the number of elements that are actually
16310 significant.
16311
16312 ??? The heuristic used is to do the above only if the number of constants
16313 is at least half the total number of elements. May need fine-tuning. */
16314
16315 static bool
16316 aarch64_sve_expand_vector_init_handle_trailing_constants
16317 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16318 {
16319 machine_mode mode = GET_MODE (target);
16320 scalar_mode elem_mode = GET_MODE_INNER (mode);
16321 int n_trailing_constants = 0;
16322
16323 for (int i = nelts_reqd - 1;
16324 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16325 i--)
16326 n_trailing_constants++;
16327
16328 if (n_trailing_constants >= nelts_reqd / 2)
16329 {
16330 rtx_vector_builder v (mode, 1, nelts);
16331 for (int i = 0; i < nelts; i++)
16332 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16333 rtx const_vec = v.build ();
16334 emit_move_insn (target, const_vec);
16335
16336 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16337 emit_insr (target, builder.elt (i));
16338
16339 return true;
16340 }
16341
16342 return false;
16343 }
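
/* Worked example (illustrative): for BUILDER = { a, b, 1, 2 } with
   NELTS_REQD == 4, the two trailing constants meet the "at least half"
   threshold, so TARGET is first set to a constant vector whose low elements
   are { 1, 2, ... } and the variable elements are then shifted in from the
   front:

     insr	z0.s, w_b
     insr	z0.s, w_a

   leaving { a, b, 1, 2 } in the first four elements.  Register names here
   are invented for the example.  */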
16344
16345 /* Subroutine of aarch64_sve_expand_vector_init.
16346 Works as follows:
16347 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16348 (b) Skip trailing elements from BUILDER, which are the same as
16349 element NELTS_REQD - 1.
16350 (c) Insert earlier elements in reverse order in TARGET using insr. */
16351
16352 static void
16353 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16354 const rtx_vector_builder &builder,
16355 int nelts_reqd)
16356 {
16357 machine_mode mode = GET_MODE (target);
16358 scalar_mode elem_mode = GET_MODE_INNER (mode);
16359
16360 struct expand_operand ops[2];
16361 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16362 gcc_assert (icode != CODE_FOR_nothing);
16363
16364 create_output_operand (&ops[0], target, mode);
16365 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16366 expand_insn (icode, 2, ops);
16367
16368 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16369 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16370 emit_insr (target, builder.elt (i));
16371 }
16372
16373 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
16374 when all trailing elements of BUILDER are the same.
16375 This works as follows:
16376 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16377 (b) Insert remaining elements in TARGET using insr.
16378
16379 ??? The heuristic used is to do the above if the number of identical
16380 trailing elements is at least 3/4 of the total number of elements,
16381 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
16382
16383 static bool
16384 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16385 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16386 {
16387 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16388 if (ndups >= (3 * nelts_reqd) / 4)
16389 {
16390 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16391 nelts_reqd - ndups + 1);
16392 return true;
16393 }
16394
16395 return false;
16396 }
16397
16398 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16399 of elements in BUILDER.
16400
16401 The function tries to initialize TARGET from BUILDER if it fits one
16402 of the special cases outlined below.
16403
16404 Failing that, the function divides BUILDER into two sub-vectors:
16405 v_even = even elements of BUILDER;
16406 v_odd = odd elements of BUILDER;
16407
16408 and recursively calls itself with v_even and v_odd.
16409
16410 if (recursive call succeeded for v_even or v_odd)
16411 TARGET = zip (v_even, v_odd)
16412
16413 The function returns true if it managed to build TARGET from BUILDER
16414 with one of the special cases, false otherwise.
16415
16416 Example: {a, 1, b, 2, c, 3, d, 4}
16417
16418 The vector gets divided into:
16419 v_even = {a, b, c, d}
16420 v_odd = {1, 2, 3, 4}
16421
16422 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16423 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16424
16425 aarch64_sve_expand_vector_init(v_even) fails since v_even matches none of
16426 the special cases and is too small to recurse, so we construct tmp1 from v_even using insr:
16427 tmp1 = dup(d)
16428 insr tmp1, c
16429 insr tmp1, b
16430 insr tmp1, a
16431
16432 And finally:
16433 TARGET = zip (tmp1, tmp2)
16434 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16435
16436 static bool
16437 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16438 int nelts, int nelts_reqd)
16439 {
16440 machine_mode mode = GET_MODE (target);
16441
16442 /* Case 1: Vector contains trailing constants. */
16443
16444 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16445 (target, builder, nelts, nelts_reqd))
16446 return true;
16447
16448 /* Case 2: Vector contains leading constants. */
16449
16450 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16451 for (int i = 0; i < nelts_reqd; i++)
16452 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16453 rev_builder.finalize ();
16454
16455 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16456 (target, rev_builder, nelts, nelts_reqd))
16457 {
16458 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16459 return true;
16460 }
16461
16462 /* Case 3: Vector contains trailing same element. */
16463
16464 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16465 (target, builder, nelts_reqd))
16466 return true;
16467
16468 /* Case 4: Vector contains leading same element. */
16469
16470 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16471 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16472 {
16473 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16474 return true;
16475 }
16476
16477 /* Avoid recursing below 4-elements.
16478 ??? The threshold 4 may need fine-tuning. */
16479
16480 if (nelts_reqd <= 4)
16481 return false;
16482
16483 rtx_vector_builder v_even (mode, 1, nelts);
16484 rtx_vector_builder v_odd (mode, 1, nelts);
16485
16486 for (int i = 0; i < nelts * 2; i += 2)
16487 {
16488 v_even.quick_push (builder.elt (i));
16489 v_odd.quick_push (builder.elt (i + 1));
16490 }
16491
16492 v_even.finalize ();
16493 v_odd.finalize ();
16494
16495 rtx tmp1 = gen_reg_rtx (mode);
16496 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16497 nelts, nelts_reqd / 2);
16498
16499 rtx tmp2 = gen_reg_rtx (mode);
16500 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16501 nelts, nelts_reqd / 2);
16502
16503 if (!did_even_p && !did_odd_p)
16504 return false;
16505
16506 /* Initialize tmp1 and/or tmp2 with INSR for whichever of v_even/v_odd did
16507 not match any of the special cases, then zip them into TARGET. */
16508
16509 if (!did_even_p)
16510 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16511
16512 if (!did_odd_p)
16513 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16514
16515 rtvec v = gen_rtvec (2, tmp1, tmp2);
16516 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16517 return true;
16518 }
16519
16520 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16521
16522 void
16523 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16524 {
16525 machine_mode mode = GET_MODE (target);
16526 int nelts = XVECLEN (vals, 0);
16527
16528 rtx_vector_builder v (mode, 1, nelts);
16529 for (int i = 0; i < nelts; i++)
16530 v.quick_push (XVECEXP (vals, 0, i));
16531 v.finalize ();
16532
16533 /* If neither of the sub-vectors of v could be initialized specially,
16534 then use INSR to insert all elements from v into TARGET.
16535 ??? This might not be optimal for vectors with large
16536 initializers like 16-element or above.
16537 For nelts < 4, it probably isn't useful to handle specially. */
16538
16539 if (nelts < 4
16540 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16541 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16542 }
16543
16544 /* Check whether VALUE is a vector constant in which every element
16545 is either a power of 2 or a negated power of 2. If so, return
16546 a constant vector of log2s, and flip CODE between PLUS and MINUS
16547 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
16548
16549 static rtx
16550 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
16551 {
16552 if (GET_CODE (value) != CONST_VECTOR)
16553 return NULL_RTX;
16554
16555 rtx_vector_builder builder;
16556 if (!builder.new_unary_operation (GET_MODE (value), value, false))
16557 return NULL_RTX;
16558
16559 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
16560 /* 1 if the result of the multiplication must be negated,
16561 0 if it mustn't, or -1 if we don't yet care. */
16562 int negate = -1;
16563 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
16564 for (unsigned int i = 0; i < encoded_nelts; ++i)
16565 {
16566 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
16567 if (!CONST_SCALAR_INT_P (elt))
16568 return NULL_RTX;
16569 rtx_mode_t val (elt, int_mode);
16570 wide_int pow2 = wi::neg (val);
16571 if (val != pow2)
16572 {
16573 /* It matters whether we negate or not. Make that choice,
16574 and make sure that it's consistent with previous elements. */
16575 if (negate == !wi::neg_p (val))
16576 return NULL_RTX;
16577 negate = wi::neg_p (val);
16578 if (!negate)
16579 pow2 = val;
16580 }
16581 /* POW2 is now the value that we want to be a power of 2. */
16582 int shift = wi::exact_log2 (pow2);
16583 if (shift < 0)
16584 return NULL_RTX;
16585 builder.quick_push (gen_int_mode (shift, int_mode));
16586 }
16587 if (negate == -1)
16588 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
16589 code = PLUS;
16590 else if (negate == 1)
16591 code = code == PLUS ? MINUS : PLUS;
16592 return builder.build ();
16593 }
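
/* Worked example (illustrative): a multiplier of { 4, 4, 4, 4 } becomes the
   shift vector { 2, 2, 2, 2 } with CODE left unchanged, while { -4, -4, ... }
   also becomes { 2, 2, ... } but flips CODE (PLUS <-> MINUS), so that
   x * -4 + y can be emitted as y - (x << 2).  */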
16594
16595 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16596 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
16597 operands array, in the same order as for fma_optab. Return true if
16598 the function emitted all the necessary instructions, false if the caller
16599 should generate the pattern normally with the new OPERANDS array. */
16600
16601 bool
16602 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
16603 {
16604 machine_mode mode = GET_MODE (operands[0]);
16605 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
16606 {
16607 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
16608 NULL_RTX, true, OPTAB_DIRECT);
16609 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
16610 operands[3], product, operands[0], true,
16611 OPTAB_DIRECT);
16612 return true;
16613 }
16614 operands[2] = force_reg (mode, operands[2]);
16615 return false;
16616 }
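
/* Usage sketch (illustrative; register numbers are invented): with the shift
   conversion above, an integer SVE fma of the form acc + x * 8 expands to a
   shift followed by an ordinary add rather than a MUL/MLA sequence, roughly:

     lsl	z1.s, z1.s, #3
     add	z0.s, z0.s, z1.s  */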
16617
16618 /* Likewise, but for a conditional pattern. */
16619
16620 bool
16621 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
16622 {
16623 machine_mode mode = GET_MODE (operands[0]);
16624 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
16625 {
16626 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
16627 NULL_RTX, true, OPTAB_DIRECT);
16628 emit_insn (gen_cond (code, mode, operands[0], operands[1],
16629 operands[4], product, operands[5]));
16630 return true;
16631 }
16632 operands[3] = force_reg (mode, operands[3]);
16633 return false;
16634 }
16635
16636 static unsigned HOST_WIDE_INT
16637 aarch64_shift_truncation_mask (machine_mode mode)
16638 {
16639 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16640 return 0;
16641 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16642 }
16643
16644 /* Select a format to encode pointers in exception handling data. */
16645 int
16646 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16647 {
16648 int type;
16649 switch (aarch64_cmodel)
16650 {
16651 case AARCH64_CMODEL_TINY:
16652 case AARCH64_CMODEL_TINY_PIC:
16653 case AARCH64_CMODEL_SMALL:
16654 case AARCH64_CMODEL_SMALL_PIC:
16655 case AARCH64_CMODEL_SMALL_SPIC:
16656 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16657 for everything. */
16658 type = DW_EH_PE_sdata4;
16659 break;
16660 default:
16661 /* No assumptions here. 8-byte relocs required. */
16662 type = DW_EH_PE_sdata8;
16663 break;
16664 }
16665 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16666 }
16667
16668 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16669
16670 static void
16671 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16672 {
16673 if (aarch64_simd_decl_p (decl))
16674 {
16675 fprintf (stream, "\t.variant_pcs\t");
16676 assemble_name (stream, name);
16677 fprintf (stream, "\n");
16678 }
16679 }
16680
16681 /* The last .arch and .tune assembly strings that we printed. */
16682 static std::string aarch64_last_printed_arch_string;
16683 static std::string aarch64_last_printed_tune_string;
16684
16685 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16686 by the function fndecl. */
16687
16688 void
16689 aarch64_declare_function_name (FILE *stream, const char* name,
16690 tree fndecl)
16691 {
16692 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16693
16694 struct cl_target_option *targ_options;
16695 if (target_parts)
16696 targ_options = TREE_TARGET_OPTION (target_parts);
16697 else
16698 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16699 gcc_assert (targ_options);
16700
16701 const struct processor *this_arch
16702 = aarch64_get_arch (targ_options->x_explicit_arch);
16703
16704 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16705 std::string extension
16706 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16707 this_arch->flags);
16708 /* Only update the assembler .arch string if it is distinct from the last
16709 such string we printed. */
16710 std::string to_print = this_arch->name + extension;
16711 if (to_print != aarch64_last_printed_arch_string)
16712 {
16713 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16714 aarch64_last_printed_arch_string = to_print;
16715 }
16716
16717 /* Print the cpu name we're tuning for in the comments; it might be
16718 useful to readers of the generated asm. Do it only when it changes
16719 from function to function and verbose assembly is requested. */
16720 const struct processor *this_tune
16721 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16722
16723 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16724 {
16725 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16726 this_tune->name);
16727 aarch64_last_printed_tune_string = this_tune->name;
16728 }
16729
16730 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16731
16732 /* Don't forget the type directive for ELF. */
16733 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16734 ASM_OUTPUT_LABEL (stream, name);
16735 }
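
/* Illustrative output (added for exposition; the exact directives depend on
   the selected architecture, PCS and -dA): for a vector-PCS function foo on
   an armv8.2-a target this hook prints something along the lines of

	.arch	armv8.2-a+fp16
	// .tune cortex-a75
	.variant_pcs	foo
	.type	foo, %function
     foo:

   where the .tune comment appears only with verbose assembly and the
   .variant_pcs line only for aarch64_vector_pcs functions.  */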
16736
16737 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16738
16739 void
16740 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16741 {
16742 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16743 const char *value = IDENTIFIER_POINTER (target);
16744 aarch64_asm_output_variant_pcs (stream, decl, name);
16745 ASM_OUTPUT_DEF (stream, name, value);
16746 }
16747
16748 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16749 function symbol references. */
16750
16751 void
16752 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16753 {
16754 default_elf_asm_output_external (stream, decl, name);
16755 aarch64_asm_output_variant_pcs (stream, decl, name);
16756 }
16757
16758 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16759 Used to output the .cfi_b_key_frame directive when signing the current
16760 function with the B key. */
16761
16762 void
16763 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16764 {
16765 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16766 && aarch64_ra_sign_key == AARCH64_KEY_B)
16767 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16768 }
16769
16770 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16771
16772 static void
16773 aarch64_start_file (void)
16774 {
16775 struct cl_target_option *default_options
16776 = TREE_TARGET_OPTION (target_option_default_node);
16777
16778 const struct processor *default_arch
16779 = aarch64_get_arch (default_options->x_explicit_arch);
16780 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16781 std::string extension
16782 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16783 default_arch->flags);
16784
16785 aarch64_last_printed_arch_string = default_arch->name + extension;
16786 aarch64_last_printed_tune_string = "";
16787 asm_fprintf (asm_out_file, "\t.arch %s\n",
16788 aarch64_last_printed_arch_string.c_str ());
16789
16790 default_file_start ();
16791 }
16792
16793 /* Emit load exclusive. */
16794
16795 static void
16796 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16797 rtx mem, rtx model_rtx)
16798 {
16799 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16800 }
16801
16802 /* Emit store exclusive. */
16803
16804 static void
16805 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16806 rtx rval, rtx mem, rtx model_rtx)
16807 {
16808 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
16809 }
16810
16811 /* Mark the previous jump instruction as unlikely. */
16812
16813 static void
16814 aarch64_emit_unlikely_jump (rtx insn)
16815 {
16816 rtx_insn *jump = emit_jump_insn (insn);
16817 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16818 }
16819
16820 /* Expand a compare and swap pattern. */
16821
16822 void
16823 aarch64_expand_compare_and_swap (rtx operands[])
16824 {
16825 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16826 machine_mode mode, r_mode;
16827
16828 bval = operands[0];
16829 rval = operands[1];
16830 mem = operands[2];
16831 oldval = operands[3];
16832 newval = operands[4];
16833 is_weak = operands[5];
16834 mod_s = operands[6];
16835 mod_f = operands[7];
16836 mode = GET_MODE (mem);
16837
16838 /* Normally the succ memory model must be stronger than fail, but in the
16839 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16840 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16841 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16842 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16843 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16844
16845 r_mode = mode;
16846 if (mode == QImode || mode == HImode)
16847 {
16848 r_mode = SImode;
16849 rval = gen_reg_rtx (r_mode);
16850 }
16851
16852 if (TARGET_LSE)
16853 {
16854 /* The CAS insn requires oldval and rval overlap, but we need to
16855 have a copy of oldval saved across the operation to tell if
16856 the operation is successful. */
16857 if (reg_overlap_mentioned_p (rval, oldval))
16858 rval = copy_to_mode_reg (r_mode, oldval);
16859 else
16860 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16861
16862 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16863 newval, mod_s));
16864 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16865 }
16866 else
16867 {
16868 /* The oldval predicate varies by mode. Test it and force to reg. */
16869 insn_code code = code_for_aarch64_compare_and_swap (mode);
16870 if (!insn_data[code].operand[2].predicate (oldval, mode))
16871 oldval = force_reg (mode, oldval);
16872
16873 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16874 is_weak, mod_s, mod_f));
16875 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16876 }
16877
16878 if (r_mode != mode)
16879 rval = gen_lowpart (mode, rval);
16880 emit_move_insn (operands[1], rval);
16881
16882 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16883 emit_insn (gen_rtx_SET (bval, x));
16884 }
16885
16886 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
16887 sequence implementing an atomic operation. */
16888
16889 static void
16890 aarch64_emit_post_barrier (enum memmodel model)
16891 {
16892 const enum memmodel base_model = memmodel_base (model);
16893
16894 if (is_mm_sync (model)
16895 && (base_model == MEMMODEL_ACQUIRE
16896 || base_model == MEMMODEL_ACQ_REL
16897 || base_model == MEMMODEL_SEQ_CST))
16898 {
16899 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16900 }
16901 }
16902
16903 /* Split a compare and swap pattern. */
16904
16905 void
16906 aarch64_split_compare_and_swap (rtx operands[])
16907 {
16908 rtx rval, mem, oldval, newval, scratch;
16909 machine_mode mode;
16910 bool is_weak;
16911 rtx_code_label *label1, *label2;
16912 rtx x, cond;
16913 enum memmodel model;
16914 rtx model_rtx;
16915
16916 rval = operands[0];
16917 mem = operands[1];
16918 oldval = operands[2];
16919 newval = operands[3];
16920 is_weak = (operands[4] != const0_rtx);
16921 model_rtx = operands[5];
16922 scratch = operands[7];
16923 mode = GET_MODE (mem);
16924 model = memmodel_from_int (INTVAL (model_rtx));
16925
16926 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16927 loop:
16928 .label1:
16929 LD[A]XR rval, [mem]
16930 CBNZ rval, .label2
16931 ST[L]XR scratch, newval, [mem]
16932 CBNZ scratch, .label1
16933 .label2:
16934 CMP rval, 0. */
16935 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16936
16937 label1 = NULL;
16938 if (!is_weak)
16939 {
16940 label1 = gen_label_rtx ();
16941 emit_label (label1);
16942 }
16943 label2 = gen_label_rtx ();
16944
16945 /* The initial load can be relaxed for a __sync operation since a final
16946 barrier will be emitted to stop code hoisting. */
16947 if (is_mm_sync (model))
16948 aarch64_emit_load_exclusive (mode, rval, mem,
16949 GEN_INT (MEMMODEL_RELAXED));
16950 else
16951 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16952
16953 if (strong_zero_p)
16954 {
16955 if (aarch64_track_speculation)
16956 {
16957 /* Emit an explicit compare instruction, so that we can correctly
16958 track the condition codes. */
16959 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16960 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16961 }
16962 else
16963 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16964
16965 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16966 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16967 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16968 }
16969 else
16970 {
16971 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16972 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16973 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16974 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16975 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16976 }
16977
16978 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16979
16980 if (!is_weak)
16981 {
16982 if (aarch64_track_speculation)
16983 {
16984 /* Emit an explicit compare instruction, so that we can correctly
16985 track the condition codes. */
16986 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16987 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16988 }
16989 else
16990 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16991
16992 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16993 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16994 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16995 }
16996 else
16997 {
16998 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16999 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
17000 emit_insn (gen_rtx_SET (cond, x));
17001 }
17002
17003 emit_label (label2);
17004 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
17005 to set the condition flags. If this is not used it will be removed by
17006 later passes. */
17007 if (strong_zero_p)
17008 {
17009 cond = gen_rtx_REG (CCmode, CC_REGNUM);
17010 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
17011 emit_insn (gen_rtx_SET (cond, x));
17012 }
17013 /* Emit any final barrier needed for a __sync operation. */
17014 if (is_mm_sync (model))
17015 aarch64_emit_post_barrier (model);
17016 }
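
/* Illustrative expansion (operand names invented): for a strong compare and
   swap where OLDVAL is not the constant zero, the split above produces a loop
   of roughly this shape, with the acquire/release suffixes chosen from MODEL:

     .label1:
	ld[a]xr	rval, [mem]
	cmp	rval, oldval
	b.ne	.label2
	st[l]xr	scratch, newval, [mem]
	cbnz	scratch, .label1
     .label2:  */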
17017
17018 /* Split an atomic operation. */
17019
17020 void
17021 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
17022 rtx value, rtx model_rtx, rtx cond)
17023 {
17024 machine_mode mode = GET_MODE (mem);
17025 machine_mode wmode = (mode == DImode ? DImode : SImode);
17026 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
17027 const bool is_sync = is_mm_sync (model);
17028 rtx_code_label *label;
17029 rtx x;
17030
17031 /* Split the atomic operation into a sequence. */
17032 label = gen_label_rtx ();
17033 emit_label (label);
17034
17035 if (new_out)
17036 new_out = gen_lowpart (wmode, new_out);
17037 if (old_out)
17038 old_out = gen_lowpart (wmode, old_out);
17039 else
17040 old_out = new_out;
17041 value = simplify_gen_subreg (wmode, value, mode, 0);
17042
17043 /* The initial load can be relaxed for a __sync operation since a final
17044 barrier will be emitted to stop code hoisting. */
17045 if (is_sync)
17046 aarch64_emit_load_exclusive (mode, old_out, mem,
17047 GEN_INT (MEMMODEL_RELAXED));
17048 else
17049 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
17050
17051 switch (code)
17052 {
17053 case SET:
17054 new_out = value;
17055 break;
17056
17057 case NOT:
17058 x = gen_rtx_AND (wmode, old_out, value);
17059 emit_insn (gen_rtx_SET (new_out, x));
17060 x = gen_rtx_NOT (wmode, new_out);
17061 emit_insn (gen_rtx_SET (new_out, x));
17062 break;
17063
17064 case MINUS:
17065 if (CONST_INT_P (value))
17066 {
17067 value = GEN_INT (-INTVAL (value));
17068 code = PLUS;
17069 }
17070 /* Fall through. */
17071
17072 default:
17073 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
17074 emit_insn (gen_rtx_SET (new_out, x));
17075 break;
17076 }
17077
17078 aarch64_emit_store_exclusive (mode, cond, mem,
17079 gen_lowpart (mode, new_out), model_rtx);
17080
17081 if (aarch64_track_speculation)
17082 {
17083 /* Emit an explicit compare instruction, so that we can correctly
17084 track the condition codes. */
17085 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
17086 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17087 }
17088 else
17089 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
17090
17091 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17092 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
17093 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17094
17095 /* Emit any final barrier needed for a __sync operation. */
17096 if (is_sync)
17097 aarch64_emit_post_barrier (model);
17098 }
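
/* Illustrative expansion (operand names invented): splitting an atomic
   fetch-and-add produces a load/modify/store-exclusive retry loop of roughly
   this form, with the ordering suffixes chosen from MODEL:

     .label:
	ld[a]xr	old, [mem]
	add	new, old, value
	st[l]xr	cond, new, [mem]
	cbnz	cond, .label  */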
17099
17100 static void
17101 aarch64_init_libfuncs (void)
17102 {
17103 /* Half-precision float operations. The compiler handles all operations
17104 with NULL libfuncs by converting to SFmode. */
17105
17106 /* Conversions. */
17107 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
17108 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
17109
17110 /* Arithmetic. */
17111 set_optab_libfunc (add_optab, HFmode, NULL);
17112 set_optab_libfunc (sdiv_optab, HFmode, NULL);
17113 set_optab_libfunc (smul_optab, HFmode, NULL);
17114 set_optab_libfunc (neg_optab, HFmode, NULL);
17115 set_optab_libfunc (sub_optab, HFmode, NULL);
17116
17117 /* Comparisons. */
17118 set_optab_libfunc (eq_optab, HFmode, NULL);
17119 set_optab_libfunc (ne_optab, HFmode, NULL);
17120 set_optab_libfunc (lt_optab, HFmode, NULL);
17121 set_optab_libfunc (le_optab, HFmode, NULL);
17122 set_optab_libfunc (ge_optab, HFmode, NULL);
17123 set_optab_libfunc (gt_optab, HFmode, NULL);
17124 set_optab_libfunc (unord_optab, HFmode, NULL);
17125 }
17126
17127 /* Target hook for c_mode_for_suffix. */
17128 static machine_mode
17129 aarch64_c_mode_for_suffix (char suffix)
17130 {
17131 if (suffix == 'q')
17132 return TFmode;
17133
17134 return VOIDmode;
17135 }
17136
17137 /* We can only represent floating point constants which will fit in
17138 "quarter-precision" values. These values are characterised by
17139 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
17140 by:
17141
17142 (-1)^s * (n/16) * 2^r
17143
17144 Where:
17145 's' is the sign bit.
17146 'n' is an integer in the range 16 <= n <= 31.
17147 'r' is an integer in the range -3 <= r <= 4. */
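
/* Worked examples (added for illustration): 1.5 = (24/16) * 2^0,
   17.0 = (17/16) * 2^4 and 0.125 = (16/16) * 2^-3 (the smallest positive
   value) are all representable; the largest representable value is
   (31/16) * 2^4 = 31.0.  A value such as 0.1 is not representable because
   it cannot be written as (n/16) * 2^r with n in [16, 31] and r in [-3, 4],
   and 0.0 is rejected explicitly below.  */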
17148
17149 /* Return true iff X can be represented by a quarter-precision
17150 floating point immediate operand. Note, we cannot represent 0.0. */
17151 bool
17152 aarch64_float_const_representable_p (rtx x)
17153 {
17154 /* This represents our current view of how many bits
17155 make up the mantissa. */
17156 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
17157 int exponent;
17158 unsigned HOST_WIDE_INT mantissa, mask;
17159 REAL_VALUE_TYPE r, m;
17160 bool fail;
17161
17162 x = unwrap_const_vec_duplicate (x);
17163 if (!CONST_DOUBLE_P (x))
17164 return false;
17165
17166 if (GET_MODE (x) == VOIDmode
17167 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17168 return false;
17169
17170 r = *CONST_DOUBLE_REAL_VALUE (x);
17171
17172 /* We cannot represent infinities, NaNs or +/-zero. We won't
17173 know if we have +zero until we analyse the mantissa, but we
17174 can reject the other invalid values. */
17175 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17176 || REAL_VALUE_MINUS_ZERO (r))
17177 return false;
17178
17179 /* Extract exponent. */
17180 r = real_value_abs (&r);
17181 exponent = REAL_EXP (&r);
17182
17183 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17184 highest (sign) bit, with a fixed binary point at bit point_pos.
17185 The low half of W holds the low part of the mantissa, the high half the high part.
17186 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17187 bits for the mantissa, this can fail (low bits will be lost). */
17188 real_ldexp (&m, &r, point_pos - exponent);
17189 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17190
17191 /* If the low part of the mantissa has bits set we cannot represent
17192 the value. */
17193 if (w.ulow () != 0)
17194 return false;
17195 /* We have rejected the lower HOST_WIDE_INT, so update our
17196 understanding of how many bits lie in the mantissa and
17197 look only at the high HOST_WIDE_INT. */
17198 mantissa = w.elt (1);
17199 point_pos -= HOST_BITS_PER_WIDE_INT;
17200
17201 /* We can only represent values with a mantissa of the form 1.xxxx. */
17202 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17203 if ((mantissa & mask) != 0)
17204 return false;
17205
17206 /* Having filtered unrepresentable values, we may now remove all
17207 but the highest 5 bits. */
17208 mantissa >>= point_pos - 5;
17209
17210 /* We cannot represent the value 0.0, so reject it. This is handled
17211 elsewhere. */
17212 if (mantissa == 0)
17213 return false;
17214
17215 /* Then, as bit 4 is always set, we can mask it off, leaving
17216 the mantissa in the range [0, 15]. */
17217 mantissa &= ~(1 << 4);
17218 gcc_assert (mantissa <= 15);
17219
17220 /* GCC internally does not use an IEEE754-like encoding (where normalized
17221 significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
17222 Our mantissa values are shifted 4 places to the left relative to
17223 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17224 by 5 places to correct for GCC's representation. */
17225 exponent = 5 - exponent;
17226
17227 return (exponent >= 0 && exponent <= 7);
17228 }
17229
17230 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17231 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17232 output MOVI/MVNI, ORR or BIC immediate. */
17233 char*
17234 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17235 enum simd_immediate_check which)
17236 {
17237 bool is_valid;
17238 static char templ[40];
17239 const char *mnemonic;
17240 const char *shift_op;
17241 unsigned int lane_count = 0;
17242 char element_char;
17243
17244 struct simd_immediate_info info;
17245
17246 /* This will return true to show const_vector is legal for use as either
17247 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17248 It will also update INFO to show how the immediate should be generated.
17249 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17250 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17251 gcc_assert (is_valid);
17252
17253 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17254 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17255
17256 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17257 {
17258 gcc_assert (info.insn == simd_immediate_info::MOV
17259 && info.u.mov.shift == 0);
17260 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17261 move immediate path. */
17262 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17263 info.u.mov.value = GEN_INT (0);
17264 else
17265 {
17266 const unsigned int buf_size = 20;
17267 char float_buf[buf_size] = {'\0'};
17268 real_to_decimal_for_mode (float_buf,
17269 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17270 buf_size, buf_size, 1, info.elt_mode);
17271
17272 if (lane_count == 1)
17273 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17274 else
17275 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17276 lane_count, element_char, float_buf);
17277 return templ;
17278 }
17279 }
17280
17281 gcc_assert (CONST_INT_P (info.u.mov.value));
17282
17283 if (which == AARCH64_CHECK_MOV)
17284 {
17285 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17286 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17287 ? "msl" : "lsl");
17288 if (lane_count == 1)
17289 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17290 mnemonic, UINTVAL (info.u.mov.value));
17291 else if (info.u.mov.shift)
17292 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17293 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17294 element_char, UINTVAL (info.u.mov.value), shift_op,
17295 info.u.mov.shift);
17296 else
17297 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17298 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17299 element_char, UINTVAL (info.u.mov.value));
17300 }
17301 else
17302 {
17303 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17304 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17305 if (info.u.mov.shift)
17306 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17307 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17308 element_char, UINTVAL (info.u.mov.value), "lsl",
17309 info.u.mov.shift);
17310 else
17311 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17312 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17313 element_char, UINTVAL (info.u.mov.value));
17314 }
17315 return templ;
17316 }
17317
17318 char*
17319 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17320 {
17321
17322 /* If a floating point number was passed and we desire to use it in an
17323 integer mode, do the conversion to integer. */
17324 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17325 {
17326 unsigned HOST_WIDE_INT ival;
17327 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17328 gcc_unreachable ();
17329 immediate = gen_int_mode (ival, mode);
17330 }
17331
17332 machine_mode vmode;
17333 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
17334 a 128-bit vector mode. */
17335 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17336
17337 vmode = aarch64_simd_container_mode (mode, width);
17338 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17339 return aarch64_output_simd_mov_immediate (v_op, width);
17340 }
17341
17342 /* Return the output string to use for moving immediate CONST_VECTOR
17343 into an SVE register. */
17344
17345 char *
17346 aarch64_output_sve_mov_immediate (rtx const_vector)
17347 {
17348 static char templ[40];
17349 struct simd_immediate_info info;
17350 char element_char;
17351
17352 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17353 gcc_assert (is_valid);
17354
17355 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17356
17357 machine_mode vec_mode = GET_MODE (const_vector);
17358 if (aarch64_sve_pred_mode_p (vec_mode))
17359 {
17360 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17361 if (info.insn == simd_immediate_info::MOV)
17362 {
17363 gcc_assert (info.u.mov.value == const0_rtx);
17364 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17365 }
17366 else
17367 {
17368 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17369 unsigned int total_bytes;
17370 if (info.u.pattern == AARCH64_SV_ALL
17371 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17372 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17373 total_bytes / GET_MODE_SIZE (info.elt_mode));
17374 else
17375 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17376 svpattern_token (info.u.pattern));
17377 }
17378 return buf;
17379 }
17380
17381 if (info.insn == simd_immediate_info::INDEX)
17382 {
17383 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17384 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17385 element_char, INTVAL (info.u.index.base),
17386 INTVAL (info.u.index.step));
17387 return templ;
17388 }
17389
17390 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17391 {
17392 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17393 info.u.mov.value = GEN_INT (0);
17394 else
17395 {
17396 const int buf_size = 20;
17397 char float_buf[buf_size] = {};
17398 real_to_decimal_for_mode (float_buf,
17399 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17400 buf_size, buf_size, 1, info.elt_mode);
17401
17402 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17403 element_char, float_buf);
17404 return templ;
17405 }
17406 }
17407
17408 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17409 element_char, INTVAL (info.u.mov.value));
17410 return templ;
17411 }
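
/* Illustrative outputs (operand 0 spelled as z0/p0 for concreteness): the
   templates above produce strings such as

     mov	z0.s, #3
     index	z0.s, #0, #1
     ptrue	p0.b, vl16
     pfalse	p0.b

   for an integer splat, an INDEX constant, an all-true predicate on a
   128-bit vector, and an all-false predicate respectively.  */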
17412
17413 /* Split operands into moves from op[1] + op[2] into op[0]. */
17414
17415 void
17416 aarch64_split_combinev16qi (rtx operands[3])
17417 {
17418 unsigned int dest = REGNO (operands[0]);
17419 unsigned int src1 = REGNO (operands[1]);
17420 unsigned int src2 = REGNO (operands[2]);
17421 machine_mode halfmode = GET_MODE (operands[1]);
17422 unsigned int halfregs = REG_NREGS (operands[1]);
17423 rtx destlo, desthi;
17424
17425 gcc_assert (halfmode == V16QImode);
17426
17427 if (src1 == dest && src2 == dest + halfregs)
17428 {
17429 /* No-op move. Can't split to nothing; emit something. */
17430 emit_note (NOTE_INSN_DELETED);
17431 return;
17432 }
17433
17434 /* Preserve register attributes for variable tracking. */
17435 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17436 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17437 GET_MODE_SIZE (halfmode));
17438
17439 /* Special case of reversed high/low parts. */
17440 if (reg_overlap_mentioned_p (operands[2], destlo)
17441 && reg_overlap_mentioned_p (operands[1], desthi))
17442 {
17443 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17444 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17445 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17446 }
17447 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17448 {
17449 /* Try to avoid unnecessary moves if part of the result
17450 is in the right place already. */
17451 if (src1 != dest)
17452 emit_move_insn (destlo, operands[1]);
17453 if (src2 != dest + halfregs)
17454 emit_move_insn (desthi, operands[2]);
17455 }
17456 else
17457 {
17458 if (src2 != dest + halfregs)
17459 emit_move_insn (desthi, operands[2]);
17460 if (src1 != dest)
17461 emit_move_insn (destlo, operands[1]);
17462 }
17463 }
17464
17465 /* vec_perm support. */
17466
17467 struct expand_vec_perm_d
17468 {
17469 rtx target, op0, op1;
17470 vec_perm_indices perm;
17471 machine_mode vmode;
17472 unsigned int vec_flags;
17473 bool one_vector_p;
17474 bool testing_p;
17475 };
17476
17477 /* Generate a variable permutation. */
17478
17479 static void
17480 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17481 {
17482 machine_mode vmode = GET_MODE (target);
17483 bool one_vector_p = rtx_equal_p (op0, op1);
17484
17485 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17486 gcc_checking_assert (GET_MODE (op0) == vmode);
17487 gcc_checking_assert (GET_MODE (op1) == vmode);
17488 gcc_checking_assert (GET_MODE (sel) == vmode);
17489 gcc_checking_assert (TARGET_SIMD);
17490
17491 if (one_vector_p)
17492 {
17493 if (vmode == V8QImode)
17494 {
17495 /* Expand the argument to a V16QI mode by duplicating it. */
17496 rtx pair = gen_reg_rtx (V16QImode);
17497 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17498 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17499 }
17500 else
17501 {
17502 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17503 }
17504 }
17505 else
17506 {
17507 rtx pair;
17508
17509 if (vmode == V8QImode)
17510 {
17511 pair = gen_reg_rtx (V16QImode);
17512 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17513 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17514 }
17515 else
17516 {
17517 pair = gen_reg_rtx (OImode);
17518 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17519 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17520 }
17521 }
17522 }
17523
17524 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17525 NELT is the number of elements in the vector. */
17526
17527 void
17528 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17529 unsigned int nelt)
17530 {
17531 machine_mode vmode = GET_MODE (target);
17532 bool one_vector_p = rtx_equal_p (op0, op1);
17533 rtx mask;
17534
17535 /* The TBL instruction does not use a modulo index, so we must take care
17536 of that ourselves. */
17537 mask = aarch64_simd_gen_const_vector_dup (vmode,
17538 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17539 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17540
17541 /* For big-endian, we also need to reverse the index within the vector
17542 (but not which vector). */
17543 if (BYTES_BIG_ENDIAN)
17544 {
17545 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17546 if (!one_vector_p)
17547 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17548 sel = expand_simple_binop (vmode, XOR, sel, mask,
17549 NULL, 0, OPTAB_LIB_WIDEN);
17550 }
17551 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17552 }
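
/* Worked example (illustrative): for a two-vector V16QI permute the selector
   is first ANDed with 31 so that every index addresses one of the 32 bytes
   of { op0, op1 }; on big-endian targets it is additionally XORed with 15 so
   that the index is reversed within each input vector, matching the
   difference between GCC's element numbering and the architectural lane
   numbering, while leaving the choice of input vector unchanged.  */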
17553
17554 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17555
17556 static void
17557 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17558 {
17559 emit_insn (gen_rtx_SET (target,
17560 gen_rtx_UNSPEC (GET_MODE (target),
17561 gen_rtvec (2, op0, op1), code)));
17562 }
17563
17564 /* Expand an SVE vec_perm with the given operands. */
17565
17566 void
17567 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17568 {
17569 machine_mode data_mode = GET_MODE (target);
17570 machine_mode sel_mode = GET_MODE (sel);
17571 /* Enforced by the pattern condition. */
17572 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17573
17574 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17575 size of the two value vectors, i.e. the upper bits of the indices
17576 are effectively ignored. SVE TBL instead produces 0 for any
17577 out-of-range indices, so we need to modulo all the vec_perm indices
17578 to ensure they are all in range. */
17579 rtx sel_reg = force_reg (sel_mode, sel);
17580
17581 /* Check if the sel only references the first values vector. */
17582 if (GET_CODE (sel) == CONST_VECTOR
17583 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17584 {
17585 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17586 return;
17587 }
17588
17589 /* Check if the two values vectors are the same. */
17590 if (rtx_equal_p (op0, op1))
17591 {
17592 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17593 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17594 NULL, 0, OPTAB_DIRECT);
17595 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17596 return;
17597 }
17598
17599 /* Run a TBL for each value vector and combine the results. */
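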
17600
17601 rtx res0 = gen_reg_rtx (data_mode);
17602 rtx res1 = gen_reg_rtx (data_mode);
17603 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17604 if (GET_CODE (sel) != CONST_VECTOR
17605 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17606 {
17607 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17608 2 * nunits - 1);
17609 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17610 NULL, 0, OPTAB_DIRECT);
17611 }
17612 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17613 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17614 NULL, 0, OPTAB_DIRECT);
17615 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17616 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17617 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17618 else
17619 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17620 }
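
/* Illustrative sketch of the general two-vector case above, assuming
   for the sake of example that SEL_MODE has four elements: for
   SEL = { 1, 6, 3, 5 } the first TBL on OP0 uses SEL directly and
   yields { op0[1], 0, op0[3], 0 }, since indices 6 and 5 are out of
   range for a single vector.  The second TBL uses SEL - 4
   = { -3, 2, -1, 1 }, where the negative (i.e. very large unsigned)
   indices again yield 0, giving { 0, op1[2], 0, op1[1] }.  ORing the
   two results produces the requested permutation.  */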
17621
17622 /* Recognize patterns suitable for the TRN instructions. */
17623 static bool
17624 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17625 {
17626 HOST_WIDE_INT odd;
17627 poly_uint64 nelt = d->perm.length ();
17628 rtx out, in0, in1, x;
17629 machine_mode vmode = d->vmode;
17630
17631 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17632 return false;
17633
17634 /* Note that these are little-endian tests.
17635 We correct for big-endian later. */
17636 if (!d->perm[0].is_constant (&odd)
17637 || (odd != 0 && odd != 1)
17638 || !d->perm.series_p (0, 2, odd, 2)
17639 || !d->perm.series_p (1, 2, nelt + odd, 2))
17640 return false;
17641
17642 /* Success! */
17643 if (d->testing_p)
17644 return true;
17645
17646 in0 = d->op0;
17647 in1 = d->op1;
17648 /* We don't need a big-endian lane correction for SVE; see the comment
17649 at the head of aarch64-sve.md for details. */
17650 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17651 {
17652 x = in0, in0 = in1, in1 = x;
17653 odd = !odd;
17654 }
17655 out = d->target;
17656
17657 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17658 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17659 return true;
17660 }
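
/* For illustration, on a little-endian V4SImode vector pair TRN1
   implements the index pattern { 0, 4, 2, 6 } and TRN2 implements
   { 1, 5, 3, 7 }; the series checks above match exactly these two
   patterns.  */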
17661
17662 /* Recognize patterns suitable for the UZP instructions. */
17663 static bool
17664 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17665 {
17666 HOST_WIDE_INT odd;
17667 rtx out, in0, in1, x;
17668 machine_mode vmode = d->vmode;
17669
17670 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17671 return false;
17672
17673 /* Note that these are little-endian tests.
17674 We correct for big-endian later. */
17675 if (!d->perm[0].is_constant (&odd)
17676 || (odd != 0 && odd != 1)
17677 || !d->perm.series_p (0, 1, odd, 2))
17678 return false;
17679
17680 /* Success! */
17681 if (d->testing_p)
17682 return true;
17683
17684 in0 = d->op0;
17685 in1 = d->op1;
17686 /* We don't need a big-endian lane correction for SVE; see the comment
17687 at the head of aarch64-sve.md for details. */
17688 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17689 {
17690 x = in0, in0 = in1, in1 = x;
17691 odd = !odd;
17692 }
17693 out = d->target;
17694
17695 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17696 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17697 return true;
17698 }
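
/* For illustration, on a little-endian V4SImode vector pair UZP1
   implements the index pattern { 0, 2, 4, 6 } (the even elements of
   the concatenated inputs) and UZP2 implements { 1, 3, 5, 7 }.  */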
17699
17700 /* Recognize patterns suitable for the ZIP instructions. */
17701 static bool
17702 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17703 {
17704 unsigned int high;
17705 poly_uint64 nelt = d->perm.length ();
17706 rtx out, in0, in1, x;
17707 machine_mode vmode = d->vmode;
17708
17709 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17710 return false;
17711
17712 /* Note that these are little-endian tests.
17713 We correct for big-endian later. */
17714 poly_uint64 first = d->perm[0];
17715 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17716 || !d->perm.series_p (0, 2, first, 1)
17717 || !d->perm.series_p (1, 2, first + nelt, 1))
17718 return false;
17719 high = maybe_ne (first, 0U);
17720
17721 /* Success! */
17722 if (d->testing_p)
17723 return true;
17724
17725 in0 = d->op0;
17726 in1 = d->op1;
17727 /* We don't need a big-endian lane correction for SVE; see the comment
17728 at the head of aarch64-sve.md for details. */
17729 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17730 {
17731 x = in0, in0 = in1, in1 = x;
17732 high = !high;
17733 }
17734 out = d->target;
17735
17736 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17737 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17738 return true;
17739 }
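
/* For illustration, on a little-endian V4SImode vector pair ZIP1
   implements the index pattern { 0, 4, 1, 5 } (interleaving the low
   halves of the two inputs) and ZIP2 implements { 2, 6, 3, 7 }.  */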
17740
17741 /* Recognize patterns for the EXT insn. */
17742
17743 static bool
17744 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17745 {
17746 HOST_WIDE_INT location;
17747 rtx offset;
17748
17749 /* The first element always refers to the first vector.
17750 Check if the extracted indices are increasing by one. */
17751 if (d->vec_flags == VEC_SVE_PRED
17752 || !d->perm[0].is_constant (&location)
17753 || !d->perm.series_p (0, 1, location, 1))
17754 return false;
17755
17756 /* Success! */
17757 if (d->testing_p)
17758 return true;
17759
17760 /* The case where (location == 0) is a no-op for both big- and little-endian,
17761 and is removed by the mid-end at optimization levels -O1 and higher.
17762
17763 We don't need a big-endian lane correction for SVE; see the comment
17764 at the head of aarch64-sve.md for details. */
17765 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17766 {
17767 /* After setup, we want the high elements of the first vector (stored
17768 at the LSB end of the register), and the low elements of the second
17769 vector (stored at the MSB end of the register). So swap. */
17770 std::swap (d->op0, d->op1);
17771 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17772 to_constant () is safe since this is restricted to Advanced SIMD
17773 vectors. */
17774 location = d->perm.length ().to_constant () - location;
17775 }
17776
17777 offset = GEN_INT (location);
17778 emit_set_insn (d->target,
17779 gen_rtx_UNSPEC (d->vmode,
17780 gen_rtvec (3, d->op0, d->op1, offset),
17781 UNSPEC_EXT));
17782 return true;
17783 }
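
/* For illustration, a little-endian V8QImode permutation with indices
   { 3, 4, 5, 6, 7, 8, 9, 10 } is matched here with LOCATION == 3 and
   becomes an EXT with an immediate of 3: the top five bytes of the
   first input followed by the bottom three bytes of the second.  */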
17784
17785 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17786 within each 64-bit, 32-bit or 16-bit granule. */
17787
17788 static bool
17789 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17790 {
17791 HOST_WIDE_INT diff;
17792 unsigned int i, size, unspec;
17793 machine_mode pred_mode;
17794
17795 if (d->vec_flags == VEC_SVE_PRED
17796 || !d->one_vector_p
17797 || !d->perm[0].is_constant (&diff))
17798 return false;
17799
17800 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17801 if (size == 8)
17802 {
17803 unspec = UNSPEC_REV64;
17804 pred_mode = VNx2BImode;
17805 }
17806 else if (size == 4)
17807 {
17808 unspec = UNSPEC_REV32;
17809 pred_mode = VNx4BImode;
17810 }
17811 else if (size == 2)
17812 {
17813 unspec = UNSPEC_REV16;
17814 pred_mode = VNx8BImode;
17815 }
17816 else
17817 return false;
17818
17819 unsigned int step = diff + 1;
17820 for (i = 0; i < step; ++i)
17821 if (!d->perm.series_p (i, step, diff - i, step))
17822 return false;
17823
17824 /* Success! */
17825 if (d->testing_p)
17826 return true;
17827
17828 if (d->vec_flags == VEC_SVE_DATA)
17829 {
17830 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
17831 rtx target = gen_reg_rtx (int_mode);
17832 if (BYTES_BIG_ENDIAN)
17833 /* The act of taking a subreg between INT_MODE and d->vmode
17834 is itself a reversing operation on big-endian targets;
17835 see the comment at the head of aarch64-sve.md for details.
17836 First reinterpret OP0 as INT_MODE without using a subreg
17837 and without changing the contents. */
17838 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
17839 else
17840 {
17841 /* For SVE we use REV[BHW] unspecs derived from the element size
17842 of d->vmode and vector modes whose elements have SIZE bytes.
17843 This ensures that the vector modes match the predicate modes. */
17844 int unspec = aarch64_sve_rev_unspec (d->vmode);
17845 rtx pred = aarch64_ptrue_reg (pred_mode);
17846 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
17847 gen_lowpart (int_mode, d->op0)));
17848 }
17849 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17850 return true;
17851 }
17852 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17853 emit_set_insn (d->target, src);
17854 return true;
17855 }
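
/* For illustration, a V8HImode permutation with indices
   { 1, 0, 3, 2, 5, 4, 7, 6 } has DIFF == 1 and SIZE == 4, so it is
   matched as REV32: each pair of 16-bit elements is reversed within
   its containing 32-bit granule.  */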
17856
17857 /* Recognize patterns for the REV insn, which reverses elements within
17858 a full vector. */
17859
17860 static bool
17861 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17862 {
17863 poly_uint64 nelt = d->perm.length ();
17864
17865 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17866 return false;
17867
17868 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17869 return false;
17870
17871 /* Success! */
17872 if (d->testing_p)
17873 return true;
17874
17875 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17876 emit_set_insn (d->target, src);
17877 return true;
17878 }
17879
17880 static bool
17881 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17882 {
17883 rtx out = d->target;
17884 rtx in0;
17885 HOST_WIDE_INT elt;
17886 machine_mode vmode = d->vmode;
17887 rtx lane;
17888
17889 if (d->vec_flags == VEC_SVE_PRED
17890 || d->perm.encoding ().encoded_nelts () != 1
17891 || !d->perm[0].is_constant (&elt))
17892 return false;
17893
17894 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17895 return false;
17896
17897 /* Success! */
17898 if (d->testing_p)
17899 return true;
17900
17901 /* The generic preparation in aarch64_expand_vec_perm_const_1
17902 swaps the operand order and the permute indices if it finds
17903 d->perm[0] to be in the second operand. Thus, we can always
17904 use d->op0 and need not do any extra arithmetic to get the
17905 correct lane number. */
17906 in0 = d->op0;
17907 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17908
17909 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17910 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17911 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17912 return true;
17913 }
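
/* For illustration, a V4SImode permutation whose encoding is the
   single repeated index { 3, 3, 3, 3 } is matched here and emitted
   as a DUP of lane 3 of the first input.  */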
17914
17915 static bool
17916 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17917 {
17918 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17919 machine_mode vmode = d->vmode;
17920
17921 /* Make sure that the indices are constant. */
17922 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17923 for (unsigned int i = 0; i < encoded_nelts; ++i)
17924 if (!d->perm[i].is_constant ())
17925 return false;
17926
17927 if (d->testing_p)
17928 return true;
17929
17930 /* Generic code will try constant permutation twice. Once with the
17931 original mode and again with the elements lowered to QImode.
17932 So wait and don't do the selector expansion ourselves. */
17933 if (vmode != V8QImode && vmode != V16QImode)
17934 return false;
17935
17936 /* to_constant is safe since this routine is specific to Advanced SIMD
17937 vectors. */
17938 unsigned int nelt = d->perm.length ().to_constant ();
17939 for (unsigned int i = 0; i < nelt; ++i)
17940 /* If big-endian and two vectors we end up with a weird mixed-endian
17941 mode on NEON. Reverse the index within each word but not the word
17942 itself. to_constant is safe because we checked is_constant above. */
17943 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17944 ? d->perm[i].to_constant () ^ (nelt - 1)
17945 : d->perm[i].to_constant ());
17946
17947 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17948 sel = force_reg (vmode, sel);
17949
17950 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17951 return true;
17952 }
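
/* For illustration, assuming a big-endian two-vector V8QImode
   permutation: an index of 2 is rewritten as 2 ^ 7 == 5 before being
   placed in the TBL selector, reversing the byte position within its
   64-bit word while keeping the choice of source vector unchanged.  */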
17953
17954 /* Try to implement D using an SVE TBL instruction. */
17955
17956 static bool
17957 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17958 {
17959 unsigned HOST_WIDE_INT nelt;
17960
17961 /* Permuting two variable-length vectors could overflow the
17962 index range. */
17963 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17964 return false;
17965
17966 if (d->testing_p)
17967 return true;
17968
17969 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17970 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17971 if (d->one_vector_p)
17972 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17973 else
17974 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17975 return true;
17976 }
17977
17978 static bool
17979 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17980 {
17981 /* The pattern matching functions above are written to look for a small
17982 number to begin the sequence (0, 1, N/2). If we begin with an index
17983 from the second operand, we can swap the operands. */
17984 poly_int64 nelt = d->perm.length ();
17985 if (known_ge (d->perm[0], nelt))
17986 {
17987 d->perm.rotate_inputs (1);
17988 std::swap (d->op0, d->op1);
17989 }
17990
17991 if ((d->vec_flags == VEC_ADVSIMD
17992 || d->vec_flags == VEC_SVE_DATA
17993 || d->vec_flags == VEC_SVE_PRED)
17994 && known_gt (nelt, 1))
17995 {
17996 if (aarch64_evpc_rev_local (d))
17997 return true;
17998 else if (aarch64_evpc_rev_global (d))
17999 return true;
18000 else if (aarch64_evpc_ext (d))
18001 return true;
18002 else if (aarch64_evpc_dup (d))
18003 return true;
18004 else if (aarch64_evpc_zip (d))
18005 return true;
18006 else if (aarch64_evpc_uzp (d))
18007 return true;
18008 else if (aarch64_evpc_trn (d))
18009 return true;
18010 if (d->vec_flags == VEC_SVE_DATA)
18011 return aarch64_evpc_sve_tbl (d);
18012 else if (d->vec_flags == VEC_ADVSIMD)
18013 return aarch64_evpc_tbl (d);
18014 }
18015 return false;
18016 }
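
/* For illustration of the preparation step above, with NELT == 4 a
   permutation of { 6, 2, 7, 3 } starts with an index from the second
   input, so the inputs are swapped and the indices are rotated to
   { 2, 6, 3, 7 }, which the ZIP2 matcher can then recognize.  */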
18017
18018 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
18019
18020 static bool
18021 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
18022 rtx op1, const vec_perm_indices &sel)
18023 {
18024 struct expand_vec_perm_d d;
18025
18026 /* Check whether the mask can be applied to a single vector. */
18027 if (sel.ninputs () == 1
18028 || (op0 && rtx_equal_p (op0, op1)))
18029 d.one_vector_p = true;
18030 else if (sel.all_from_input_p (0))
18031 {
18032 d.one_vector_p = true;
18033 op1 = op0;
18034 }
18035 else if (sel.all_from_input_p (1))
18036 {
18037 d.one_vector_p = true;
18038 op0 = op1;
18039 }
18040 else
18041 d.one_vector_p = false;
18042
18043 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
18044 sel.nelts_per_input ());
18045 d.vmode = vmode;
18046 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
18047 d.target = target;
18048 d.op0 = op0;
18049 d.op1 = op1;
18050 d.testing_p = !target;
18051
18052 if (!d.testing_p)
18053 return aarch64_expand_vec_perm_const_1 (&d);
18054
18055 rtx_insn *last = get_last_insn ();
18056 bool ret = aarch64_expand_vec_perm_const_1 (&d);
18057 gcc_assert (last == get_last_insn ());
18058
18059 return ret;
18060 }
18061
18062 /* Generate a byte permute mask for a register of mode MODE,
18063 which has NUNITS units. */
18064
18065 rtx
18066 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
18067 {
18068 /* We have to reverse each vector because we don't have
18069 a permuted load that can reverse-load according to ABI rules. */
18070 rtx mask;
18071 rtvec v = rtvec_alloc (16);
18072 unsigned int i, j;
18073 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
18074
18075 gcc_assert (BYTES_BIG_ENDIAN);
18076 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
18077
18078 for (i = 0; i < nunits; i++)
18079 for (j = 0; j < usize; j++)
18080 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
18081 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
18082 return force_reg (V16QImode, mask);
18083 }
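
/* For illustration, for V8HImode (NUNITS == 8, unit size 2) the mask
   built above is the byte sequence { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,
   11, 10, 13, 12, 15, 14 }: the bytes of each 16-bit element are
   swapped while the element order itself is preserved.  */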
18084
18085 /* Expand an SVE integer comparison using the SVE equivalent of:
18086
18087 (set TARGET (CODE OP0 OP1)). */
18088
18089 void
18090 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
18091 {
18092 machine_mode pred_mode = GET_MODE (target);
18093 machine_mode data_mode = GET_MODE (op0);
18094 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
18095 op0, op1);
18096 if (!rtx_equal_p (target, res))
18097 emit_move_insn (target, res);
18098 }
18099
18100 /* Return the UNSPEC_COND_* code for comparison CODE. */
18101
18102 static unsigned int
18103 aarch64_unspec_cond_code (rtx_code code)
18104 {
18105 switch (code)
18106 {
18107 case NE:
18108 return UNSPEC_COND_FCMNE;
18109 case EQ:
18110 return UNSPEC_COND_FCMEQ;
18111 case LT:
18112 return UNSPEC_COND_FCMLT;
18113 case GT:
18114 return UNSPEC_COND_FCMGT;
18115 case LE:
18116 return UNSPEC_COND_FCMLE;
18117 case GE:
18118 return UNSPEC_COND_FCMGE;
18119 case UNORDERED:
18120 return UNSPEC_COND_FCMUO;
18121 default:
18122 gcc_unreachable ();
18123 }
18124 }
18125
18126 /* Emit:
18127
18128 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18129
18130 where <X> is the operation associated with comparison CODE.
18131 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18132
18133 static void
18134 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
18135 bool known_ptrue_p, rtx op0, rtx op1)
18136 {
18137 rtx flag = gen_int_mode (known_ptrue_p, SImode);
18138 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
18139 gen_rtvec (4, pred, flag, op0, op1),
18140 aarch64_unspec_cond_code (code));
18141 emit_set_insn (target, unspec);
18142 }
18143
18144 /* Emit the SVE equivalent of:
18145
18146 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18147 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18148 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18149
18150 where <Xi> is the operation associated with comparison CODEi.
18151 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18152
18153 static void
18154 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
18155 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
18156 {
18157 machine_mode pred_mode = GET_MODE (pred);
18158 rtx tmp1 = gen_reg_rtx (pred_mode);
18159 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
18160 rtx tmp2 = gen_reg_rtx (pred_mode);
18161 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
18162 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
18163 }
18164
18165 /* Emit the SVE equivalent of:
18166
18167 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18168 (set TARGET (not TMP))
18169
18170 where <X> is the operation associated with comparison CODE.
18171 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18172
18173 static void
18174 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
18175 bool known_ptrue_p, rtx op0, rtx op1)
18176 {
18177 machine_mode pred_mode = GET_MODE (pred);
18178 rtx tmp = gen_reg_rtx (pred_mode);
18179 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
18180 aarch64_emit_unop (target, one_cmpl_optab, tmp);
18181 }
18182
18183 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18184
18185 (set TARGET (CODE OP0 OP1))
18186
18187 If CAN_INVERT_P is true, the caller can also handle inverted results;
18188 return true if the result is in fact inverted. */
18189
18190 bool
18191 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18192 rtx op0, rtx op1, bool can_invert_p)
18193 {
18194 machine_mode pred_mode = GET_MODE (target);
18195 machine_mode data_mode = GET_MODE (op0);
18196
18197 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18198 switch (code)
18199 {
18200 case UNORDERED:
18201 /* UNORDERED has no immediate form. */
18202 op1 = force_reg (data_mode, op1);
18203 /* fall through */
18204 case LT:
18205 case LE:
18206 case GT:
18207 case GE:
18208 case EQ:
18209 case NE:
18210 {
18211 /* There is native support for the comparison. */
18212 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18213 return false;
18214 }
18215
18216 case LTGT:
18217 /* This is a trapping operation (LT or GT). */
18218 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18219 return false;
18220
18221 case UNEQ:
18222 if (!flag_trapping_math)
18223 {
18224 /* This would trap for signaling NaNs. */
18225 op1 = force_reg (data_mode, op1);
18226 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18227 ptrue, true, op0, op1);
18228 return false;
18229 }
18230 /* fall through */
18231 case UNLT:
18232 case UNLE:
18233 case UNGT:
18234 case UNGE:
18235 if (flag_trapping_math)
18236 {
18237 /* Work out which elements are ordered. */
18238 rtx ordered = gen_reg_rtx (pred_mode);
18239 op1 = force_reg (data_mode, op1);
18240 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18241 ptrue, true, op0, op1);
18242
18243 /* Test the opposite condition for the ordered elements,
18244 then invert the result. */
18245 if (code == UNEQ)
18246 code = NE;
18247 else
18248 code = reverse_condition_maybe_unordered (code);
18249 if (can_invert_p)
18250 {
18251 aarch64_emit_sve_fp_cond (target, code,
18252 ordered, false, op0, op1);
18253 return true;
18254 }
18255 aarch64_emit_sve_invert_fp_cond (target, code,
18256 ordered, false, op0, op1);
18257 return false;
18258 }
18259 break;
18260
18261 case ORDERED:
18262 /* ORDERED has no immediate form. */
18263 op1 = force_reg (data_mode, op1);
18264 break;
18265
18266 default:
18267 gcc_unreachable ();
18268 }
18269
18270 /* There is native support for the inverse comparison. */
18271 code = reverse_condition_maybe_unordered (code);
18272 if (can_invert_p)
18273 {
18274 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18275 return true;
18276 }
18277 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18278 return false;
18279 }
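
/* As an illustrative sketch of the trapping-math path above: UNGE is
   handled by first computing ORDERED as the inverse of an FCMUO under
   a PTRUE predicate, then testing the opposite condition (LT) under
   the ORDERED predicate, and finally inverting that result (or simply
   reporting it as inverted when CAN_INVERT_P allows).  */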
18280
18281 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18282 of the data being selected and CMP_MODE is the mode of the values being
18283 compared. */
18284
18285 void
18286 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18287 rtx *ops)
18288 {
18289 machine_mode pred_mode
18290 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18291 GET_MODE_SIZE (cmp_mode)).require ();
18292 rtx pred = gen_reg_rtx (pred_mode);
18293 if (FLOAT_MODE_P (cmp_mode))
18294 {
18295 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18296 ops[4], ops[5], true))
18297 std::swap (ops[1], ops[2]);
18298 }
18299 else
18300 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18301
18302 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18303 ops[1] = force_reg (data_mode, ops[1]);
18304 /* The "false" value can only be zero if the "true" value is a constant. */
18305 if (register_operand (ops[1], data_mode)
18306 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18307 ops[2] = force_reg (data_mode, ops[2]);
18308
18309 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18310 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18311 }
18312
18313 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18314 true. However, due to issues with register allocation it is preferable
18315 to avoid tying integer scalar and FP scalar modes. Executing integer
18316 operations in general registers is better than treating them as scalar
18317 vector operations. This reduces latency and avoids redundant int<->FP
18318 moves. So tie modes if they are either the same class, or vector modes
18319 with other vector modes, vector structs or any scalar mode. */
18320
18321 static bool
18322 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18323 {
18324 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18325 return true;
18326
18327 /* We specifically want to allow elements of "structure" modes to
18328 be tieable to the structure. This more general condition allows
18329 other rarer situations too. The reason we don't extend this to
18330 predicate modes is that there are no predicate structure modes
18331 nor any specific instructions for extracting part of a predicate
18332 register. */
18333 if (aarch64_vector_data_mode_p (mode1)
18334 && aarch64_vector_data_mode_p (mode2))
18335 return true;
18336
18337 /* Also allow any scalar modes with vectors. */
18338 if (aarch64_vector_mode_supported_p (mode1)
18339 || aarch64_vector_mode_supported_p (mode2))
18340 return true;
18341
18342 return false;
18343 }
18344
18345 /* Return a new RTX holding the result of moving POINTER forward by
18346 AMOUNT bytes. */
18347
18348 static rtx
18349 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18350 {
18351 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18352
18353 return adjust_automodify_address (pointer, GET_MODE (pointer),
18354 next, amount);
18355 }
18356
18357 /* Return a new RTX holding the result of moving POINTER forward by the
18358 size of the mode it points to. */
18359
18360 static rtx
18361 aarch64_progress_pointer (rtx pointer)
18362 {
18363 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18364 }
18365
18366 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18367 MODE bytes. */
18368
18369 static void
18370 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18371 machine_mode mode)
18372 {
18373 rtx reg = gen_reg_rtx (mode);
18374
18375 /* "Cast" the pointers to the correct mode. */
18376 *src = adjust_address (*src, mode, 0);
18377 *dst = adjust_address (*dst, mode, 0);
18378 /* Emit the memcpy. */
18379 emit_move_insn (reg, *src);
18380 emit_move_insn (*dst, reg);
18381 /* Move the pointers forward. */
18382 *src = aarch64_progress_pointer (*src);
18383 *dst = aarch64_progress_pointer (*dst);
18384 }
18385
18386 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18387 we succeed, otherwise return false. */
18388
18389 bool
18390 aarch64_expand_cpymem (rtx *operands)
18391 {
18392 int n, mode_bits;
18393 rtx dst = operands[0];
18394 rtx src = operands[1];
18395 rtx base;
18396 machine_mode cur_mode = BLKmode, next_mode;
18397 bool speed_p = !optimize_function_for_size_p (cfun);
18398
18399 /* When optimizing for size, give a better estimate of the length of a
18400 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18401 will always require an even number of instructions to do now, and each
18402 operation requires both a load and a store, so divide the max number by 2. */
18403 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18404
18405 /* We can't do anything smart if the amount to copy is not constant. */
18406 if (!CONST_INT_P (operands[2]))
18407 return false;
18408
18409 n = INTVAL (operands[2]);
18410
18411 /* Try to keep the number of instructions low. For all cases we will do at
18412 most two moves for the residual amount, since we'll always overlap the
18413 remainder. */
18414 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18415 return false;
18416
18417 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18418 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18419
18420 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18421 src = adjust_automodify_address (src, VOIDmode, base, 0);
18422
18423 /* Convert n to bits to make the rest of the code simpler. */
18424 n = n * BITS_PER_UNIT;
18425
18426 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18427 larger than TImode, but we should not use them for loads/stores here. */
18428 const int copy_limit = GET_MODE_BITSIZE (TImode);
18429
18430 while (n > 0)
18431 {
18432 /* Find the largest mode in which to do the copy without over-reading
18433 or over-writing. */
18434 opt_scalar_int_mode mode_iter;
18435 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18436 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18437 cur_mode = mode_iter.require ();
18438
18439 gcc_assert (cur_mode != BLKmode);
18440
18441 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18442 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18443
18444 n -= mode_bits;
18445
18446 /* Do certain trailing copies as overlapping if it's going to be
18447 cheaper, i.e. fewer instructions. For instance, for a 15-byte
18448 copy it's more efficient to do two overlapping 8-byte copies than
18449 8 + 6 + 1. */
18450 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18451 {
18452 next_mode = smallest_mode_for_size (n, MODE_INT);
18453 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18454 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18455 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18456 n = n_bits;
18457 }
18458 }
18459
18460 return true;
18461 }
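
/* As an illustrative walk-through of the loop above, assuming a
   15-byte constant copy: the first iteration picks DImode (64 bits,
   the largest integer mode not exceeding the remaining 120 bits and
   the TImode copy limit) and copies 8 bytes, leaving 56 bits.  Since
   the remainder is at most 8 bytes, both pointers are then moved back
   by one byte and a second, overlapping 8-byte DImode copy finishes
   the job -- the two overlapping copies mentioned in the comment
   above.  */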
18462
18463 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18464 SImode stores. Handle the case when the constant has identical
18465 bottom and top halves. This is beneficial when the two stores can be
18466 merged into an STP and we avoid synthesising potentially expensive
18467 immediates twice. Return true if such a split is possible. */
18468
18469 bool
18470 aarch64_split_dimode_const_store (rtx dst, rtx src)
18471 {
18472 rtx lo = gen_lowpart (SImode, src);
18473 rtx hi = gen_highpart_mode (SImode, DImode, src);
18474
18475 bool size_p = optimize_function_for_size_p (cfun);
18476
18477 if (!rtx_equal_p (lo, hi))
18478 return false;
18479
18480 unsigned int orig_cost
18481 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18482 unsigned int lo_cost
18483 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18484
18485 /* We want to transform:
18486 MOV x1, 49370
18487 MOVK x1, 0x140, lsl 16
18488 MOVK x1, 0xc0da, lsl 32
18489 MOVK x1, 0x140, lsl 48
18490 STR x1, [x0]
18491 into:
18492 MOV w1, 49370
18493 MOVK w1, 0x140, lsl 16
18494 STP w1, w1, [x0]
18495 So we want to perform this only when we save two instructions
18496 or more. When optimizing for size, however, accept any code size
18497 savings we can. */
18498 if (size_p && orig_cost <= lo_cost)
18499 return false;
18500
18501 if (!size_p
18502 && (orig_cost <= lo_cost + 1))
18503 return false;
18504
18505 rtx mem_lo = adjust_address (dst, SImode, 0);
18506 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18507 return false;
18508
18509 rtx tmp_reg = gen_reg_rtx (SImode);
18510 aarch64_expand_mov_immediate (tmp_reg, lo);
18511 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18512 /* Don't emit an explicit store pair as this may not be always profitable.
18513 Let the sched-fusion logic decide whether to merge them. */
18514 emit_move_insn (mem_lo, tmp_reg);
18515 emit_move_insn (mem_hi, tmp_reg);
18516
18517 return true;
18518 }
18519
18520 /* Generate RTL for a conditional branch with rtx comparison CODE in
18521 mode CC_MODE. The destination of the unlikely conditional branch
18522 is LABEL_REF. */
18523
18524 void
18525 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18526 rtx label_ref)
18527 {
18528 rtx x;
18529 x = gen_rtx_fmt_ee (code, VOIDmode,
18530 gen_rtx_REG (cc_mode, CC_REGNUM),
18531 const0_rtx);
18532
18533 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18534 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18535 pc_rtx);
18536 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18537 }
18538
18539 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18540
18541 OP1 represents the TImode destination operand 1
18542 OP2 represents the TImode destination operand 2
18543 LOW_DEST represents the low half (DImode) of TImode operand 0
18544 LOW_IN1 represents the low half (DImode) of TImode operand 1
18545 LOW_IN2 represents the low half (DImode) of TImode operand 2
18546 HIGH_DEST represents the high half (DImode) of TImode operand 0
18547 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18548 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18549
18550 void
18551 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18552 rtx *low_in1, rtx *low_in2,
18553 rtx *high_dest, rtx *high_in1,
18554 rtx *high_in2)
18555 {
18556 *low_dest = gen_reg_rtx (DImode);
18557 *low_in1 = gen_lowpart (DImode, op1);
18558 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18559 subreg_lowpart_offset (DImode, TImode));
18560 *high_dest = gen_reg_rtx (DImode);
18561 *high_in1 = gen_highpart (DImode, op1);
18562 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18563 subreg_highpart_offset (DImode, TImode));
18564 }
18565
18566 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18567
18568 This function differs from 'aarch64_addti_scratch_regs' in that
18569 OP1 can be an immediate constant (zero). We must call
18570 subreg_highpart_offset with DImode and TImode arguments, otherwise
18571 VOIDmode will be used for the const_int which generates an internal
18572 error from subreg_size_highpart_offset which does not expect a size of zero.
18573
18574 OP1 represents the TImode destination operand 1
18575 OP2 represents the TImode destination operand 2
18576 LOW_DEST represents the low half (DImode) of TImode operand 0
18577 LOW_IN1 represents the low half (DImode) of TImode operand 1
18578 LOW_IN2 represents the low half (DImode) of TImode operand 2
18579 HIGH_DEST represents the high half (DImode) of TImode operand 0
18580 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18581 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18582
18583
18584 void
18585 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18586 rtx *low_in1, rtx *low_in2,
18587 rtx *high_dest, rtx *high_in1,
18588 rtx *high_in2)
18589 {
18590 *low_dest = gen_reg_rtx (DImode);
18591 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18592 subreg_lowpart_offset (DImode, TImode));
18593
18594 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18595 subreg_lowpart_offset (DImode, TImode));
18596 *high_dest = gen_reg_rtx (DImode);
18597
18598 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18599 subreg_highpart_offset (DImode, TImode));
18600 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18601 subreg_highpart_offset (DImode, TImode));
18602 }
18603
18604 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18605
18606 OP0 represents the TImode destination operand 0
18607 LOW_DEST represents the low half (DImode) of TImode operand 0
18608 LOW_IN1 represents the low half (DImode) of TImode operand 1
18609 LOW_IN2 represents the low half (DImode) of TImode operand 2
18610 HIGH_DEST represents the high half (DImode) of TImode operand 0
18611 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18612 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18613 UNSIGNED_P is true if the operation is being performed on unsigned
18614 values. */
18615 void
18616 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18617 rtx low_in2, rtx high_dest, rtx high_in1,
18618 rtx high_in2, bool unsigned_p)
18619 {
18620 if (low_in2 == const0_rtx)
18621 {
18622 low_dest = low_in1;
18623 high_in2 = force_reg (DImode, high_in2);
18624 if (unsigned_p)
18625 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18626 else
18627 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18628 }
18629 else
18630 {
18631 if (CONST_INT_P (low_in2))
18632 {
18633 high_in2 = force_reg (DImode, high_in2);
18634 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18635 GEN_INT (-INTVAL (low_in2))));
18636 }
18637 else
18638 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18639
18640 if (unsigned_p)
18641 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18642 else
18643 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18644 }
18645
18646 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18647 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18648
18649 }
18650
18651 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18652
18653 static unsigned HOST_WIDE_INT
18654 aarch64_asan_shadow_offset (void)
18655 {
18656 if (TARGET_ILP32)
18657 return (HOST_WIDE_INT_1 << 29);
18658 else
18659 return (HOST_WIDE_INT_1 << 36);
18660 }
18661
18662 static rtx
18663 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18664 int code, tree treeop0, tree treeop1)
18665 {
18666 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18667 rtx op0, op1;
18668 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18669 insn_code icode;
18670 struct expand_operand ops[4];
18671
18672 start_sequence ();
18673 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18674
18675 op_mode = GET_MODE (op0);
18676 if (op_mode == VOIDmode)
18677 op_mode = GET_MODE (op1);
18678
18679 switch (op_mode)
18680 {
18681 case E_QImode:
18682 case E_HImode:
18683 case E_SImode:
18684 cmp_mode = SImode;
18685 icode = CODE_FOR_cmpsi;
18686 break;
18687
18688 case E_DImode:
18689 cmp_mode = DImode;
18690 icode = CODE_FOR_cmpdi;
18691 break;
18692
18693 case E_SFmode:
18694 cmp_mode = SFmode;
18695 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18696 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18697 break;
18698
18699 case E_DFmode:
18700 cmp_mode = DFmode;
18701 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18702 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18703 break;
18704
18705 default:
18706 end_sequence ();
18707 return NULL_RTX;
18708 }
18709
18710 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18711 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18712 if (!op0 || !op1)
18713 {
18714 end_sequence ();
18715 return NULL_RTX;
18716 }
18717 *prep_seq = get_insns ();
18718 end_sequence ();
18719
18720 create_fixed_operand (&ops[0], op0);
18721 create_fixed_operand (&ops[1], op1);
18722
18723 start_sequence ();
18724 if (!maybe_expand_insn (icode, 2, ops))
18725 {
18726 end_sequence ();
18727 return NULL_RTX;
18728 }
18729 *gen_seq = get_insns ();
18730 end_sequence ();
18731
18732 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18733 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18734 }
18735
18736 static rtx
18737 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18738 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18739 {
18740 rtx op0, op1, target;
18741 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18742 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18743 insn_code icode;
18744 struct expand_operand ops[6];
18745 int aarch64_cond;
18746
18747 push_to_sequence (*prep_seq);
18748 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18749
18750 op_mode = GET_MODE (op0);
18751 if (op_mode == VOIDmode)
18752 op_mode = GET_MODE (op1);
18753
18754 switch (op_mode)
18755 {
18756 case E_QImode:
18757 case E_HImode:
18758 case E_SImode:
18759 cmp_mode = SImode;
18760 icode = CODE_FOR_ccmpsi;
18761 break;
18762
18763 case E_DImode:
18764 cmp_mode = DImode;
18765 icode = CODE_FOR_ccmpdi;
18766 break;
18767
18768 case E_SFmode:
18769 cmp_mode = SFmode;
18770 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18771 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18772 break;
18773
18774 case E_DFmode:
18775 cmp_mode = DFmode;
18776 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18777 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18778 break;
18779
18780 default:
18781 end_sequence ();
18782 return NULL_RTX;
18783 }
18784
18785 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18786 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18787 if (!op0 || !op1)
18788 {
18789 end_sequence ();
18790 return NULL_RTX;
18791 }
18792 *prep_seq = get_insns ();
18793 end_sequence ();
18794
18795 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18796 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18797
18798 if (bit_code != AND)
18799 {
18800 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18801 GET_MODE (XEXP (prev, 0))),
18802 VOIDmode, XEXP (prev, 0), const0_rtx);
18803 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18804 }
18805
18806 create_fixed_operand (&ops[0], XEXP (prev, 0));
18807 create_fixed_operand (&ops[1], target);
18808 create_fixed_operand (&ops[2], op0);
18809 create_fixed_operand (&ops[3], op1);
18810 create_fixed_operand (&ops[4], prev);
18811 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18812
18813 push_to_sequence (*gen_seq);
18814 if (!maybe_expand_insn (icode, 6, ops))
18815 {
18816 end_sequence ();
18817 return NULL_RTX;
18818 }
18819
18820 *gen_seq = get_insns ();
18821 end_sequence ();
18822
18823 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18824 }
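
/* As an illustrative sketch of how the two hooks above combine,
   assuming a source condition such as "a < 0 && b == 17" on DImode
   values: aarch64_gen_ccmp_first emits the initial "cmp x0, 0" and
   aarch64_gen_ccmp_next then emits a conditional compare along the
   lines of "ccmp x1, 17, #<nzcv>, lt", so the final branch can test a
   single condition in the flags register instead of using two
   separate compare-and-branch sequences.  */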
18825
18826 #undef TARGET_GEN_CCMP_FIRST
18827 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18828
18829 #undef TARGET_GEN_CCMP_NEXT
18830 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18831
18832 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18833 instruction fusion of some sort. */
18834
18835 static bool
18836 aarch64_macro_fusion_p (void)
18837 {
18838 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18839 }
18840
18841
18842 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18843 should be kept together during scheduling. */
18844
18845 static bool
18846 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18847 {
18848 rtx set_dest;
18849 rtx prev_set = single_set (prev);
18850 rtx curr_set = single_set (curr);
18851 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18852 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18853
18854 if (!aarch64_macro_fusion_p ())
18855 return false;
18856
18857 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18858 {
18859 /* We are trying to match:
18860 prev (mov) == (set (reg r0) (const_int imm16))
18861 curr (movk) == (set (zero_extract (reg r0)
18862 (const_int 16)
18863 (const_int 16))
18864 (const_int imm16_1)) */
18865
18866 set_dest = SET_DEST (curr_set);
18867
18868 if (GET_CODE (set_dest) == ZERO_EXTRACT
18869 && CONST_INT_P (SET_SRC (curr_set))
18870 && CONST_INT_P (SET_SRC (prev_set))
18871 && CONST_INT_P (XEXP (set_dest, 2))
18872 && INTVAL (XEXP (set_dest, 2)) == 16
18873 && REG_P (XEXP (set_dest, 0))
18874 && REG_P (SET_DEST (prev_set))
18875 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18876 {
18877 return true;
18878 }
18879 }
18880
18881 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18882 {
18883
18884 /* We're trying to match:
18885 prev (adrp) == (set (reg r1)
18886 (high (symbol_ref ("SYM"))))
18887 curr (add) == (set (reg r0)
18888 (lo_sum (reg r1)
18889 (symbol_ref ("SYM"))))
18890 Note that r0 need not necessarily be the same as r1, especially
18891 during pre-regalloc scheduling. */
18892
18893 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18894 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18895 {
18896 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18897 && REG_P (XEXP (SET_SRC (curr_set), 0))
18898 && REGNO (XEXP (SET_SRC (curr_set), 0))
18899 == REGNO (SET_DEST (prev_set))
18900 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18901 XEXP (SET_SRC (curr_set), 1)))
18902 return true;
18903 }
18904 }
18905
18906 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18907 {
18908
18909 /* We're trying to match:
18910 prev (movk) == (set (zero_extract (reg r0)
18911 (const_int 16)
18912 (const_int 32))
18913 (const_int imm16_1))
18914 curr (movk) == (set (zero_extract (reg r0)
18915 (const_int 16)
18916 (const_int 48))
18917 (const_int imm16_2)) */
18918
18919 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18920 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18921 && REG_P (XEXP (SET_DEST (prev_set), 0))
18922 && REG_P (XEXP (SET_DEST (curr_set), 0))
18923 && REGNO (XEXP (SET_DEST (prev_set), 0))
18924 == REGNO (XEXP (SET_DEST (curr_set), 0))
18925 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18926 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18927 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18928 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18929 && CONST_INT_P (SET_SRC (prev_set))
18930 && CONST_INT_P (SET_SRC (curr_set)))
18931 return true;
18932
18933 }
18934 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18935 {
18936 /* We're trying to match:
18937 prev (adrp) == (set (reg r0)
18938 (high (symbol_ref ("SYM"))))
18939 curr (ldr) == (set (reg r1)
18940 (mem (lo_sum (reg r0)
18941 (symbol_ref ("SYM")))))
18942 or
18943 curr (ldr) == (set (reg r1)
18944 (zero_extend (mem
18945 (lo_sum (reg r0)
18946 (symbol_ref ("SYM")))))) */
18947 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18948 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18949 {
18950 rtx curr_src = SET_SRC (curr_set);
18951
18952 if (GET_CODE (curr_src) == ZERO_EXTEND)
18953 curr_src = XEXP (curr_src, 0);
18954
18955 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18956 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18957 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18958 == REGNO (SET_DEST (prev_set))
18959 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18960 XEXP (SET_SRC (prev_set), 0)))
18961 return true;
18962 }
18963 }
18964
18965 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18966 && any_condjump_p (curr))
18967 {
18968 unsigned int condreg1, condreg2;
18969 rtx cc_reg_1;
18970 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18971 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18972
18973 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18974 && prev
18975 && modified_in_p (cc_reg_1, prev))
18976 {
18977 enum attr_type prev_type = get_attr_type (prev);
18978
18979 /* FIXME: this misses some instructions that ThunderX considers simple
18980 arithmetic; simple shifts are also missed here. */
18981 if (prev_type == TYPE_ALUS_SREG
18982 || prev_type == TYPE_ALUS_IMM
18983 || prev_type == TYPE_LOGICS_REG
18984 || prev_type == TYPE_LOGICS_IMM)
18985 return true;
18986 }
18987 }
18988
18989 if (prev_set
18990 && curr_set
18991 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18992 && any_condjump_p (curr))
18993 {
18994 /* We're trying to match:
18995 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18996 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18997 (const_int 0))
18998 (label_ref ("SYM"))
18999 (pc)) */
19000 if (SET_DEST (curr_set) == (pc_rtx)
19001 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
19002 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
19003 && REG_P (SET_DEST (prev_set))
19004 && REGNO (SET_DEST (prev_set))
19005 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
19006 {
19007 /* Fuse ALU operations followed by conditional branch instruction. */
19008 switch (get_attr_type (prev))
19009 {
19010 case TYPE_ALU_IMM:
19011 case TYPE_ALU_SREG:
19012 case TYPE_ADC_REG:
19013 case TYPE_ADC_IMM:
19014 case TYPE_ADCS_REG:
19015 case TYPE_ADCS_IMM:
19016 case TYPE_LOGIC_REG:
19017 case TYPE_LOGIC_IMM:
19018 case TYPE_CSEL:
19019 case TYPE_ADR:
19020 case TYPE_MOV_IMM:
19021 case TYPE_SHIFT_REG:
19022 case TYPE_SHIFT_IMM:
19023 case TYPE_BFM:
19024 case TYPE_RBIT:
19025 case TYPE_REV:
19026 case TYPE_EXTEND:
19027 return true;
19028
19029 default:;
19030 }
19031 }
19032 }
19033
19034 return false;
19035 }
19036
19037 /* Return true iff the instruction fusion described by OP is enabled. */
19038
19039 bool
19040 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
19041 {
19042 return (aarch64_tune_params.fusible_ops & op) != 0;
19043 }
19044
19045 /* If MEM is in the form of [base+offset], extract the two parts
19046 of the address and set them to BASE and OFFSET, otherwise return false
19047 after clearing BASE and OFFSET. */
19048
19049 bool
19050 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
19051 {
19052 rtx addr;
19053
19054 gcc_assert (MEM_P (mem));
19055
19056 addr = XEXP (mem, 0);
19057
19058 if (REG_P (addr))
19059 {
19060 *base = addr;
19061 *offset = const0_rtx;
19062 return true;
19063 }
19064
19065 if (GET_CODE (addr) == PLUS
19066 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
19067 {
19068 *base = XEXP (addr, 0);
19069 *offset = XEXP (addr, 1);
19070 return true;
19071 }
19072
19073 *base = NULL_RTX;
19074 *offset = NULL_RTX;
19075
19076 return false;
19077 }
19078
19079 /* Types for scheduling fusion. */
19080 enum sched_fusion_type
19081 {
19082 SCHED_FUSION_NONE = 0,
19083 SCHED_FUSION_LD_SIGN_EXTEND,
19084 SCHED_FUSION_LD_ZERO_EXTEND,
19085 SCHED_FUSION_LD,
19086 SCHED_FUSION_ST,
19087 SCHED_FUSION_NUM
19088 };
19089
19090 /* If INSN is a load or store of address in the form of [base+offset],
19091 extract the two parts and set them to BASE and OFFSET. Return the
19092 scheduling fusion type of this INSN. */
19093
19094 static enum sched_fusion_type
19095 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
19096 {
19097 rtx x, dest, src;
19098 enum sched_fusion_type fusion = SCHED_FUSION_LD;
19099
19100 gcc_assert (INSN_P (insn));
19101 x = PATTERN (insn);
19102 if (GET_CODE (x) != SET)
19103 return SCHED_FUSION_NONE;
19104
19105 src = SET_SRC (x);
19106 dest = SET_DEST (x);
19107
19108 machine_mode dest_mode = GET_MODE (dest);
19109
19110 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
19111 return SCHED_FUSION_NONE;
19112
19113 if (GET_CODE (src) == SIGN_EXTEND)
19114 {
19115 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
19116 src = XEXP (src, 0);
19117 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19118 return SCHED_FUSION_NONE;
19119 }
19120 else if (GET_CODE (src) == ZERO_EXTEND)
19121 {
19122 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
19123 src = XEXP (src, 0);
19124 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19125 return SCHED_FUSION_NONE;
19126 }
19127
19128 if (GET_CODE (src) == MEM && REG_P (dest))
19129 extract_base_offset_in_addr (src, base, offset);
19130 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
19131 {
19132 fusion = SCHED_FUSION_ST;
19133 extract_base_offset_in_addr (dest, base, offset);
19134 }
19135 else
19136 return SCHED_FUSION_NONE;
19137
19138 if (*base == NULL_RTX || *offset == NULL_RTX)
19139 fusion = SCHED_FUSION_NONE;
19140
19141 return fusion;
19142 }
19143
19144 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19145
19146 Currently we only support fusing ldr or str instructions, so FUSION_PRI
19147 and PRI are only calculated for these instructions. For other instructions,
19148 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
19149 other instruction types can be added by returning different priorities.
19150
19151 It's important that irrelevant instructions get the largest FUSION_PRI. */
19152
19153 static void
19154 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
19155 int *fusion_pri, int *pri)
19156 {
19157 int tmp, off_val;
19158 rtx base, offset;
19159 enum sched_fusion_type fusion;
19160
19161 gcc_assert (INSN_P (insn));
19162
19163 tmp = max_pri - 1;
19164 fusion = fusion_load_store (insn, &base, &offset);
19165 if (fusion == SCHED_FUSION_NONE)
19166 {
19167 *pri = tmp;
19168 *fusion_pri = tmp;
19169 return;
19170 }
19171
19172 /* Set FUSION_PRI according to fusion type and base register. */
19173 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
19174
19175 /* Calculate PRI. */
19176 tmp /= 2;
19177
19178 /* INSN with smaller offset goes first. */
19179 off_val = (int)(INTVAL (offset));
19180 if (off_val >= 0)
19181 tmp -= (off_val & 0xfffff);
19182 else
19183 tmp += ((- off_val) & 0xfffff);
19184
19185 *pri = tmp;
19186 return;
19187 }
19188
19189 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19190 Adjust priority of sha1h instructions so they are scheduled before
19191 other SHA1 instructions. */
19192
19193 static int
19194 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19195 {
19196 rtx x = PATTERN (insn);
19197
19198 if (GET_CODE (x) == SET)
19199 {
19200 x = SET_SRC (x);
19201
19202 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19203 return priority + 10;
19204 }
19205
19206 return priority;
19207 }
19208
19209 /* Given OPERANDS of consecutive load/store, check if we can merge
19210 them into ldp/stp. LOAD is true if they are load instructions.
19211 MODE is the mode of memory operands. */
19212
19213 bool
19214 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19215 machine_mode mode)
19216 {
19217 HOST_WIDE_INT offval_1, offval_2, msize;
19218 enum reg_class rclass_1, rclass_2;
19219 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19220
19221 if (load)
19222 {
19223 mem_1 = operands[1];
19224 mem_2 = operands[3];
19225 reg_1 = operands[0];
19226 reg_2 = operands[2];
19227 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19228 if (REGNO (reg_1) == REGNO (reg_2))
19229 return false;
19230 }
19231 else
19232 {
19233 mem_1 = operands[0];
19234 mem_2 = operands[2];
19235 reg_1 = operands[1];
19236 reg_2 = operands[3];
19237 }
19238
19239 /* The mems cannot be volatile. */
19240 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19241 return false;
19242
19243 /* If we have SImode and slow unaligned ldp,
19244 check that the alignment is at least 8 bytes. */
19245 if (mode == SImode
19246 && (aarch64_tune_params.extra_tuning_flags
19247 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19248 && !optimize_size
19249 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19250 return false;
19251
19252 /* Check if the addresses are in the form of [base+offset]. */
19253 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19254 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19255 return false;
19256 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19257 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19258 return false;
19259
19260 /* Check if the bases are same. */
19261 if (!rtx_equal_p (base_1, base_2))
19262 return false;
19263
19264 /* The operands must be of the same size. */
19265 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19266 GET_MODE_SIZE (GET_MODE (mem_2))));
19267
19268 offval_1 = INTVAL (offset_1);
19269 offval_2 = INTVAL (offset_2);
19270 /* We should only be trying this for fixed-sized modes. There is no
19271 SVE LDP/STP instruction. */
19272 msize = GET_MODE_SIZE (mode).to_constant ();
19273 /* Check if the offsets are consecutive. */
19274 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19275 return false;
19276
19277 /* Check if the addresses are clobbered by load. */
19278 if (load)
19279 {
19280 if (reg_mentioned_p (reg_1, mem_1))
19281 return false;
19282
19283 /* In increasing order, the last load can clobber the address. */
19284 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19285 return false;
19286 }
19287
19288 /* One of the memory accesses must be a mempair operand.
19289 If it is not the first one, they need to be swapped by the
19290 peephole. */
19291 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19292 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19293 return false;
19294
19295 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19296 rclass_1 = FP_REGS;
19297 else
19298 rclass_1 = GENERAL_REGS;
19299
19300 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19301 rclass_2 = FP_REGS;
19302 else
19303 rclass_2 = GENERAL_REGS;
19304
19305 /* Check if the registers are of same class. */
19306 if (rclass_1 != rclass_2)
19307 return false;
19308
19309 return true;
19310 }
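
/* For illustration, a pair such as

     ldr x0, [x2, 8]
     ldr x1, [x2, 16]

   satisfies the checks above (same base register, consecutive 8-byte
   offsets, distinct destination registers of the same class) and can
   later be emitted as "ldp x0, x1, [x2, 8]".  */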
19311
19312 /* Given OPERANDS of consecutive load/store that can be merged,
19313 swap them if they are not in ascending order. */
19314 void
19315 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19316 {
19317 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19318 HOST_WIDE_INT offval_1, offval_2;
19319
19320 if (load)
19321 {
19322 mem_1 = operands[1];
19323 mem_2 = operands[3];
19324 }
19325 else
19326 {
19327 mem_1 = operands[0];
19328 mem_2 = operands[2];
19329 }
19330
19331 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19332 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19333
19334 offval_1 = INTVAL (offset_1);
19335 offval_2 = INTVAL (offset_2);
19336
19337 if (offval_1 > offval_2)
19338 {
19339 /* Irrespective of whether this is a load or a store,
19340 we do the same swap. */
19341 std::swap (operands[0], operands[2]);
19342 std::swap (operands[1], operands[3]);
19343 }
19344 }
19345
19346 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of
19347 comparing the two values they point to. */
19348 int
19349 aarch64_host_wide_int_compare (const void *x, const void *y)
19350 {
19351 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19352 * ((const HOST_WIDE_INT *) y));
19353 }
19354
19355 /* Taking X and Y to be pairs of RTX, each consisting of a MEM rtx and
19356 a REG rtx, compare the offsets extracted from the addresses of the
19357 two MEMs.
19358
19359 Return:
19360
19361 1 iff offset (X) > offset (Y)
19362 0 iff offset (X) == offset (Y)
19363 -1 iff offset (X) < offset (Y) */
19364 int
19365 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19366 {
19367 const rtx * operands_1 = (const rtx *) x;
19368 const rtx * operands_2 = (const rtx *) y;
19369 rtx mem_1, mem_2, base, offset_1, offset_2;
19370
19371 if (MEM_P (operands_1[0]))
19372 mem_1 = operands_1[0];
19373 else
19374 mem_1 = operands_1[1];
19375
19376 if (MEM_P (operands_2[0]))
19377 mem_2 = operands_2[0];
19378 else
19379 mem_2 = operands_2[1];
19380
19381 /* Extract the offsets. */
19382 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19383 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19384
19385 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19386
19387 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19388 }
19389
19390 /* Given OPERANDS of consecutive load/store, check if we can merge
19391 them into ldp/stp by adjusting the offset. LOAD is true if they
19392 are load instructions. MODE is the mode of memory operands.
19393
19394 Given below consecutive stores:
19395
19396 str w1, [xb, 0x100]
19397 str w1, [xb, 0x104]
19398 str w1, [xb, 0x108]
19399 str w1, [xb, 0x10c]
19400
19401 Though the offsets are out of the range supported by stp, we can
19402 still pair them after adjusting the offset, like:
19403
19404 add scratch, xb, 0x100
19405 stp w1, w1, [scratch]
19406 stp w1, w1, [scratch, 0x8]
19407
19408 The peephole patterns detecting this opportunity should guarantee
19409 the scratch register is available. */
19410
19411 bool
19412 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19413 scalar_mode mode)
19414 {
19415 const int num_insns = 4;
19416 enum reg_class rclass;
19417 HOST_WIDE_INT offvals[num_insns], msize;
19418 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19419
19420 if (load)
19421 {
19422 for (int i = 0; i < num_insns; i++)
19423 {
19424 reg[i] = operands[2 * i];
19425 mem[i] = operands[2 * i + 1];
19426
19427 gcc_assert (REG_P (reg[i]));
19428 }
19429
19430 /* Do not attempt to merge the loads if the loads clobber each other. */
19431 for (int i = 0; i < 8; i += 2)
19432 for (int j = i + 2; j < 8; j += 2)
19433 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19434 return false;
19435 }
19436 else
19437 for (int i = 0; i < num_insns; i++)
19438 {
19439 mem[i] = operands[2 * i];
19440 reg[i] = operands[2 * i + 1];
19441 }
19442
19443 /* Skip if the memory operand is already valid for ldp/stp by itself. */
19444 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19445 return false;
19446
19447 for (int i = 0; i < num_insns; i++)
19448 {
19449 /* The mems cannot be volatile. */
19450 if (MEM_VOLATILE_P (mem[i]))
19451 return false;
19452
19453 /* Check if the addresses are in the form of [base+offset]. */
19454 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19455 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19456 return false;
19457 }
19458
19459 /* Check if the registers are of the same class. */
19460 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19461 ? FP_REGS : GENERAL_REGS;
19462
19463 for (int i = 1; i < num_insns; i++)
19464 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19465 {
19466 if (rclass != FP_REGS)
19467 return false;
19468 }
19469 else
19470 {
19471 if (rclass != GENERAL_REGS)
19472 return false;
19473 }
19474
19475 /* Only the last register in the order in which they occur
19476 may be clobbered by the load. */
19477 if (rclass == GENERAL_REGS && load)
19478 for (int i = 0; i < num_insns - 1; i++)
19479 if (reg_mentioned_p (reg[i], mem[i]))
19480 return false;
19481
19482 /* Check if the bases are the same. */
19483 for (int i = 0; i < num_insns - 1; i++)
19484 if (!rtx_equal_p (base[i], base[i + 1]))
19485 return false;
19486
19487 for (int i = 0; i < num_insns; i++)
19488 offvals[i] = INTVAL (offset[i]);
19489
19490 msize = GET_MODE_SIZE (mode);
19491
19492 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19493 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19494 aarch64_host_wide_int_compare);
19495
19496 if (!(offvals[1] == offvals[0] + msize
19497 && offvals[3] == offvals[2] + msize))
19498 return false;
19499
19500 /* Check that the offsets are within range of each other. The ldp/stp
19501 instructions have 7-bit signed scaled immediate offsets, so use 0x80. */
19502 if (offvals[2] - offvals[0] >= msize * 0x80)
19503 return false;
19504
19505 /* The offsets must be aligned with respect to each other. */
19506 if (offvals[0] % msize != offvals[2] % msize)
19507 return false;
19508
19509 /* If we have SImode and slow unaligned ldp,
19510 check that the alignment is at least 8 bytes. */
19511 if (mode == SImode
19512 && (aarch64_tune_params.extra_tuning_flags
19513 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19514 && !optimize_size
19515 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19516 return false;
19517
19518 return true;
19519 }
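
/* A worked example of the checks above, assuming four SImode stores at
   offsets 0x100, 0x104, 0x108 and 0x10c from the same base (as in the
   comment before this function): 0x100 lies outside the SImode ldp/stp
   immediate range of [-256, 252], so the accesses are not skipped as
   already pairable; the sorted offsets form two consecutive pairs; and
   0x108 - 0x100 == 8 is well below msize * 0x80 == 512, so the function
   returns true.  */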
19520
19521 /* Given OPERANDS of consecutive load/store, this function pairs them
19522 into LDP/STP after adjusting the offset. It depends on the fact
19523 that the operands can be sorted so the offsets are correct for STP.
19524 MODE is the mode of memory operands. CODE is the rtl operator
19525 which should be applied to all memory operands; it is SIGN_EXTEND,
19526 ZERO_EXTEND or UNKNOWN. */
19527
19528 bool
19529 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19530 scalar_mode mode, RTX_CODE code)
19531 {
19532 rtx base, offset_1, offset_3, t1, t2;
19533 rtx mem_1, mem_2, mem_3, mem_4;
19534 rtx temp_operands[8];
19535 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19536 stp_off_upper_limit, stp_off_lower_limit, msize;
19537
19538 /* We make changes on a copy as we may still bail out. */
19539 for (int i = 0; i < 8; i ++)
19540 temp_operands[i] = operands[i];
19541
19542 /* Sort the operands. */
19543 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19544
19545 /* Copy the memory operands so that if we have to bail for some
19546 reason the original addresses are unchanged. */
19547 if (load)
19548 {
19549 mem_1 = copy_rtx (temp_operands[1]);
19550 mem_2 = copy_rtx (temp_operands[3]);
19551 mem_3 = copy_rtx (temp_operands[5]);
19552 mem_4 = copy_rtx (temp_operands[7]);
19553 }
19554 else
19555 {
19556 mem_1 = copy_rtx (temp_operands[0]);
19557 mem_2 = copy_rtx (temp_operands[2]);
19558 mem_3 = copy_rtx (temp_operands[4]);
19559 mem_4 = copy_rtx (temp_operands[6]);
19560 gcc_assert (code == UNKNOWN);
19561 }
19562
19563 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19564 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19565 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19566 && offset_3 != NULL_RTX);
19567
19568 /* Adjust the offset so it can fit in an LDP/STP instruction. */
19569 msize = GET_MODE_SIZE (mode);
19570 stp_off_upper_limit = msize * (0x40 - 1);
19571 stp_off_lower_limit = - msize * 0x40;
19572
19573 off_val_1 = INTVAL (offset_1);
19574 off_val_3 = INTVAL (offset_3);
19575
19576 /* The base offset is optimally halfway between the two STP/LDP offsets. */
19577 if (msize <= 4)
19578 base_off = (off_val_1 + off_val_3) / 2;
19579 else
19580 /* However, due to issues with negative LDP/STP offset generation for
19581 larger modes (DF, DI and vector modes), we must not use negative
19582 addresses smaller than 9 signed unadjusted bits can store. This
19583 provides the most range in this case. */
19584 base_off = off_val_1;
19585
19586 /* Adjust the base so that it is aligned with the addresses but still
19587 optimal. */
19588 if (base_off % msize != off_val_1 % msize)
19589 /* Fix the offset, bearing in mind we want to make it bigger not
19590 smaller. */
19591 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19592 else if (msize <= 4)
19593 /* The negative range of LDP/STP is one larger than the positive range. */
19594 base_off += msize;
19595
19596 /* Check if the base offset is too big or too small. We can attempt to resolve
19597 this issue by setting it to the maximum value and seeing if the offsets
19598 still fit. */
19599 if (base_off >= 0x1000)
19600 {
19601 base_off = 0x1000 - 1;
19602 /* We must still make sure that the base offset is aligned with respect
19603 to the address, but it may not be made any bigger. */
19604 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19605 }
19606
19607 /* Likewise for the case where the base is too small. */
19608 if (base_off <= -0x1000)
19609 {
19610 base_off = -0x1000 + 1;
19611 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19612 }
19613
19614 /* Offset of the first STP/LDP. */
19615 new_off_1 = off_val_1 - base_off;
19616
19617 /* Offset of the second STP/LDP. */
19618 new_off_3 = off_val_3 - base_off;
19619
19620 /* The offsets must be within the range of the LDP/STP instructions. */
19621 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19622 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19623 return false;
19624
19625 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19626 new_off_1), true);
19627 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19628 new_off_1 + msize), true);
19629 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19630 new_off_3), true);
19631 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19632 new_off_3 + msize), true);
19633
19634 if (!aarch64_mem_pair_operand (mem_1, mode)
19635 || !aarch64_mem_pair_operand (mem_3, mode))
19636 return false;
19637
19638 if (code == ZERO_EXTEND)
19639 {
19640 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19641 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19642 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19643 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19644 }
19645 else if (code == SIGN_EXTEND)
19646 {
19647 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19648 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19649 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19650 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19651 }
19652
19653 if (load)
19654 {
19655 operands[0] = temp_operands[0];
19656 operands[1] = mem_1;
19657 operands[2] = temp_operands[2];
19658 operands[3] = mem_2;
19659 operands[4] = temp_operands[4];
19660 operands[5] = mem_3;
19661 operands[6] = temp_operands[6];
19662 operands[7] = mem_4;
19663 }
19664 else
19665 {
19666 operands[0] = mem_1;
19667 operands[1] = temp_operands[1];
19668 operands[2] = mem_2;
19669 operands[3] = temp_operands[3];
19670 operands[4] = mem_3;
19671 operands[5] = temp_operands[5];
19672 operands[6] = mem_4;
19673 operands[7] = temp_operands[7];
19674 }
19675
19676 /* Emit adjusting instruction. */
19677 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19678 /* Emit ldp/stp instructions. */
19679 t1 = gen_rtx_SET (operands[0], operands[1]);
19680 t2 = gen_rtx_SET (operands[2], operands[3]);
19681 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19682 t1 = gen_rtx_SET (operands[4], operands[5]);
19683 t2 = gen_rtx_SET (operands[6], operands[7]);
19684 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19685 return true;
19686 }
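
/* A worked example (illustrative, continuing the four SImode stores at
   offsets 0x100..0x10c described before aarch64_operands_adjust_ok_for_ldpstp):
   msize == 4, so base_off starts as (0x100 + 0x108) / 2 == 0x104 and is then
   bumped by msize to 0x108 to favour the larger negative range, giving

     add scratch, xb, 0x108
     stp w1, w1, [scratch, -8]
     stp w1, w1, [scratch]

   with both new offsets (-8 and 0) inside the SImode range of [-256, 252].
   The base chosen here differs from the simplified sequence shown in that
   earlier comment, but both are valid.  */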
19687
19688 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19689 it isn't worth branching around empty masked ops (including masked
19690 stores). */
19691
19692 static bool
19693 aarch64_empty_mask_is_expensive (unsigned)
19694 {
19695 return false;
19696 }
19697
19698 /* Return true if a pseudo register should be created and used to hold
19699 the GOT address for PIC code. */
19700
19701 bool
19702 aarch64_use_pseudo_pic_reg (void)
19703 {
19704 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19705 }
19706
19707 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19708
19709 static int
19710 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19711 {
19712 switch (XINT (x, 1))
19713 {
19714 case UNSPEC_GOTSMALLPIC:
19715 case UNSPEC_GOTSMALLPIC28K:
19716 case UNSPEC_GOTTINYPIC:
19717 return 0;
19718 default:
19719 break;
19720 }
19721
19722 return default_unspec_may_trap_p (x, flags);
19723 }
19724
19725
19726 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19727 return the log2 of that value. Otherwise return -1. */
19728
19729 int
19730 aarch64_fpconst_pow_of_2 (rtx x)
19731 {
19732 const REAL_VALUE_TYPE *r;
19733
19734 if (!CONST_DOUBLE_P (x))
19735 return -1;
19736
19737 r = CONST_DOUBLE_REAL_VALUE (x);
19738
19739 if (REAL_VALUE_NEGATIVE (*r)
19740 || REAL_VALUE_ISNAN (*r)
19741 || REAL_VALUE_ISINF (*r)
19742 || !real_isinteger (r, DFmode))
19743 return -1;
19744
19745 return exact_log2 (real_to_integer (r));
19746 }
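
/* For example (illustrative): 8.0 yields 3 and 1.0 yields 0, while 0.5,
   -4.0 and 6.0 all yield -1.  */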
19747
19748 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
19749 power of 2 (i.e. 1/2^n), return the number of fractional bits n; e.g. for
19750 x == 1/2^n return n. Otherwise return -1. */
19751
19752 int
19753 aarch64_fpconst_pow2_recip (rtx x)
19754 {
19755 REAL_VALUE_TYPE r0;
19756
19757 if (!CONST_DOUBLE_P (x))
19758 return -1;
19759
19760 r0 = *CONST_DOUBLE_REAL_VALUE (x);
19761 if (exact_real_inverse (DFmode, &r0)
19762 && !REAL_VALUE_NEGATIVE (r0))
19763 {
19764 int ret = exact_log2 (real_to_integer (&r0));
19765 if (ret >= 1 && ret <= 32)
19766 return ret;
19767 }
19768 return -1;
19769 }
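
/* For example (illustrative): 0.25 yields 2 and 0.125 yields 3, while values
   such as 1.0, 2.0 and 3.0 yield -1 (the accepted exponent range is
   1 to 32).  */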
19770
19771 /* If X is a vector of equal CONST_DOUBLE values and that value is
19772 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19773
19774 int
19775 aarch64_vec_fpconst_pow_of_2 (rtx x)
19776 {
19777 int nelts;
19778 if (GET_CODE (x) != CONST_VECTOR
19779 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19780 return -1;
19781
19782 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19783 return -1;
19784
19785 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19786 if (firstval <= 0)
19787 return -1;
19788
19789 for (int i = 1; i < nelts; i++)
19790 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19791 return -1;
19792
19793 return firstval;
19794 }
19795
19796 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19797 to float.
19798
19799 __fp16 always promotes through this hook.
19800 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19801 through the generic excess precision logic rather than here. */
19802
19803 static tree
19804 aarch64_promoted_type (const_tree t)
19805 {
19806 if (SCALAR_FLOAT_TYPE_P (t)
19807 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19808 return float_type_node;
19809
19810 return NULL_TREE;
19811 }
19812
19813 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19814
19815 static bool
19816 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19817 optimization_type opt_type)
19818 {
19819 switch (op)
19820 {
19821 case rsqrt_optab:
19822 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19823
19824 default:
19825 return true;
19826 }
19827 }
19828
19829 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19830
19831 static unsigned int
19832 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19833 int *offset)
19834 {
19835 /* Polynomial invariant 1 == (VG / 2) - 1. */
19836 gcc_assert (i == 1);
19837 *factor = 2;
19838 *offset = 1;
19839 return AARCH64_DWARF_VG;
19840 }
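
/* A worked example (assuming 256-bit SVE vectors at run time): VG is 4, so
   indeterminate 1 evaluates to 4 / 2 - 1 == 1, and a poly_int64 of (16, 16)
   -- the byte size of an SVE data vector -- evaluates to 16 + 16 * 1 == 32
   bytes in the resulting DWARF expression.  */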
19841
19842 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
19843 if MODE is HFmode, and punt to the generic implementation otherwise. */
19844
19845 static bool
19846 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19847 {
19848 return (mode == HFmode
19849 ? true
19850 : default_libgcc_floating_mode_supported_p (mode));
19851 }
19852
19853 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19854 if MODE is HFmode, and punt to the generic implementation otherwise. */
19855
19856 static bool
19857 aarch64_scalar_mode_supported_p (scalar_mode mode)
19858 {
19859 return (mode == HFmode
19860 ? true
19861 : default_scalar_mode_supported_p (mode));
19862 }
19863
19864 /* Set the value of FLT_EVAL_METHOD.
19865 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19866
19867 0: evaluate all operations and constants, whose semantic type has at
19868 most the range and precision of type float, to the range and
19869 precision of float; evaluate all other operations and constants to
19870 the range and precision of the semantic type;
19871
19872 N, where _FloatN is a supported interchange floating type:
19873 evaluate all operations and constants, whose semantic type has at
19874 most the range and precision of _FloatN type, to the range and
19875 precision of the _FloatN type; evaluate all other operations and
19876 constants to the range and precision of the semantic type;
19877
19878 If we have the ARMv8.2-A extensions then we support _Float16 in native
19879 precision, so we should set this to 16. Otherwise, we support the type,
19880 but want to evaluate expressions in float precision, so set this to
19881 0. */
19882
19883 static enum flt_eval_method
19884 aarch64_excess_precision (enum excess_precision_type type)
19885 {
19886 switch (type)
19887 {
19888 case EXCESS_PRECISION_TYPE_FAST:
19889 case EXCESS_PRECISION_TYPE_STANDARD:
19890 /* We can calculate either in 16-bit range and precision or
19891 32-bit range and precision. Make that decision based on whether
19892 we have native support for the ARMv8.2-A 16-bit floating-point
19893 instructions or not. */
19894 return (TARGET_FP_F16INST
19895 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19896 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19897 case EXCESS_PRECISION_TYPE_IMPLICIT:
19898 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19899 default:
19900 gcc_unreachable ();
19901 }
19902 return FLT_EVAL_METHOD_UNPREDICTABLE;
19903 }
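
/* An illustrative C-level consequence (hypothetical user code): with
   TARGET_FP_F16INST the expression below is evaluated directly in _Float16
   precision (FLT_EVAL_METHOD == 16); without it, the operands are promoted
   to float and the result is converted back on assignment
   (FLT_EVAL_METHOD == 0):

     _Float16 x, y, z;
     z = x * y + z;  */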
19904
19905 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19906 scheduled for speculative execution. Reject the long-running division
19907 and square-root instructions. */
19908
19909 static bool
19910 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19911 {
19912 switch (get_attr_type (insn))
19913 {
19914 case TYPE_SDIV:
19915 case TYPE_UDIV:
19916 case TYPE_FDIVS:
19917 case TYPE_FDIVD:
19918 case TYPE_FSQRTS:
19919 case TYPE_FSQRTD:
19920 case TYPE_NEON_FP_SQRT_S:
19921 case TYPE_NEON_FP_SQRT_D:
19922 case TYPE_NEON_FP_SQRT_S_Q:
19923 case TYPE_NEON_FP_SQRT_D_Q:
19924 case TYPE_NEON_FP_DIV_S:
19925 case TYPE_NEON_FP_DIV_D:
19926 case TYPE_NEON_FP_DIV_S_Q:
19927 case TYPE_NEON_FP_DIV_D_Q:
19928 return false;
19929 default:
19930 return true;
19931 }
19932 }
19933
19934 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19935
19936 static int
19937 aarch64_compute_pressure_classes (reg_class *classes)
19938 {
19939 int i = 0;
19940 classes[i++] = GENERAL_REGS;
19941 classes[i++] = FP_REGS;
19942 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19943 registers need to go in PR_LO_REGS at some point during their
19944 lifetime. Splitting it into two halves has the effect of making
19945 all predicates count against PR_LO_REGS, so that we try whenever
19946 possible to restrict the number of live predicates to 8. This
19947 greatly reduces the amount of spilling in certain loops. */
19948 classes[i++] = PR_LO_REGS;
19949 classes[i++] = PR_HI_REGS;
19950 return i;
19951 }
19952
19953 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19954
19955 static bool
19956 aarch64_can_change_mode_class (machine_mode from,
19957 machine_mode to, reg_class_t)
19958 {
19959 if (BYTES_BIG_ENDIAN)
19960 {
19961 bool from_sve_p = aarch64_sve_data_mode_p (from);
19962 bool to_sve_p = aarch64_sve_data_mode_p (to);
19963
19964 /* Don't allow changes between SVE data modes and non-SVE modes.
19965 See the comment at the head of aarch64-sve.md for details. */
19966 if (from_sve_p != to_sve_p)
19967 return false;
19968
19969 /* Don't allow changes in element size: lane 0 of the new vector
19970 would not then be lane 0 of the old vector. See the comment
19971 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19972 description.
19973
19974 In the worst case, this forces a register to be spilled in
19975 one mode and reloaded in the other, which handles the
19976 endianness correctly. */
19977 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19978 return false;
19979 }
19980 return true;
19981 }
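
/* For example (big-endian only, illustrative): a mode change from VNx8HI to
   VNx4SI is rejected because the element sizes differ (2 vs. 4 bytes), and a
   change from VNx4SI to V4SI is rejected because it mixes SVE and non-SVE
   modes; on little-endian targets both changes are allowed.  */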
19982
19983 /* Implement TARGET_EARLY_REMAT_MODES. */
19984
19985 static void
19986 aarch64_select_early_remat_modes (sbitmap modes)
19987 {
19988 /* SVE values are not normally live across a call, so it should be
19989 worth doing early rematerialization even in VL-specific mode. */
19990 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19991 if (aarch64_sve_mode_p ((machine_mode) i))
19992 bitmap_set_bit (modes, i);
19993 }
19994
19995 /* Override the default target speculation_safe_value. */
19996 static rtx
19997 aarch64_speculation_safe_value (machine_mode mode,
19998 rtx result, rtx val, rtx failval)
19999 {
20000 /* Maybe we should warn if falling back to hard barriers. They are
20001 likely to be noticeably more expensive than the alternative below. */
20002 if (!aarch64_track_speculation)
20003 return default_speculation_safe_value (mode, result, val, failval);
20004
20005 if (!REG_P (val))
20006 val = copy_to_mode_reg (mode, val);
20007
20008 if (!aarch64_reg_or_zero (failval, mode))
20009 failval = copy_to_mode_reg (mode, failval);
20010
20011 emit_insn (gen_despeculate_copy (mode, result, val, failval));
20012 return result;
20013 }
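
/* A minimal usage sketch (illustrative): user code such as

     v = __builtin_speculation_safe_value (v, 0);

   reaches this hook; with -mtrack-speculation it expands to the
   despeculate_copy sequence above, otherwise it falls back to the generic
   hard speculation barrier.  */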
20014
20015 /* Implement TARGET_ESTIMATED_POLY_VALUE.
20016 Look into the tuning structure for an estimate.
20017 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
20018 Advanced SIMD 128 bits. */
20019
20020 static HOST_WIDE_INT
20021 aarch64_estimated_poly_value (poly_int64 val)
20022 {
20023 enum aarch64_sve_vector_bits_enum width_source
20024 = aarch64_tune_params.sve_width;
20025
20026 /* If we still don't have an estimate, use the default. */
20027 if (width_source == SVE_SCALABLE)
20028 return default_estimated_poly_value (val);
20029
20030 HOST_WIDE_INT over_128 = width_source - 128;
20031 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
20032 }
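
/* Worked example (assuming a tuning with sve_width == SVE_256): over_128 is
   128, so a poly_int64 of (16, 16) -- e.g. the byte size of an SVE vector --
   is estimated as 16 + 16 * 128 / 128 == 32.  */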
20033
20034
20035 /* Return true for types that could be supported as SIMD return or
20036 argument types. */
20037
20038 static bool
20039 supported_simd_type (tree t)
20040 {
20041 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
20042 {
20043 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
20044 return s == 1 || s == 2 || s == 4 || s == 8;
20045 }
20046 return false;
20047 }
20048
20049 /* Return true for types that currently are supported as SIMD return
20050 or argument types. */
20051
20052 static bool
20053 currently_supported_simd_type (tree t, tree b)
20054 {
20055 if (COMPLEX_FLOAT_TYPE_P (t))
20056 return false;
20057
20058 if (TYPE_SIZE (t) != TYPE_SIZE (b))
20059 return false;
20060
20061 return supported_simd_type (t);
20062 }
20063
20064 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
20065
20066 static int
20067 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
20068 struct cgraph_simd_clone *clonei,
20069 tree base_type, int num)
20070 {
20071 tree t, ret_type, arg_type;
20072 unsigned int elt_bits, vec_bits, count;
20073
20074 if (!TARGET_SIMD)
20075 return 0;
20076
20077 if (clonei->simdlen
20078 && (clonei->simdlen < 2
20079 || clonei->simdlen > 1024
20080 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
20081 {
20082 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20083 "unsupported simdlen %d", clonei->simdlen);
20084 return 0;
20085 }
20086
20087 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
20088 if (TREE_CODE (ret_type) != VOID_TYPE
20089 && !currently_supported_simd_type (ret_type, base_type))
20090 {
20091 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
20092 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20093 "GCC does not currently support mixed size types "
20094 "for %<simd%> functions");
20095 else if (supported_simd_type (ret_type))
20096 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20097 "GCC does not currently support return type %qT "
20098 "for %<simd%> functions", ret_type);
20099 else
20100 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20101 "unsupported return type %qT for %<simd%> functions",
20102 ret_type);
20103 return 0;
20104 }
20105
20106 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
20107 {
20108 arg_type = TREE_TYPE (t);
20109
20110 if (!currently_supported_simd_type (arg_type, base_type))
20111 {
20112 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
20113 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20114 "GCC does not currently support mixed size types "
20115 "for %<simd%> functions");
20116 else
20117 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20118 "GCC does not currently support argument type %qT "
20119 "for %<simd%> functions", arg_type);
20120 return 0;
20121 }
20122 }
20123
20124 clonei->vecsize_mangle = 'n';
20125 clonei->mask_mode = VOIDmode;
20126 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
20127 if (clonei->simdlen == 0)
20128 {
20129 count = 2;
20130 vec_bits = (num == 0 ? 64 : 128);
20131 clonei->simdlen = vec_bits / elt_bits;
20132 }
20133 else
20134 {
20135 count = 1;
20136 vec_bits = clonei->simdlen * elt_bits;
20137 if (vec_bits != 64 && vec_bits != 128)
20138 {
20139 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20140 "GCC does not currently support simdlen %d for type %qT",
20141 clonei->simdlen, base_type);
20142 return 0;
20143 }
20144 }
20145 clonei->vecsize_int = vec_bits;
20146 clonei->vecsize_float = vec_bits;
20147 return count;
20148 }
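
/* For instance (illustrative), a float function marked with
   "#pragma omp declare simd" and no explicit simdlen gets two 'n'-mangled
   clones from this hook: one using 64-bit vectors (simdlen 2) and one using
   128-bit vectors (simdlen 4), since elt_bits == 32.  */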
20149
20150 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20151
20152 static void
20153 aarch64_simd_clone_adjust (struct cgraph_node *node)
20154 {
20155 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20156 use the correct ABI. */
20157
20158 tree t = TREE_TYPE (node->decl);
20159 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
20160 TYPE_ATTRIBUTES (t));
20161 }
20162
20163 /* Implement TARGET_SIMD_CLONE_USABLE. */
20164
20165 static int
20166 aarch64_simd_clone_usable (struct cgraph_node *node)
20167 {
20168 switch (node->simdclone->vecsize_mangle)
20169 {
20170 case 'n':
20171 if (!TARGET_SIMD)
20172 return -1;
20173 return 0;
20174 default:
20175 gcc_unreachable ();
20176 }
20177 }
20178
20179 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
20180
20181 static int
20182 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
20183 {
20184 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
20185 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
20186 return 0;
20187 return 1;
20188 }
20189
20190 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
20191
20192 static const char *
20193 aarch64_get_multilib_abi_name (void)
20194 {
20195 if (TARGET_BIG_END)
20196 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
20197 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
20198 }
20199
20200 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
20201 global-variable-based guard, use the default; otherwise
20202 return a null tree. */
20203 static tree
20204 aarch64_stack_protect_guard (void)
20205 {
20206 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20207 return default_stack_protect_guard ();
20208
20209 return NULL_TREE;
20210 }
20211
20212 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20213 section at the end if needed. */
20214 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20215 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20216 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20217 void
20218 aarch64_file_end_indicate_exec_stack ()
20219 {
20220 file_end_indicate_exec_stack ();
20221
20222 unsigned feature_1_and = 0;
20223 if (aarch64_bti_enabled ())
20224 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20225
20226 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20227 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20228
20229 if (feature_1_and)
20230 {
20231 /* Generate .note.gnu.property section. */
20232 switch_to_section (get_section (".note.gnu.property",
20233 SECTION_NOTYPE, NULL));
20234
20235 /* PT_NOTE header: namesz, descsz, type.
20236 namesz = 4 ("GNU\0")
20237 descsz = 16 (Size of the program property array)
20238 [(12 + padding) * Number of array elements]
20239 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20240 assemble_align (POINTER_SIZE);
20241 assemble_integer (GEN_INT (4), 4, 32, 1);
20242 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20243 assemble_integer (GEN_INT (5), 4, 32, 1);
20244
20245 /* PT_NOTE name. */
20246 assemble_string ("GNU", 4);
20247
20248 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20249 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20250 datasz = 4
20251 data = feature_1_and. */
20252 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20253 assemble_integer (GEN_INT (4), 4, 32, 1);
20254 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20255
20256 /* Pad the size of the note to the required alignment. */
20257 assemble_align (POINTER_SIZE);
20258 }
20259 }
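
/* For reference (illustrative), with only BTI enabled the note emitted above
   has this layout (32-bit words, padded to pointer alignment):

     namesz = 4, descsz = 16, type = 5 (NT_GNU_PROPERTY_TYPE_0)
     name = "GNU\0"
     pr_type = GNU_PROPERTY_AARCH64_FEATURE_1_AND (0xc0000000)
     pr_datasz = 4
     pr_data = GNU_PROPERTY_AARCH64_FEATURE_1_BTI (0x1)  */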
20260 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20261 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20262 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20263
20264 /* Target-specific selftests. */
20265
20266 #if CHECKING_P
20267
20268 namespace selftest {
20269
20270 /* Selftest for the RTL loader.
20271 Verify that the RTL loader copes with a dump from
20272 print_rtx_function. This is essentially just a test that class
20273 function_reader can handle a real dump, but it also verifies
20274 that lookup_reg_by_dump_name correctly handles hard regs.
20275 The presence of hard reg names in the dump means that the test is
20276 target-specific, hence it is in this file. */
20277
20278 static void
20279 aarch64_test_loading_full_dump ()
20280 {
20281 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20282
20283 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20284
20285 rtx_insn *insn_1 = get_insn_by_uid (1);
20286 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20287
20288 rtx_insn *insn_15 = get_insn_by_uid (15);
20289 ASSERT_EQ (INSN, GET_CODE (insn_15));
20290 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20291
20292 /* Verify crtl->return_rtx. */
20293 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20294 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20295 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20296 }
20297
20298 /* Run all target-specific selftests. */
20299
20300 static void
20301 aarch64_run_selftests (void)
20302 {
20303 aarch64_test_loading_full_dump ();
20304 }
20305
20306 } // namespace selftest
20307
20308 #endif /* #if CHECKING_P */
20309
20310 #undef TARGET_STACK_PROTECT_GUARD
20311 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20312
20313 #undef TARGET_ADDRESS_COST
20314 #define TARGET_ADDRESS_COST aarch64_address_cost
20315
20316 /* This hook determines whether unnamed bitfields affect the alignment
20317 of the containing structure. The hook returns true if the structure
20318 should inherit the alignment requirements of an unnamed bitfield's
20319 type. */
20320 #undef TARGET_ALIGN_ANON_BITFIELD
20321 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20322
20323 #undef TARGET_ASM_ALIGNED_DI_OP
20324 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20325
20326 #undef TARGET_ASM_ALIGNED_HI_OP
20327 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20328
20329 #undef TARGET_ASM_ALIGNED_SI_OP
20330 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20331
20332 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20333 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20334 hook_bool_const_tree_hwi_hwi_const_tree_true
20335
20336 #undef TARGET_ASM_FILE_START
20337 #define TARGET_ASM_FILE_START aarch64_start_file
20338
20339 #undef TARGET_ASM_OUTPUT_MI_THUNK
20340 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20341
20342 #undef TARGET_ASM_SELECT_RTX_SECTION
20343 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20344
20345 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20346 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20347
20348 #undef TARGET_BUILD_BUILTIN_VA_LIST
20349 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20350
20351 #undef TARGET_CALLEE_COPIES
20352 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
20353
20354 #undef TARGET_CAN_ELIMINATE
20355 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20356
20357 #undef TARGET_CAN_INLINE_P
20358 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20359
20360 #undef TARGET_CANNOT_FORCE_CONST_MEM
20361 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20362
20363 #undef TARGET_CASE_VALUES_THRESHOLD
20364 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20365
20366 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20367 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20368
20369 /* Only the least significant bit is used for initialization guard
20370 variables. */
20371 #undef TARGET_CXX_GUARD_MASK_BIT
20372 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20373
20374 #undef TARGET_C_MODE_FOR_SUFFIX
20375 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20376
20377 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20378 #undef TARGET_DEFAULT_TARGET_FLAGS
20379 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20380 #endif
20381
20382 #undef TARGET_CLASS_MAX_NREGS
20383 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20384
20385 #undef TARGET_BUILTIN_DECL
20386 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20387
20388 #undef TARGET_BUILTIN_RECIPROCAL
20389 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20390
20391 #undef TARGET_C_EXCESS_PRECISION
20392 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20393
20394 #undef TARGET_EXPAND_BUILTIN
20395 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20396
20397 #undef TARGET_EXPAND_BUILTIN_VA_START
20398 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20399
20400 #undef TARGET_FOLD_BUILTIN
20401 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20402
20403 #undef TARGET_FUNCTION_ARG
20404 #define TARGET_FUNCTION_ARG aarch64_function_arg
20405
20406 #undef TARGET_FUNCTION_ARG_ADVANCE
20407 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20408
20409 #undef TARGET_FUNCTION_ARG_BOUNDARY
20410 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20411
20412 #undef TARGET_FUNCTION_ARG_PADDING
20413 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20414
20415 #undef TARGET_GET_RAW_RESULT_MODE
20416 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20417 #undef TARGET_GET_RAW_ARG_MODE
20418 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20419
20420 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20421 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20422
20423 #undef TARGET_FUNCTION_VALUE
20424 #define TARGET_FUNCTION_VALUE aarch64_function_value
20425
20426 #undef TARGET_FUNCTION_VALUE_REGNO_P
20427 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20428
20429 #undef TARGET_GIMPLE_FOLD_BUILTIN
20430 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20431
20432 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20433 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20434
20435 #undef TARGET_INIT_BUILTINS
20436 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20437
20438 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20439 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20440 aarch64_ira_change_pseudo_allocno_class
20441
20442 #undef TARGET_LEGITIMATE_ADDRESS_P
20443 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20444
20445 #undef TARGET_LEGITIMATE_CONSTANT_P
20446 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20447
20448 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20449 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20450 aarch64_legitimize_address_displacement
20451
20452 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20453 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20454
20455 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20456 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20457 aarch64_libgcc_floating_mode_supported_p
20458
20459 #undef TARGET_MANGLE_TYPE
20460 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20461
20462 #undef TARGET_MEMORY_MOVE_COST
20463 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20464
20465 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20466 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20467
20468 #undef TARGET_MUST_PASS_IN_STACK
20469 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20470
20471 /* This target hook should return true if accesses to volatile bitfields
20472 should use the narrowest mode possible. It should return false if these
20473 accesses should use the bitfield container type. */
20474 #undef TARGET_NARROW_VOLATILE_BITFIELD
20475 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20476
20477 #undef TARGET_OPTION_OVERRIDE
20478 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20479
20480 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20481 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20482 aarch64_override_options_after_change
20483
20484 #undef TARGET_OPTION_SAVE
20485 #define TARGET_OPTION_SAVE aarch64_option_save
20486
20487 #undef TARGET_OPTION_RESTORE
20488 #define TARGET_OPTION_RESTORE aarch64_option_restore
20489
20490 #undef TARGET_OPTION_PRINT
20491 #define TARGET_OPTION_PRINT aarch64_option_print
20492
20493 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20494 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20495
20496 #undef TARGET_SET_CURRENT_FUNCTION
20497 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20498
20499 #undef TARGET_PASS_BY_REFERENCE
20500 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20501
20502 #undef TARGET_PREFERRED_RELOAD_CLASS
20503 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20504
20505 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20506 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20507
20508 #undef TARGET_PROMOTED_TYPE
20509 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20510
20511 #undef TARGET_SECONDARY_RELOAD
20512 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20513
20514 #undef TARGET_SHIFT_TRUNCATION_MASK
20515 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20516
20517 #undef TARGET_SETUP_INCOMING_VARARGS
20518 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20519
20520 #undef TARGET_STRUCT_VALUE_RTX
20521 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20522
20523 #undef TARGET_REGISTER_MOVE_COST
20524 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20525
20526 #undef TARGET_RETURN_IN_MEMORY
20527 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20528
20529 #undef TARGET_RETURN_IN_MSB
20530 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20531
20532 #undef TARGET_RTX_COSTS
20533 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20534
20535 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20536 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20537
20538 #undef TARGET_SCHED_ISSUE_RATE
20539 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20540
20541 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20542 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20543 aarch64_sched_first_cycle_multipass_dfa_lookahead
20544
20545 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20546 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20547 aarch64_first_cycle_multipass_dfa_lookahead_guard
20548
20549 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20550 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20551 aarch64_get_separate_components
20552
20553 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20554 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20555 aarch64_components_for_bb
20556
20557 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20558 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20559 aarch64_disqualify_components
20560
20561 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20562 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20563 aarch64_emit_prologue_components
20564
20565 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20566 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20567 aarch64_emit_epilogue_components
20568
20569 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20570 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20571 aarch64_set_handled_components
20572
20573 #undef TARGET_TRAMPOLINE_INIT
20574 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20575
20576 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20577 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20578
20579 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20580 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20581
20582 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20583 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20584 aarch64_builtin_support_vector_misalignment
20585
20586 #undef TARGET_ARRAY_MODE
20587 #define TARGET_ARRAY_MODE aarch64_array_mode
20588
20589 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20590 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20591
20592 #undef TARGET_VECTORIZE_ADD_STMT_COST
20593 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20594
20595 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20596 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20597 aarch64_builtin_vectorization_cost
20598
20599 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20600 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20601
20602 #undef TARGET_VECTORIZE_BUILTINS
20603 #define TARGET_VECTORIZE_BUILTINS
20604
20605 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20606 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20607 aarch64_builtin_vectorized_function
20608
20609 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20610 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20611 aarch64_autovectorize_vector_sizes
20612
20613 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20614 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20615 aarch64_atomic_assign_expand_fenv
20616
20617 /* Section anchor support. */
20618
20619 #undef TARGET_MIN_ANCHOR_OFFSET
20620 #define TARGET_MIN_ANCHOR_OFFSET -256
20621
20622 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20623 byte offset; we can do much more for larger data types, but have no way
20624 to determine the size of the access. We assume accesses are aligned. */
20625 #undef TARGET_MAX_ANCHOR_OFFSET
20626 #define TARGET_MAX_ANCHOR_OFFSET 4095
20627
20628 #undef TARGET_VECTOR_ALIGNMENT
20629 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20630
20631 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20632 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20633 aarch64_vectorize_preferred_vector_alignment
20634 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20635 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20636 aarch64_simd_vector_alignment_reachable
20637
20638 /* vec_perm support. */
20639
20640 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20641 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20642 aarch64_vectorize_vec_perm_const
20643
20644 #undef TARGET_VECTORIZE_GET_MASK_MODE
20645 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20646 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20647 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20648 aarch64_empty_mask_is_expensive
20649 #undef TARGET_PREFERRED_ELSE_VALUE
20650 #define TARGET_PREFERRED_ELSE_VALUE \
20651 aarch64_preferred_else_value
20652
20653 #undef TARGET_INIT_LIBFUNCS
20654 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20655
20656 #undef TARGET_FIXED_CONDITION_CODE_REGS
20657 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20658
20659 #undef TARGET_FLAGS_REGNUM
20660 #define TARGET_FLAGS_REGNUM CC_REGNUM
20661
20662 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20663 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20664
20665 #undef TARGET_ASAN_SHADOW_OFFSET
20666 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20667
20668 #undef TARGET_LEGITIMIZE_ADDRESS
20669 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20670
20671 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20672 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20673
20674 #undef TARGET_CAN_USE_DOLOOP_P
20675 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20676
20677 #undef TARGET_SCHED_ADJUST_PRIORITY
20678 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20679
20680 #undef TARGET_SCHED_MACRO_FUSION_P
20681 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20682
20683 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20684 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20685
20686 #undef TARGET_SCHED_FUSION_PRIORITY
20687 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20688
20689 #undef TARGET_UNSPEC_MAY_TRAP_P
20690 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20691
20692 #undef TARGET_USE_PSEUDO_PIC_REG
20693 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20694
20695 #undef TARGET_PRINT_OPERAND
20696 #define TARGET_PRINT_OPERAND aarch64_print_operand
20697
20698 #undef TARGET_PRINT_OPERAND_ADDRESS
20699 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20700
20701 #undef TARGET_OPTAB_SUPPORTED_P
20702 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20703
20704 #undef TARGET_OMIT_STRUCT_RETURN_REG
20705 #define TARGET_OMIT_STRUCT_RETURN_REG true
20706
20707 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20708 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20709 aarch64_dwarf_poly_indeterminate_value
20710
20711 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
20712 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20713 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20714
20715 #undef TARGET_HARD_REGNO_NREGS
20716 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20717 #undef TARGET_HARD_REGNO_MODE_OK
20718 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20719
20720 #undef TARGET_MODES_TIEABLE_P
20721 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20722
20723 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20724 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20725 aarch64_hard_regno_call_part_clobbered
20726
20727 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20728 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20729 aarch64_remove_extra_call_preserved_regs
20730
20731 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20732 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20733 aarch64_return_call_with_max_clobbers
20734
20735 #undef TARGET_CONSTANT_ALIGNMENT
20736 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20737
20738 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20739 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20740 aarch64_stack_clash_protection_alloca_probe_range
20741
20742 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20743 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20744
20745 #undef TARGET_CAN_CHANGE_MODE_CLASS
20746 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20747
20748 #undef TARGET_SELECT_EARLY_REMAT_MODES
20749 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20750
20751 #undef TARGET_SPECULATION_SAFE_VALUE
20752 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20753
20754 #undef TARGET_ESTIMATED_POLY_VALUE
20755 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20756
20757 #undef TARGET_ATTRIBUTE_TABLE
20758 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20759
20760 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20761 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20762 aarch64_simd_clone_compute_vecsize_and_simdlen
20763
20764 #undef TARGET_SIMD_CLONE_ADJUST
20765 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20766
20767 #undef TARGET_SIMD_CLONE_USABLE
20768 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20769
20770 #undef TARGET_COMP_TYPE_ATTRIBUTES
20771 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20772
20773 #undef TARGET_GET_MULTILIB_ABI_NAME
20774 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20775
20776 #if CHECKING_P
20777 #undef TARGET_RUN_TARGET_SELFTESTS
20778 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20779 #endif /* #if CHECKING_P */
20780
20781 #undef TARGET_ASM_POST_CFI_STARTPROC
20782 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20783
20784 struct gcc_target targetm = TARGET_INITIALIZER;
20785
20786 #include "gt-aarch64.h"